From 490062f05400245f190b466eeca415327b37b5e7 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 19 Jun 2020 16:33:57 -0500
Subject: [PATCH 001/177] Move load map init and fnbounds init ahead of
 registering sample source. This is intended to deal with sample source init
 code that will dlopen other libraries (such as PAPI)

---
 src/tool/hpcrun/main.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 234f6f1bc7..b570b40fea 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -470,8 +470,6 @@ hpcrun_init_internal(bool is_child)
   gotcha_restore_library_filter_func();
 #endif
 
-  hpcrun_initLoadmap();
-
   hpcrun_memory_reinit();
   hpcrun_mmap_init();
   hpcrun_thread_data_init(0, NULL, is_child, hpcrun_get_num_sample_sources());
@@ -485,12 +483,6 @@ hpcrun_init_internal(bool is_child)
   // init callbacks for each device
   hpcrun_initializer_init();
 
-  // WARNING: a perfmon bug requires us to fork off the fnbounds
-  // server before we call PAPI_init, which is done in argument
-  // processing below. Also, fnbounds_init must be done after the
-  // memory allocator is initialized.
-  fnbounds_init();
-
   main_addr = monitor_get_addr_main();
   setup_main_bounds_check(main_addr);
   TMSG(MAIN_BOUNDS, "main addr %p ==> lower %p, upper %p", main_addr, main_lower, main_upper);
@@ -933,8 +925,13 @@ monitor_init_process(int *argc, char **argv, void* data)
   copy_execname(process_name);
   hpcrun_files_set_executable(process_name);
 
-  hpcrun_registered_sources_init();
+  // We initialize the load map and fnbounds before registering sample source.
+  // This is because sample source init (such as PAPI)  may dlopen other libraries,
+  // which will trigger our library monitoring code and fnbound queries
+  hpcrun_initLoadmap();
+  fnbounds_init();
 
+  hpcrun_registered_sources_init();
   messages_init();
 
   control_knob_init();  

From e6826a245613855a74df56f8675a9178eaadbcea Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 19 Jun 2020 17:23:38 -0500
Subject: [PATCH 002/177] More re-ordering in hpcrun init code. The new order
 is:

1. init load map
2. set up debug flag, log file and measurement directory
3. save vdso
4. fnbounds init
5. register sample source

This re-ordering is intended to monitor dynamic libraries loaded during
sampling source initialization
---
 src/tool/hpcrun/main.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index b570b40fea..5f91855467 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -478,8 +478,6 @@ hpcrun_init_internal(bool is_child)
   // because mapping of load modules affects the recipe map.
   hpcrun_unw_init();
 
-  hpcrun_save_vdso();
-
   // init callbacks for each device
   hpcrun_initializer_init();
 
@@ -929,10 +927,22 @@ monitor_init_process(int *argc, char **argv, void* data)
   // This is because sample source init (such as PAPI)  may dlopen other libraries,
   // which will trigger our library monitoring code and fnbound queries
   hpcrun_initLoadmap();
+
+  // We need to initialize messages related functions and set up measurement directory,
+  // so that we can write vdso and prevent fnbounds print messages to the terminal.
+  messages_init();
+  if (!hpcrun_get_disabled()) {
+    hpcrun_files_set_directory();
+  }
+  messages_logfile_create();
+
+  // We need to save vdso before initializing fnbounds this
+  // is because fnbounds_init will iterate over the load map 
+  // and will invoke analysis on vdso
+  hpcrun_save_vdso();
   fnbounds_init();
 
   hpcrun_registered_sources_init();
-  messages_init();
 
   control_knob_init();  
 
@@ -955,15 +965,11 @@ monitor_init_process(int *argc, char **argv, void* data)
 
   hpcrun_process_sample_source_none();
 
-  if (!hpcrun_get_disabled()) {
-    hpcrun_files_set_directory();
-  }
 
   TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name);
 
   TMSG(PROCESS,"init");
 
-  messages_logfile_create();
   hpcrun_sample_prob_mesg();
 
   TMSG(PROCESS, "I am a %s process", is_child ? "child" : "parent");

From b6440b04c4a451b9e43b606845265cbf0a801ef9 Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Tue, 30 Jun 2020 21:38:01 -0500
Subject: [PATCH 003/177] added papi_active_flag

---
 .../hpcrun/gpu/gpu-application-thread-api.c   |   1 -
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        |   7 +-
 src/tool/hpcrun/main.c                        |  10 +-
 src/tool/hpcrun/sample-sources/papi-c.c       | 116 ++++++++++++++----
 src/tool/hpcrun/sample_sources_registered.c   |   7 +-
 5 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-application-thread-api.c b/src/tool/hpcrun/gpu/gpu-application-thread-api.c
index cbe81b036b..c96f998a14 100644
--- a/src/tool/hpcrun/gpu/gpu-application-thread-api.c
+++ b/src/tool/hpcrun/gpu/gpu-application-thread-api.c
@@ -118,7 +118,6 @@ gpu_application_thread_correlation_callback
     }
   }
 
-
   // skip procedure frames in libhpcrun
   while (libhpcrun_id != 0 && node_addr->ip_norm.lm_id == libhpcrun_id) {
     node = hpcrun_cct_parent(node);
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index ba550e0565..25fb97a466 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -103,7 +103,7 @@
 #include "cubin-hash-map.h"
 #include "cubin-id-map.h"
 
-
+extern bool is_papi_active();
 
 //******************************************************************************
 // macros
@@ -757,6 +757,9 @@ cupti_subscriber_callback
  const void *cb_info
 )
 {
+	TMSG(CUPTI, "papi_active = %d\n", is_papi_active());
+	if (is_papi_active()) return;
+
   if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
     const CUpti_ResourceData *rd = (const CUpti_ResourceData *) cb_info;
     if (cb_id == CUPTI_CBID_RESOURCE_MODULE_LOADED) {
@@ -933,6 +936,7 @@ cupti_subscriber_callback
       default:
         break;
     }
+
     bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags,
       gpu_placeholder_type_kernel);
     // If we have a valid operation and is not in the interval of a cuda/ompt runtime api
@@ -1083,6 +1087,7 @@ cupti_subscriber_callback
       default:
         break;
     }
+
     if (is_valid_op) {
       if (cd->callbackSite == CUPTI_API_ENTER) {
         // Enter a CUDA runtime api
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 5f91855467..07ecd81813 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -478,8 +478,8 @@ hpcrun_init_internal(bool is_child)
   // because mapping of load modules affects the recipe map.
   hpcrun_unw_init();
 
-  // init callbacks for each device
-  hpcrun_initializer_init();
+//  // init callbacks for each device
+//  hpcrun_initializer_init();
 
   main_addr = monitor_get_addr_main();
   setup_main_bounds_check(main_addr);
@@ -549,8 +549,9 @@ hpcrun_init_internal(bool is_child)
   //       -all- possible (e.g. registered) sample sources call their own init method
   //       no need to do it twice.
   //
+
   if (! is_child) {
-    SAMPLE_SOURCES(process_event_list, lush_metrics);
+			SAMPLE_SOURCES(process_event_list, lush_metrics);
     SAMPLE_SOURCES(finalize_event_list);
     hpcrun_metrics_data_finalize();
   }
@@ -942,6 +943,9 @@ monitor_init_process(int *argc, char **argv, void* data)
   hpcrun_save_vdso();
   fnbounds_init();
 
+  // init callbacks for each device //Module_ignore_map is here
+  hpcrun_initializer_init();
+
   hpcrun_registered_sources_init();
 
   control_knob_init();  
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 6c96da80ac..2ccbdc4daa 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -135,10 +135,22 @@ static bool disable_papi_cuda = false;
 
 static kind_info_t *papi_kind;
 
+extern __thread bool papi_active;
 
 /******************************************************************************
  * private operations 
  *****************************************************************************/
+static void
+papi_flag_on()
+{
+	papi_active = true;
+}
+
+static void
+papi_flag_off()
+{
+	papi_active = false;
+}
 
 static int
 get_event_index(sample_source_t *self, int event_code)
@@ -223,12 +235,14 @@ strip_papi_prefix(const char *str)
 static void
 METHOD_FN(init)
 {
+	papi_flag_on();
   // PAPI_set_debug(0x3ff);
 
   // **NOTE: some papi components may start threads, so
   //         hpcrun must ignore these threads to ensure that PAPI_library_init
   //         succeeds
   //
+
   monitor_disable_new_threads();
   if (disable_papi_cuda) {
     TMSG(PAPI_C, "Will disable PAPI cuda component (if component is active)");
@@ -273,13 +287,15 @@ METHOD_FN(init)
   }
 
   self->state = INIT;
+  papi_flag_off();
 }
 
 static void
 METHOD_FN(thread_init)
 {
+	papi_flag_on();
   TMSG(PAPI, "thread init");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int retval = PAPI_thread_init(pthread_self);
   if (retval != PAPI_OK) {
@@ -287,13 +303,17 @@ METHOD_FN(thread_init)
     monitor_real_abort();
   }
   TMSG(PAPI, "thread init OK");
+
+finish:
+  papi_flag_off();
 }
 
 static void
 METHOD_FN(thread_init_action)
 {
+	papi_flag_on();
   TMSG(PAPI, "register thread");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int retval = PAPI_register_thread();
   if (retval != PAPI_OK) {
@@ -301,16 +321,20 @@ METHOD_FN(thread_init_action)
     monitor_real_abort();
   }
   TMSG(PAPI, "register thread ok");
+
+finish:
+	papi_flag_off();
 }
 
 static void
 METHOD_FN(start)
 {
+	papi_flag_on();
   int cidx;
   TMSG(PAPI, "start");
 
-  if (papi_unavail) { 
-    return; 
+  if (papi_unavail) {
+    goto finish;
   }
 
   thread_data_t* td = hpcrun_get_thread_data();
@@ -322,7 +346,7 @@ METHOD_FN(start)
 
   if (my_state == START) {
     TMSG(PAPI,"*NOTE* PAPI start called when already in state START");
-    return;
+		goto finish;
   }
 
   // for each active component, start its event set
@@ -358,27 +382,35 @@ METHOD_FN(start)
     }
   }
   td->ss_state[self->sel_idx] = START;
+
+finish:
+	papi_flag_off();
 }
 
 static void
 METHOD_FN(thread_fini_action)
 {
+	papi_flag_on();
   TMSG(PAPI, "unregister thread");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int retval = PAPI_unregister_thread();
   char msg[] = "!!NOT PAPI_OK!! (code = -9999999)\n";
   snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval);
   TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg);
+finish:
+	papi_flag_off();
 }
 
 static void
 METHOD_FN(stop)
 {
-  int cidx;
+	papi_flag_on();
+
+	int cidx;
 
   TMSG(PAPI, "stop");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   thread_data_t *td = hpcrun_get_thread_data();
   int nevents = self->evl.nevents;
@@ -386,12 +418,12 @@ METHOD_FN(stop)
 
   if (my_state == STOP) {
     TMSG(PAPI,"*NOTE* PAPI stop called when already in state STOP");
-    return;
+    goto finish;
   }
 
   if (my_state != START) {
     TMSG(PAPI,"*WARNING* PAPI stop called when not in state START");
-    return;
+    goto finish;
   }
 
   papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr;
@@ -415,19 +447,24 @@ METHOD_FN(stop)
   }
 
   TD_GET(ss_state)[self->sel_idx] = STOP;
+finish:
+	papi_flag_off();
 }
 
 static void
 METHOD_FN(shutdown)
 {
+	papi_flag_on();
   TMSG(PAPI, "shutdown");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   METHOD_CALL(self, stop); // make sure stop has been called
   // FIXME: add component shutdown code here
   PAPI_shutdown();
 
   self->state = UNINIT;
+finish:
+	papi_flag_off();
 }
 
 // Return true if PAPI recognizes the name, whether supported or not.
@@ -435,10 +472,12 @@ METHOD_FN(shutdown)
 static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
+	papi_flag_on();
+	bool ret;
   ev_str = strip_papi_prefix(ev_str);
   
   TMSG(PAPI, "supports event");
-  if (papi_unavail) { return false; }
+  if (papi_unavail) { ret = false; goto finish;}
 
   if (self->state == UNINIT){
     METHOD_CALL(self, init);
@@ -449,14 +488,20 @@ METHOD_FN(supports_event, const char *ev_str)
   long th;
 
   hpcrun_extract_ev_thresh(ev_str, sizeof(evtmp), evtmp, &th, DEFAULT_THRESHOLD);
-  return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK;
+
+  ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK);
+
+finish:
+	papi_flag_off();
+	return ret;
 }
  
 static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
+	papi_flag_on();
   TMSG(PAPI, "process event list");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   char *event;
   int i, ret;
@@ -569,6 +614,9 @@ METHOD_FN(process_event_list, int lush_metrics)
   if (! some_overflow) {
     hpcrun_ssfail_all_derived("PAPI");
   }
+
+finish:
+	papi_flag_off();
 }
 
 static void
@@ -579,12 +627,13 @@ METHOD_FN(finalize_event_list)
 static void
 METHOD_FN(gen_event_set, int lush_metrics)
 {
+	papi_flag_on();
   thread_data_t *td = hpcrun_get_thread_data();
   int i;
   int ret;
 
   TMSG(PAPI, "generating all event sets for all components");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int num_components = PAPI_num_components();
   int ss_info_size = sizeof(papi_source_info_t) + 
@@ -677,11 +726,15 @@ METHOD_FN(gen_event_set, int lush_metrics)
       }
     }
   }
+
+finish:
+	papi_flag_off();
 }
 
 static void
 METHOD_FN(display_events)
 {
+	papi_flag_on();
   PAPI_event_info_t info;
   int ev, ret, num_total, num_prof;
   int num_components, cidx;
@@ -689,7 +742,7 @@ METHOD_FN(display_events)
   if (papi_unavail) {
     printf("PAPI is not available.  Probably, the kernel doesn't support PAPI,\n"
 	   "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
-    return;
+    goto finish;
   }
 
   cidx = 0; // CPU component
@@ -756,6 +809,9 @@ METHOD_FN(display_events)
 
   printf( "Total events reported: %d\n", num_total);
   printf("\n\n");
+
+finish:
+	papi_flag_off();
 }
 
 
@@ -775,7 +831,9 @@ METHOD_FN(display_events)
 void
 hpcrun_disable_papi_cuda(void)
 {
+	papi_flag_on();
   disable_papi_cuda = true;
+  papi_flag_off();
 }
 
 /******************************************************************************
@@ -787,26 +845,35 @@ hpcrun_disable_papi_cuda(void)
 static int
 event_is_derived(int ev_code)
 {
+	papi_flag_on();
+	int ret;
   PAPI_event_info_t info;
 
   // "Is derived" is kind of a bad thing, so if any unexpected failure
   // occurs, we'll return the "bad" answer.
   if (PAPI_get_event_info(ev_code, &info) != PAPI_OK
       || info.derived == NULL) {
-    return 1;
+    ret = 1;
+    goto finish;
   }
   if (info.count == 1
       || strlen(info.derived) == 0
       || strcmp(info.derived, "NOT_DERIVED") == 0
       || strcmp(info.derived, "DERIVED_CMPD") == 0) {
-    return 0;
+    ret = 0;
+    goto finish;
   }
-  return 1;
+  ret = 1;
+
+finish:
+	papi_flag_off();
+	return ret;
 }
 
 static void
 event_fatal_error(int ev_code, int papi_ret)
 {
+	papi_flag_on();
   char name[1024];
 
   PAPI_event_code_to_name(ev_code, name);
@@ -820,12 +887,15 @@ event_fatal_error(int ev_code, int papi_ret)
     hpcrun_ssfail_conflict("PAPI", name);
   }
   hpcrun_ssfail_unsupported("PAPI", name);
+
+  papi_flag_off();
 }
 
 static void
 papi_event_handler(int event_set, void *pc, long long ovec,
                    void *context)
 {
+	papi_flag_on();
   sample_source_t *self = &obj_name();
   long long values[MAX_EVENTS];
   int my_events[MAX_EVENTS];
@@ -837,19 +907,19 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   int my_event_codes_count = MAX_EVENTS;
 
   // if sampling disabled explicitly for this thread, skip all processing
-  if (hpcrun_suppress_sample() || sample_filters_apply()) return;
+  if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
   if (!ovec) {
     TMSG(PAPI_SAMPLE, "papi overflow event: event set %d ovec = %ld",
 	 event_set, ovec);
-    return;
+		goto finish;
   }
 
   // If the interrupt came from inside our code, then drop the sample
   // and return and avoid any MSG.
   if (! hpcrun_safe_enter_async(pc)) {
     hpcrun_stats_num_samples_blocked_async_inc();
-    return;
+    goto finish;
   }
 
   int cidx = PAPI_get_eventset_component(event_set);
@@ -941,5 +1011,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
     }
   }
 
+finish:
+	papi_flag_off();
   hpcrun_safe_exit();
 }
diff --git a/src/tool/hpcrun/sample_sources_registered.c b/src/tool/hpcrun/sample_sources_registered.c
index 6b8064d749..187c6758a6 100644
--- a/src/tool/hpcrun/sample_sources_registered.c
+++ b/src/tool/hpcrun/sample_sources_registered.c
@@ -73,7 +73,7 @@
 
 static sample_source_t* registered_sample_sources = NULL;
 
-
+__thread bool papi_active = false;
 
 //------------------------------------------------------------------------------
 // interface operations 
@@ -91,7 +91,10 @@ hpcrun_sample_sources_register(void)
 
 }
 
-
+extern bool is_papi_active()
+{
+	return papi_active;
+}
 
 //------------------------------------------------------------------------------
 // interface operations 

From 6359ced949803ed5479b001be3e357c38fa3059c Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Wed, 1 Jul 2020 19:12:22 -0500
Subject: [PATCH 004/177] solved papi issue with init and deadlock on cupti

---
 src/tool/hpcrun/Makefile.am                 |  1 +
 src/tool/hpcrun/gpu/amd/roctracer-api.c     |  2 +-
 src/tool/hpcrun/gpu/gpu-correlation.c       |  4 +++-
 src/tool/hpcrun/gpu/gpu-correlation.h       |  2 +-
 src/tool/hpcrun/gpu/nvidia/cupti-api.c      | 10 +++++++---
 src/tool/hpcrun/main.c                      |  4 +++-
 src/tool/hpcrun/sample-sources/papi-c.c     |  6 +++---
 src/tool/hpcrun/sample_sources_registered.c |  4 ++--
 src/tool/hpcrun/tool_state.c                | 18 ++++++++++++++++++
 src/tool/hpcrun/tool_state.h                | 15 +++++++++++++++
 10 files changed, 54 insertions(+), 12 deletions(-)
 create mode 100644 src/tool/hpcrun/tool_state.c
 create mode 100644 src/tool/hpcrun/tool_state.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 48189a9379..dd41b52788 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -290,6 +290,7 @@ MY_BASE_FILES =				\
 	sample_event.c			\
 	sample_prob.c			\
 	sample_sources_all.c		\
+	tool_state.c			\
 	sample-sources/blame-shift/blame-shift.c \
 	sample-sources/blame-shift/blame-map.c   \
 	sample-sources/blame-shift/directed.c    \
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 1f7117fadf..157ac9165b 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -65,7 +65,7 @@
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 
 
-
+extern int is_papi_active();
 //******************************************************************************
 // macros
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/gpu-correlation.c b/src/tool/hpcrun/gpu/gpu-correlation.c
index da1bd70bcc..94f4b13ec7 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation.c
@@ -102,7 +102,7 @@ gpu_correlation_produce
 )
 {
   c->host_correlation_id = host_correlation_id;
-  c->gpu_op_ccts = *gpu_op_ccts;
+  if (gpu_op_ccts) c->gpu_op_ccts = *gpu_op_ccts;
   c->activity_channel = activity_channel;
   c->cpu_submit_time = cpu_submit_time;
 }
@@ -118,6 +118,8 @@ gpu_correlation_consume
     printf("gpu_correlation_consume(%ld, %ld,%ld)\n", c->host_correlation_id); 
 #else
     PRINT("Insert correlation id %ld\n", c->host_correlation_id);
+    if (c->host_correlation_id == PAPI_CORR_ID) return;
+
     gpu_host_correlation_map_insert(c->host_correlation_id, &(c->gpu_op_ccts), 
 				    c->cpu_submit_time, c->activity_channel);
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-correlation.h b/src/tool/hpcrun/gpu/gpu-correlation.h
index 5046db898b..e39ab4fe9e 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation.h
@@ -60,7 +60,7 @@
 
 #define UNIT_TEST_CORRELATION_HEADER 0
 
-
+#define PAPI_CORR_ID -1
 
 //******************************************************************************
 // forward type declarations
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 25fb97a466..887d34e617 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -103,7 +103,7 @@
 #include "cubin-hash-map.h"
 #include "cubin-id-map.h"
 
-extern bool is_papi_active();
+extern int is_papi_active();
 
 //******************************************************************************
 // macros
@@ -757,8 +757,12 @@ cupti_subscriber_callback
  const void *cb_info
 )
 {
-	TMSG(CUPTI, "papi_active = %d\n", is_papi_active());
-	if (is_papi_active()) return;
+
+	if (is_papi_active()) {
+		TMSG(CUPTI, "PAPI correlation callback");
+		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
+		return;
+	}
 
   if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
     const CUpti_ResourceData *rd = (const CUpti_ResourceData *) cb_info;
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 07ecd81813..51bd2328e5 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -941,11 +941,13 @@ monitor_init_process(int *argc, char **argv, void* data)
   // is because fnbounds_init will iterate over the load map 
   // and will invoke analysis on vdso
   hpcrun_save_vdso();
-  fnbounds_init();
+//  fnbounds_init();
 
   // init callbacks for each device //Module_ignore_map is here
   hpcrun_initializer_init();
 
+	fnbounds_init();
+
   hpcrun_registered_sources_init();
 
   control_knob_init();  
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 2ccbdc4daa..b21b40203d 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -135,7 +135,7 @@ static bool disable_papi_cuda = false;
 
 static kind_info_t *papi_kind;
 
-extern __thread bool papi_active;
+extern __thread int papi_active;
 
 /******************************************************************************
  * private operations 
@@ -143,13 +143,13 @@ extern __thread bool papi_active;
 static void
 papi_flag_on()
 {
-	papi_active = true;
+	papi_active++;
 }
 
 static void
 papi_flag_off()
 {
-	papi_active = false;
+	papi_active--;
 }
 
 static int
diff --git a/src/tool/hpcrun/sample_sources_registered.c b/src/tool/hpcrun/sample_sources_registered.c
index 187c6758a6..16696d9ad3 100644
--- a/src/tool/hpcrun/sample_sources_registered.c
+++ b/src/tool/hpcrun/sample_sources_registered.c
@@ -73,7 +73,7 @@
 
 static sample_source_t* registered_sample_sources = NULL;
 
-__thread bool papi_active = false;
+__thread int papi_active = false;
 
 //------------------------------------------------------------------------------
 // interface operations 
@@ -91,7 +91,7 @@ hpcrun_sample_sources_register(void)
 
 }
 
-extern bool is_papi_active()
+extern int is_papi_active()
 {
 	return papi_active;
 }
diff --git a/src/tool/hpcrun/tool_state.c b/src/tool/hpcrun/tool_state.c
new file mode 100644
index 0000000000..6eda8ff0f7
--- /dev/null
+++ b/src/tool/hpcrun/tool_state.c
@@ -0,0 +1,18 @@
+//
+// Created by dejan on 1.7.20..
+//
+
+#include "tool_state.h"
+
+static __thread int tool_active = false;
+
+void tool_enter(){
+	tool_active++;
+}
+void tool_exit(){
+	tool_active--;  // balance the increment done by tool_enter(); was "++", so the counter never returned to 0
+}
+
+bool is_tool_active(){
+	return tool_active;
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/tool_state.h b/src/tool/hpcrun/tool_state.h
new file mode 100644
index 0000000000..95bc91f67f
--- /dev/null
+++ b/src/tool/hpcrun/tool_state.h
@@ -0,0 +1,15 @@
+//
+// Created by dejan on 1.7.20..
+//
+
+#ifndef HPCTOOLKIT_TOOL_STATE_H
+#define HPCTOOLKIT_TOOL_STATE_H
+
+#include <stdbool.h>
+
+
+void tool_enter();
+void tool_exit();
+bool is_tool_active();
+
+#endif //HPCTOOLKIT_TOOL_STATE_H

From 957457418a815be6a47d6f1f171d79dda8af38bb Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Thu, 2 Jul 2020 13:08:48 -0500
Subject: [PATCH 005/177] papi cuda component available

---
 src/tool/hpcrun/Makefile.in                 | 32 +++++++--
 src/tool/hpcrun/gpu/amd/roctracer-api.c     |  2 +-
 src/tool/hpcrun/gpu/nvidia/cupti-api.c      |  4 +-
 src/tool/hpcrun/sample-sources/papi-c.c     | 75 +++++++++------------
 src/tool/hpcrun/sample_sources_registered.c |  6 --
 5 files changed, 62 insertions(+), 57 deletions(-)

diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 6ae50f016b..1e358b3164 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -428,7 +428,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
 	handling_sample.c hpcrun-initializers.c hpcrun_options.c \
 	hpcrun_stats.c loadmap.c metrics.c name.c rank.c \
-	sample_event.c sample_prob.c sample_sources_all.c \
+	sample_event.c sample_prob.c sample_sources_all.c tool_state.c \
 	sample-sources/blame-shift/blame-shift.c \
 	sample-sources/blame-shift/blame-map.c \
 	sample-sources/blame-shift/directed.c \
@@ -558,7 +558,7 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-loadmap.lo libhpcrun_la-metrics.lo \
 	libhpcrun_la-name.lo libhpcrun_la-rank.lo \
 	libhpcrun_la-sample_event.lo libhpcrun_la-sample_prob.lo \
-	libhpcrun_la-sample_sources_all.lo \
+	libhpcrun_la-sample_sources_all.lo libhpcrun_la-tool_state.lo \
 	sample-sources/blame-shift/libhpcrun_la-blame-shift.lo \
 	sample-sources/blame-shift/libhpcrun_la-blame-map.lo \
 	sample-sources/blame-shift/libhpcrun_la-directed.lo \
@@ -805,7 +805,7 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
 	handling_sample.c hpcrun-initializers.c hpcrun_options.c \
 	hpcrun_stats.c loadmap.c metrics.c name.c rank.c \
-	sample_event.c sample_prob.c sample_sources_all.c \
+	sample_event.c sample_prob.c sample_sources_all.c tool_state.c \
 	sample-sources/blame-shift/blame-shift.c \
 	sample-sources/blame-shift/blame-map.c \
 	sample-sources/blame-shift/directed.c \
@@ -936,6 +936,7 @@ am__objects_51 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-sample_event.$(OBJEXT) \
 	libhpcrun_o-sample_prob.$(OBJEXT) \
 	libhpcrun_o-sample_sources_all.$(OBJEXT) \
+	libhpcrun_o-tool_state.$(OBJEXT) \
 	sample-sources/blame-shift/libhpcrun_o-blame-shift.$(OBJEXT) \
 	sample-sources/blame-shift/libhpcrun_o-blame-map.$(OBJEXT) \
 	sample-sources/blame-shift/libhpcrun_o-directed.$(OBJEXT) \
@@ -1686,7 +1687,7 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
 	handling_sample.c hpcrun-initializers.c hpcrun_options.c \
 	hpcrun_stats.c loadmap.c metrics.c name.c rank.c \
-	sample_event.c sample_prob.c sample_sources_all.c \
+	sample_event.c sample_prob.c sample_sources_all.c tool_state.c \
 	sample-sources/blame-shift/blame-shift.c \
 	sample-sources/blame-shift/blame-map.c \
 	sample-sources/blame-shift/directed.c \
@@ -3424,6 +3425,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-thread_finalize.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-thread_use.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-threadmgr.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-tool_state.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-trace.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-weak.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-write_data.Plo@am__quote@
@@ -3462,6 +3464,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-thread_finalize.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-thread_use.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-threadmgr.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-tool_state.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-trace.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-weak.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-write_data.Po@am__quote@
@@ -4116,6 +4119,13 @@ libhpcrun_la-sample_sources_all.lo: sample_sources_all.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-sample_sources_all.lo `test -f 'sample_sources_all.c' || echo '$(srcdir)/'`sample_sources_all.c
 
+libhpcrun_la-tool_state.lo: tool_state.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-tool_state.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-tool_state.Tpo -c -o libhpcrun_la-tool_state.lo `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-tool_state.Tpo $(DEPDIR)/libhpcrun_la-tool_state.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tool_state.c' object='libhpcrun_la-tool_state.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-tool_state.lo `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+
 sample-sources/blame-shift/libhpcrun_la-blame-shift.lo: sample-sources/blame-shift/blame-shift.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/blame-shift/libhpcrun_la-blame-shift.lo -MD -MP -MF sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Tpo -c -o sample-sources/blame-shift/libhpcrun_la-blame-shift.lo `test -f 'sample-sources/blame-shift/blame-shift.c' || echo '$(srcdir)/'`sample-sources/blame-shift/blame-shift.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Tpo sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Plo
@@ -5621,6 +5631,20 @@ libhpcrun_o-sample_sources_all.obj: sample_sources_all.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-sample_sources_all.obj `if test -f 'sample_sources_all.c'; then $(CYGPATH_W) 'sample_sources_all.c'; else $(CYGPATH_W) '$(srcdir)/sample_sources_all.c'; fi`
 
+libhpcrun_o-tool_state.o: tool_state.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-tool_state.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-tool_state.Tpo -c -o libhpcrun_o-tool_state.o `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-tool_state.Tpo $(DEPDIR)/libhpcrun_o-tool_state.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tool_state.c' object='libhpcrun_o-tool_state.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-tool_state.o `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+
+libhpcrun_o-tool_state.obj: tool_state.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-tool_state.obj -MD -MP -MF $(DEPDIR)/libhpcrun_o-tool_state.Tpo -c -o libhpcrun_o-tool_state.obj `if test -f 'tool_state.c'; then $(CYGPATH_W) 'tool_state.c'; else $(CYGPATH_W) '$(srcdir)/tool_state.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-tool_state.Tpo $(DEPDIR)/libhpcrun_o-tool_state.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tool_state.c' object='libhpcrun_o-tool_state.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-tool_state.obj `if test -f 'tool_state.c'; then $(CYGPATH_W) 'tool_state.c'; else $(CYGPATH_W) '$(srcdir)/tool_state.c'; fi`
+
 sample-sources/blame-shift/libhpcrun_o-blame-shift.o: sample-sources/blame-shift/blame-shift.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/blame-shift/libhpcrun_o-blame-shift.o -MD -MP -MF sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Tpo -c -o sample-sources/blame-shift/libhpcrun_o-blame-shift.o `test -f 'sample-sources/blame-shift/blame-shift.c' || echo '$(srcdir)/'`sample-sources/blame-shift/blame-shift.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Tpo sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Po
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 157ac9165b..069f0beb39 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -64,8 +64,8 @@
 
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 
+#include "tool_state.h"
 
-extern int is_papi_active();
 //******************************************************************************
 // macros
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 887d34e617..dfb2272b42 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -103,7 +103,7 @@
 #include "cubin-hash-map.h"
 #include "cubin-id-map.h"
 
-extern int is_papi_active();
+#include "tool_state.h"
 
 //******************************************************************************
 // macros
@@ -758,7 +758,7 @@ cupti_subscriber_callback
 )
 {
 
-	if (is_papi_active()) {
+	if (is_tool_active()) {
 		TMSG(CUPTI, "PAPI correlation callback");
 		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
 		return;
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index b21b40203d..c4d10a3c9b 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -98,7 +98,7 @@
 #include <lush/lush-backtrace.h>
 #include <lib/prof-lean/hpcrun-fmt.h>
 
-
+#include "tool_state.h"
 
 /******************************************************************************
  * macros
@@ -135,22 +135,9 @@ static bool disable_papi_cuda = false;
 
 static kind_info_t *papi_kind;
 
-extern __thread int papi_active;
-
 /******************************************************************************
  * private operations 
  *****************************************************************************/
-static void
-papi_flag_on()
-{
-	papi_active++;
-}
-
-static void
-papi_flag_off()
-{
-	papi_active--;
-}
 
 static int
 get_event_index(sample_source_t *self, int event_code)
@@ -235,7 +222,7 @@ strip_papi_prefix(const char *str)
 static void
 METHOD_FN(init)
 {
-	papi_flag_on();
+	tool_enter();
   // PAPI_set_debug(0x3ff);
 
   // **NOTE: some papi components may start threads, so
@@ -287,13 +274,13 @@ METHOD_FN(init)
   }
 
   self->state = INIT;
-  papi_flag_off();
+  tool_exit();
 }
 
 static void
 METHOD_FN(thread_init)
 {
-	papi_flag_on();
+	tool_enter();
   TMSG(PAPI, "thread init");
   if (papi_unavail) { goto finish; }
 
@@ -305,13 +292,13 @@ METHOD_FN(thread_init)
   TMSG(PAPI, "thread init OK");
 
 finish:
-  papi_flag_off();
+  tool_exit();
 }
 
 static void
 METHOD_FN(thread_init_action)
 {
-	papi_flag_on();
+	tool_enter();
   TMSG(PAPI, "register thread");
   if (papi_unavail) { goto finish; }
 
@@ -323,13 +310,13 @@ METHOD_FN(thread_init_action)
   TMSG(PAPI, "register thread ok");
 
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 static void
 METHOD_FN(start)
 {
-	papi_flag_on();
+	tool_enter();
   int cidx;
   TMSG(PAPI, "start");
 
@@ -384,13 +371,13 @@ METHOD_FN(start)
   td->ss_state[self->sel_idx] = START;
 
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 static void
 METHOD_FN(thread_fini_action)
 {
-	papi_flag_on();
+	tool_enter();
   TMSG(PAPI, "unregister thread");
   if (papi_unavail) { goto finish; }
 
@@ -399,13 +386,13 @@ METHOD_FN(thread_fini_action)
   snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval);
   TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg);
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 static void
 METHOD_FN(stop)
 {
-	papi_flag_on();
+	tool_enter();
 
 	int cidx;
 
@@ -448,13 +435,13 @@ METHOD_FN(stop)
 
   TD_GET(ss_state)[self->sel_idx] = STOP;
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 static void
 METHOD_FN(shutdown)
 {
-	papi_flag_on();
+	tool_enter();
   TMSG(PAPI, "shutdown");
   if (papi_unavail) { goto finish; }
 
@@ -464,7 +451,7 @@ METHOD_FN(shutdown)
 
   self->state = UNINIT;
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 // Return true if PAPI recognizes the name, whether supported or not.
@@ -472,7 +459,7 @@ METHOD_FN(shutdown)
 static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
-	papi_flag_on();
+	tool_enter();
 	bool ret;
   ev_str = strip_papi_prefix(ev_str);
   
@@ -492,14 +479,14 @@ METHOD_FN(supports_event, const char *ev_str)
   ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK);
 
 finish:
-	papi_flag_off();
+	tool_exit();
 	return ret;
 }
  
 static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
-	papi_flag_on();
+	tool_enter();
   TMSG(PAPI, "process event list");
   if (papi_unavail) { goto finish; }
 
@@ -616,7 +603,7 @@ METHOD_FN(process_event_list, int lush_metrics)
   }
 
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 static void
@@ -627,7 +614,7 @@ METHOD_FN(finalize_event_list)
 static void
 METHOD_FN(gen_event_set, int lush_metrics)
 {
-	papi_flag_on();
+	tool_enter();
   thread_data_t *td = hpcrun_get_thread_data();
   int i;
   int ret;
@@ -728,13 +715,13 @@ METHOD_FN(gen_event_set, int lush_metrics)
   }
 
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 static void
 METHOD_FN(display_events)
 {
-	papi_flag_on();
+	tool_enter();
   PAPI_event_info_t info;
   int ev, ret, num_total, num_prof;
   int num_components, cidx;
@@ -811,7 +798,7 @@ METHOD_FN(display_events)
   printf("\n\n");
 
 finish:
-	papi_flag_off();
+	tool_exit();
 }
 
 
@@ -831,9 +818,9 @@ METHOD_FN(display_events)
 void
 hpcrun_disable_papi_cuda(void)
 {
-	papi_flag_on();
+	tool_enter();
   disable_papi_cuda = true;
-  papi_flag_off();
+  tool_exit();
 }
 
 /******************************************************************************
@@ -845,7 +832,7 @@ hpcrun_disable_papi_cuda(void)
 static int
 event_is_derived(int ev_code)
 {
-	papi_flag_on();
+	tool_enter();
 	int ret;
   PAPI_event_info_t info;
 
@@ -866,14 +853,14 @@ event_is_derived(int ev_code)
   ret = 1;
 
 finish:
-	papi_flag_off();
+	tool_exit();
 	return ret;
 }
 
 static void
 event_fatal_error(int ev_code, int papi_ret)
 {
-	papi_flag_on();
+	tool_enter();
   char name[1024];
 
   PAPI_event_code_to_name(ev_code, name);
@@ -888,14 +875,14 @@ event_fatal_error(int ev_code, int papi_ret)
   }
   hpcrun_ssfail_unsupported("PAPI", name);
 
-  papi_flag_off();
+  tool_exit();
 }
 
 static void
 papi_event_handler(int event_set, void *pc, long long ovec,
                    void *context)
 {
-	papi_flag_on();
+	tool_enter();
   sample_source_t *self = &obj_name();
   long long values[MAX_EVENTS];
   int my_events[MAX_EVENTS];
@@ -1012,6 +999,6 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   }
 
 finish:
-	papi_flag_off();
+	tool_exit();
   hpcrun_safe_exit();
 }
diff --git a/src/tool/hpcrun/sample_sources_registered.c b/src/tool/hpcrun/sample_sources_registered.c
index 16696d9ad3..99bc99fd87 100644
--- a/src/tool/hpcrun/sample_sources_registered.c
+++ b/src/tool/hpcrun/sample_sources_registered.c
@@ -73,7 +73,6 @@
 
 static sample_source_t* registered_sample_sources = NULL;
 
-__thread int papi_active = false;
 
 //------------------------------------------------------------------------------
 // interface operations 
@@ -91,11 +90,6 @@ hpcrun_sample_sources_register(void)
 
 }
 
-extern int is_papi_active()
-{
-	return papi_active;
-}
-
 //------------------------------------------------------------------------------
 // interface operations 
 //------------------------------------------------------------------------------

From 121e3c068ee7343de17b013f00bd1e5243a376c9 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 6 Jul 2020 09:13:57 -0500
Subject: [PATCH 006/177] Rocm support added

---
 src/tool/hpcrun/gpu/amd/roctracer-api.c      | 6 ++++++
 src/tool/hpcrun/main.c                       | 4 ++--
 src/tool/hpcrun/messages/messages.flag-defns | 1 +
 src/tool/hpcrun/tool_state.c                 | 4 ++--
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 069f0beb39..d668a485db 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -256,6 +256,12 @@ roctracer_subscriber_callback
  void* arg
 )
 {
+  if (is_tool_active()) {
+		TMSG(ROCM, "PAPI correlation callback");
+		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
+		return;
+  }
+
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
   bool is_valid_op = false;
   const hip_api_data_t* data = (const hip_api_data_t*)(callback_data);
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 51bd2328e5..a0679949e5 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -551,7 +551,7 @@ hpcrun_init_internal(bool is_child)
   //
 
   if (! is_child) {
-			SAMPLE_SOURCES(process_event_list, lush_metrics);
+  	SAMPLE_SOURCES(process_event_list, lush_metrics);
     SAMPLE_SOURCES(finalize_event_list);
     hpcrun_metrics_data_finalize();
   }
@@ -941,11 +941,11 @@ monitor_init_process(int *argc, char **argv, void* data)
   // is because fnbounds_init will iterate over the load map 
   // and will invoke analysis on vdso
   hpcrun_save_vdso();
-//  fnbounds_init();
 
   // init callbacks for each device //Module_ignore_map is here
   hpcrun_initializer_init();
 
+  // fnbounds must be after module_ignore_map
 	fnbounds_init();
 
   hpcrun_registered_sources_init();
diff --git a/src/tool/hpcrun/messages/messages.flag-defns b/src/tool/hpcrun/messages/messages.flag-defns
index 22446c0cbe..a5494889c4 100644
--- a/src/tool/hpcrun/messages/messages.flag-defns
+++ b/src/tool/hpcrun/messages/messages.flag-defns
@@ -155,6 +155,7 @@
  E(CUPTI_TRACE),
  E(CUDA_CUBIN),
  E(CUPTI_ACTIVITY),
+ E(ROCM),
  E(DATACENTRIC),
  E(IDLE),
  E(MAIN_BOUNDS),
diff --git a/src/tool/hpcrun/tool_state.c b/src/tool/hpcrun/tool_state.c
index 6eda8ff0f7..3afaef4a46 100644
--- a/src/tool/hpcrun/tool_state.c
+++ b/src/tool/hpcrun/tool_state.c
@@ -10,9 +10,9 @@ void tool_enter(){
 	tool_active++;
 }
 void tool_exit(){
-	tool_active++;
+	tool_active--;
 }
 
 bool is_tool_active(){
 	return tool_active;
-}
\ No newline at end of file
+}

From 3124dac1bbce52652c2f331add604eb468ae900f Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Thu, 9 Jul 2020 01:42:12 +0000
Subject: [PATCH 007/177] created intel gtpin instrumentation files for hpcrun

---
 src/tool/hpcrun/Makefile.am                   |  18 +-
 src/tool/hpcrun/Makefile.in                   | 155 ++++++-
 src/tool/hpcrun/gpu/gpu-activity-process.c    |  45 ++
 src/tool/hpcrun/gpu/gpu-activity.h            |  38 +-
 src/tool/hpcrun/gpu/gpu-metrics.c             |  45 +-
 src/tool/hpcrun/gpu/gpu-metrics.h             |  18 +
 .../gtpin-instrumentation-kernel-data-map.c   | 125 ++++++
 .../gtpin-instrumentation-kernel-data-map.h   |  62 +++
 .../gtpin-instrumentation-kernel-memory-map.c | 126 ++++++
 .../gtpin-instrumentation-kernel-memory-map.h |  60 +++
 .../instrumentation/opencl-instrumentation.c  | 399 ++++++++++++++++++
 .../instrumentation/opencl-instrumentation.h  |  18 +
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c |   3 +
 13 files changed, 1071 insertions(+), 41 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 790d87883d..7966b690a0 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -507,11 +507,14 @@ endif
 if ENABLE_OPENCL
 #MY_OPENCL_FILES =
 MY_BASE_FILES += \
-	sample-sources/opencl.c 				\
-	gpu/opencl/opencl-intercept.c			\
-	gpu/opencl/opencl-api.c					\
-	gpu/opencl/opencl-memory-manager.c  	\
-	gpu/opencl/opencl-activity-translate.c
+	sample-sources/opencl.c 												\
+	gpu/opencl/opencl-intercept.c										\
+	gpu/opencl/opencl-api.c													\
+	gpu/opencl/opencl-memory-manager.c  						\
+	gpu/opencl/opencl-activity-translate.c					\
+	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c			\
+	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c	\
+	gpu/instrumentation/opencl-instrumentation.c
 endif
 
 
@@ -961,8 +964,9 @@ if OPT_ENABLE_ROCM
 endif
 
 if ENABLE_OPENCL
-  libhpcrun_la_CFLAGS += $(OPENCL_IFLAGS)
-
+	libhpcrun_la_CFLAGS += $(OPENCL_IFLAGS)
+	libhpcrun_la_LDFLAGS += "-L/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64 -lgtpin"
+	libhpcrun_la_LDFLAGS +=	"-Wl,-rpath='/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64'"
   MY_CPP_DEFINES  += -DHPCRUN_SS_OPENCL
 endif 
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 82c3d3dcab..4a88fa5b96 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -168,11 +168,14 @@ pkglibexec_PROGRAMS =
 
 #MY_OPENCL_FILES =
 @ENABLE_OPENCL_TRUE@am__append_17 = \
-@ENABLE_OPENCL_TRUE@	sample-sources/opencl.c 				\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c			\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c					\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c  	\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c
+@ENABLE_OPENCL_TRUE@	sample-sources/opencl.c 												\
+@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c										\
+@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c													\
+@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c  						\
+@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c					\
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c			\
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c	\
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/opencl-instrumentation.c
 
 
 #
@@ -301,10 +304,9 @@ pkglibexec_PROGRAMS =
 @OPT_ENABLE_ROCM_TRUE@am__append_126 = -DENABLE_ROCM
 @OPT_ENABLE_ROCM_TRUE@am__append_127 = $(OPT_ROCM_IFLAGS) -g
 @OPT_ENABLE_ROCM_TRUE@am__append_128 = -DHPCRUN_SS_AMD
-@ENABLE_OPENCL_TRUE@am__append_129 = $(OPENCL_IFLAGS)
-@ENABLE_OPENCL_TRUE@am__append_130 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_131 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_132 = libagent-pthread.la \
+@ENABLE_OPENCL_TRUE@am__append_129 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_130 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_131 = libagent-pthread.la \
 @OPT_ENABLE_LUSH_TRUE@	libagent-tbb.la
 subdir = src/tool/hpcrun
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -514,6 +516,9 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/opencl.c gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+	gpu/instrumentation/opencl-instrumentation.c \
 	fnbounds/fnbounds_client.c fnbounds/fnbounds_dynamic.c \
 	monitor-exts/openmp.c hpcrun_dlfns.c custom-init-dynamic.c \
 	os/linux/dylib.c unwind/common/default_validation_summary.c \
@@ -578,7 +583,10 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 @ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
 @ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
 @ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
+@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo \
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo \
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo
 am__objects_15 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-main.lo libhpcrun_la-disabled.lo \
 	libhpcrun_la-closure-registry.lo \
@@ -910,6 +918,9 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/opencl.c gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+	gpu/instrumentation/opencl-instrumentation.c \
 	fnbounds/fnbounds_static.c custom-init-static.c \
 	unwind/common/default_validation_summary.c \
 	trampoline/ppc64/ppc64-tramp.s \
@@ -970,7 +981,10 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 @ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-intercept.$(OBJEXT) \
 @ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-api.$(OBJEXT) \
 @ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-memory-manager.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT)
+@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT) \
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.$(OBJEXT) \
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.$(OBJEXT) \
+@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_o-opencl-instrumentation.$(OBJEXT)
 am__objects_53 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-main.$(OBJEXT) libhpcrun_o-disabled.$(OBJEXT) \
 	libhpcrun_o-closure-registry.$(OBJEXT) \
@@ -1646,8 +1660,8 @@ bin_SCRIPTS = $(am__append_4) $(am__append_6)
 pkglibexec_SCRIPTS = $(am__append_1)
 include_HEADERS = $(am__append_2)
 pkglib_LIBRARIES = $(am__append_5)
-pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_131) \
-	$(am__append_132)
+pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_130) \
+	$(am__append_131)
 BUILT_SOURCES = $(am__append_21)
 CLEANFILES = $(am__append_22)
 PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@ 
@@ -1737,7 +1751,7 @@ UNW_MIPS_LD_FLAGS =
 MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
 	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_18) \
 	$(am__append_103) $(am__append_107) $(am__append_109) \
-	$(am__append_113) $(am__append_128) $(am__append_130)
+	$(am__append_113) $(am__append_128) $(am__append_129)
 MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -1866,7 +1880,9 @@ MY_INCLUDE_DIRS = \
         $(OPT_CUDA_IFLAGS)             \
         $(OPT_CUPTI_IFLAGS)             \
 	-I$(LIBELF_INC)			\
-	-I$(LIBMONITOR_INC)
+	-I$(LIBMONITOR_INC)	\
+	$(OPENCL_IFLAGS)	\
+	$(GOTCHA_IFLAGS)
 
 MY_MIPS_INCLUDE_DIRS = \
 	-I$(srcdir)/unwind/mips
@@ -2003,8 +2019,7 @@ libhpctoolkit_a_CPPFLAGS = \
 # cflags
 #-----------------------------------------------------------
 libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
-	$(am__append_123) $(am__append_127) $(am__append_129) \
-	$(GOTCHA_IFLAGS)
+	$(am__append_123) $(am__append_127) $(GOTCHA_IFLAGS)
 libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
 libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2597,6 +2612,21 @@ gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
 gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/$(am__dirstamp):
+	@$(MKDIR_P) gpu/instrumentation
+	@: > gpu/instrumentation/$(am__dirstamp)
+gpu/instrumentation/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/instrumentation/$(DEPDIR)
+	@: > gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_la-fnbounds_client.lo: fnbounds/$(am__dirstamp) \
 	fnbounds/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_la-fnbounds_dynamic.lo: fnbounds/$(am__dirstamp) \
@@ -3190,6 +3220,15 @@ gpu/opencl/libhpcrun_o-opencl-memory-manager.$(OBJEXT):  \
 gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT):  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.$(OBJEXT):  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.$(OBJEXT):  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_o-opencl-instrumentation.$(OBJEXT):  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT):  \
 	fnbounds/$(am__dirstamp) fnbounds/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT):  \
@@ -3442,6 +3481,8 @@ mostlyclean-compile:
 	-rm -f gpu/*.lo
 	-rm -f gpu/amd/*.$(OBJEXT)
 	-rm -f gpu/amd/*.lo
+	-rm -f gpu/instrumentation/*.$(OBJEXT)
+	-rm -f gpu/instrumentation/*.lo
 	-rm -f gpu/nvidia/*.$(OBJEXT)
 	-rm -f gpu/nvidia/*.lo
 	-rm -f gpu/opencl/*.$(OBJEXT)
@@ -3643,6 +3684,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_la-cubin-hash-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_la-cubin-id-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_la-cubin-symbols.Plo@am__quote@
@@ -5046,6 +5093,27 @@ gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
 
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+
+gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo: gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+
 fnbounds/libhpcrun_la-fnbounds_client.lo: fnbounds/fnbounds_client.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT fnbounds/libhpcrun_la-fnbounds_client.lo -MD -MP -MF fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Tpo -c -o fnbounds/libhpcrun_la-fnbounds_client.lo `test -f 'fnbounds/fnbounds_client.c' || echo '$(srcdir)/'`fnbounds/fnbounds_client.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Tpo fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Plo
@@ -7384,6 +7452,48 @@ gpu/opencl/libhpcrun_o-opencl-activity-translate.obj: gpu/opencl/opencl-activity
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.obj `if test -f 'gpu/opencl/opencl-activity-translate.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-activity-translate.c'; fi`
 
+gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+
+gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; fi`
+
+gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+
+gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; fi`
+
+gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o: gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+
+gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj: gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj `if test -f 'gpu/instrumentation/opencl-instrumentation.c'; then $(CYGPATH_W) 'gpu/instrumentation/opencl-instrumentation.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/opencl-instrumentation.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj `if test -f 'gpu/instrumentation/opencl-instrumentation.c'; then $(CYGPATH_W) 'gpu/instrumentation/opencl-instrumentation.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/opencl-instrumentation.c'; fi`
+
 fnbounds/libhpcrun_o-fnbounds_static.o: fnbounds/fnbounds_static.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT fnbounds/libhpcrun_o-fnbounds_static.o -MD -MP -MF fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Tpo -c -o fnbounds/libhpcrun_o-fnbounds_static.o `test -f 'fnbounds/fnbounds_static.c' || echo '$(srcdir)/'`fnbounds/fnbounds_static.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Tpo fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Po
@@ -8176,6 +8286,7 @@ clean-libtool:
 	-rm -rf fnbounds/.libs fnbounds/_libs
 	-rm -rf gpu/.libs gpu/_libs
 	-rm -rf gpu/amd/.libs gpu/amd/_libs
+	-rm -rf gpu/instrumentation/.libs gpu/instrumentation/_libs
 	-rm -rf gpu/nvidia/.libs gpu/nvidia/_libs
 	-rm -rf gpu/opencl/.libs gpu/opencl/_libs
 	-rm -rf lush/.libs lush/_libs
@@ -8427,6 +8538,8 @@ distclean-generic:
 	-rm -f gpu/$(am__dirstamp)
 	-rm -f gpu/amd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f gpu/amd/$(am__dirstamp)
+	-rm -f gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+	-rm -f gpu/instrumentation/$(am__dirstamp)
 	-rm -f gpu/nvidia/$(DEPDIR)/$(am__dirstamp)
 	-rm -f gpu/nvidia/$(am__dirstamp)
 	-rm -f gpu/opencl/$(DEPDIR)/$(am__dirstamp)
@@ -8497,7 +8610,7 @@ clean-am: clean-generic clean-libtool clean-noinstPROGRAMS \
 	clean-pkglibexecPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
@@ -8547,7 +8660,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -8614,6 +8727,10 @@ endef
 @OPT_ENABLE_MPI_WRAP_TRUE@	$(PYTHON) $(srcdir)/sample-sources/make-wrappers.py \
 @OPT_ENABLE_MPI_WRAP_TRUE@		--f77symbol $(F77_SYMBOLS) $(srcdir)/sample-sources/$(MPI_PROTO_FILE)
 
+# FIXME(review): the gtpin -L and -rpath below are hard-coded to one developer's home directory; they should come from configure
+@ENABLE_OPENCL_TRUE@	libhpcrun_la_CFLAGS += $(OPENCL_IFLAGS)
+@ENABLE_OPENCL_TRUE@	libhpcrun_la_LDFLAGS += "-L/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64 -lgtpin"
+@ENABLE_OPENCL_TRUE@	libhpcrun_la_LDFLAGS +=	"-Wl,-rpath='/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64'"
 #-----------------------------------------------------------
 # local hooks
 #-----------------------------------------------------------
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 5592e44d59..cdfe02f829 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -405,6 +405,47 @@ gpu_kernel_process
 }
 
 
+// Attach a kernel-block activity to a CCT child under the host-op node found via its correlation id.
+static void
+gpu_kernel_block_process
+(
+ gpu_activity_t* activity
+)
+{
+  uint32_t correlation_id = activity->details.kernel_block.correlation_id;
+
+  gpu_correlation_id_map_entry_t *cid_entry =
+    gpu_correlation_id_map_lookup(correlation_id);
+
+  if (cid_entry == NULL) {
+    PRINT("correlation_id_map_entry %u not found\n", correlation_id);
+    return;
+  }
+
+  uint64_t external_id =
+    gpu_correlation_id_map_entry_external_id_get(cid_entry);
+
+  gpu_host_correlation_map_entry_t *host_entry =
+    gpu_host_correlation_map_lookup(external_id);
+
+  if (host_entry == NULL) {
+    PRINT("host_map_entry %lu not found\n", external_id);
+    return;
+  }
+
+  PRINT("external_id %lu\n", external_id);
+
+  // the child node holds 2 metrics: block offset wrt. the original binary and
+  // the block's dynamic execution count (open question: how to set the ip_norm)
+  cct_node_t *parent = gpu_host_correlation_map_entry_op_function_get(host_entry);
+  cct_node_t *block_node = hpcrun_cct_insert_ip_norm(parent, activity->details.kernel_block.pc);
+
+  if (block_node != NULL) {
+    PRINT("cct_child %p\n", block_node);
+    attribute_activity(host_entry, activity, block_node);
+  }
+}
+
 static void
 gpu_synchronization_process
 (
@@ -604,6 +645,10 @@ gpu_activity_process
     gpu_kernel_process(ga);
     break;
 
+	case GPU_ACTIVITY_KERNEL_BLOCK:
+		gpu_kernel_block_process(ga);
+		break;
+
   case GPU_ACTIVITY_SYNCHRONIZATION:
     gpu_synchronization_process(ga);
     break;
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 0b672c5ae8..2c5d219ec5 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -80,20 +80,21 @@ typedef struct gpu_activity_channel_t gpu_activity_channel_t;
 typedef enum {    
   GPU_ACTIVITY_UNKNOWN                 = 0,
   GPU_ACTIVITY_KERNEL                  = 1,
-  GPU_ACTIVITY_MEMCPY                  = 2,
-  GPU_ACTIVITY_MEMCPY2                 = 3,
-  GPU_ACTIVITY_MEMSET                  = 4,
-  GPU_ACTIVITY_MEMORY                  = 5,    
-  GPU_ACTIVITY_SYNCHRONIZATION         = 6,
-  GPU_ACTIVITY_GLOBAL_ACCESS           = 7,
-  GPU_ACTIVITY_LOCAL_ACCESS            = 8,
-  GPU_ACTIVITY_BRANCH                  = 9,
-  GPU_ACTIVITY_CDP_KERNEL              = 10,
-  GPU_ACTIVITY_PC_SAMPLING             = 11,
-  GPU_ACTIVITY_PC_SAMPLING_INFO        = 12, 
-  GPU_ACTIVITY_EXTERNAL_CORRELATION    = 13,
-  GPU_ACTIVITY_EVENT                   = 14,
-  GPU_ACTIVITY_FUNCTION                = 15
+	GPU_ACTIVITY_KERNEL_BLOCK						 = 2,	// NOTE(review): inserted mid-enum, so every later kind is renumbered
+  GPU_ACTIVITY_MEMCPY                  = 3,
+  GPU_ACTIVITY_MEMCPY2                 = 4,
+  GPU_ACTIVITY_MEMSET                  = 5,
+  GPU_ACTIVITY_MEMORY                  = 6,    
+  GPU_ACTIVITY_SYNCHRONIZATION         = 7,
+  GPU_ACTIVITY_GLOBAL_ACCESS           = 8,
+  GPU_ACTIVITY_LOCAL_ACCESS            = 9,
+  GPU_ACTIVITY_BRANCH                  = 10,
+  GPU_ACTIVITY_CDP_KERNEL              = 11,
+  GPU_ACTIVITY_PC_SAMPLING             = 12,
+  GPU_ACTIVITY_PC_SAMPLING_INFO        = 13, 
+  GPU_ACTIVITY_EXTERNAL_CORRELATION    = 14,
+  GPU_ACTIVITY_EVENT                   = 15,
+  GPU_ACTIVITY_FUNCTION                = 16
 } gpu_activity_kind_t;
 
 
@@ -258,6 +259,14 @@ typedef struct gpu_kernel_t {
 } gpu_kernel_t;
 
 
+// Details carried by a GPU_ACTIVITY_KERNEL_BLOCK activity.
+typedef struct gpu_kernel_block_t {
+	uint32_t correlation_id;	// looked up in the correlation id map when the activity is processed
+	uint64_t offset;	// attributed as the KER_BLK_OFFSET metric
+	uint64_t execution_count;	// attributed as the KER_BLK_EXECUTION_COUNT metric
+	ip_normalized_t pc;	// ip used to insert the block's CCT child node
+} gpu_kernel_block_t;
+
 typedef struct gpu_cdpkernel_t {
   uint64_t start;
   uint64_t end;
@@ -351,6 +360,7 @@ typedef struct gpu_activity_details_t {
     gpu_memory_t memory;
     gpu_memset_t memset;
     gpu_kernel_t kernel;
+		gpu_kernel_block_t kernel_block;
     gpu_function_t function;
     gpu_cdpkernel_t cdpkernel;
     gpu_event_t event;
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index a0ab7d6e2e..1f2ad38b12 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -84,7 +84,8 @@
   macro(GPU_INST, 9)				\
   macro(GTIMES, 10)				\
   macro(KINFO, 12)				\
-  macro(GSAMP, 13)			
+  macro(GSAMP, 13)				\
+	macro(KER_BLKINFO, 14)
 
 
 #define FORALL_METRIC_KINDS(macro)	\
@@ -442,6 +443,26 @@ gpu_metrics_attribute_kernel
 }
 
 
+// Attribute a kernel-block activity's offset and execution count to its CCT node.
+static void
+gpu_metrics_attribute_kernel_block
+(
+ gpu_activity_t *activity
+)
+{
+  gpu_kernel_block_t *b = &(activity->details.kernel_block);
+  cct_node_t *cct_node = activity->cct_node;
+
+  // reify with a kernel-block metric id rather than GPU_KINFO_STMEM_ACUMU, so the set matches the
+  // metrics attributed below. NOTE(review): assumes the set is selected by the id passed -- confirm
+  metric_data_list_t *metrics =
+    hpcrun_reify_metric_set(cct_node, METRIC_ID(KER_BLK_OFFSET));
+
+  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(KER_BLK_OFFSET), b->offset);
+  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(KER_BLK_EXECUTION_COUNT), b->execution_count);	// TODO (original author): need to increment execution count for existing ccts
+}
+
+
 static void
 gpu_metrics_attribute_synchronization
 (
@@ -580,6 +601,10 @@ gpu_metrics_attribute
   case GPU_ACTIVITY_KERNEL:
     gpu_metrics_attribute_kernel(activity);
     break;
+
+	case GPU_ACTIVITY_KERNEL_BLOCK:
+		gpu_metrics_attribute_kernel_block(activity);
+		break;
     
   case GPU_ACTIVITY_SYNCHRONIZATION:
     gpu_metrics_attribute_synchronization(activity);
@@ -701,6 +726,24 @@ gpu_metrics_KINFO_enable
 }
 
 
+// Set up the KER_BLKINFO metric kind: INITIALIZE_SCALAR_METRIC_INT is applied to each FORALL_KER_BLKINFO metric.
+void
+gpu_metrics_KER_BLKINFO_enable
+(
+ void
+)
+{
+// kernel block characteristics metrics; CURRENT_METRIC is rebound for the macros used below
+#undef CURRENT_METRIC 
+#define CURRENT_METRIC KER_BLKINFO
+
+  INITIALIZE_METRIC_KIND();
+
+  FORALL_KER_BLKINFO(INITIALIZE_SCALAR_METRIC_INT) // NOTE(review): name suggests integer scalar metrics -- confirm
+
+  FINALIZE_METRIC_KIND();
+}
+
 void
 gpu_metrics_GICOPY_enable
 (
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 0da2a7f34f..1dd3f42a11 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -381,6 +381,13 @@ typedef enum {
   FORALL_GSAMP_INT(macro)			\
   FORALL_GSAMP_REAL(macro)				
 
+// kernel-block metrics; each macro(...) entry gives: display name, metric identifier, description
+#define FORALL_KER_BLKINFO(macro)		\
+  macro("KER:BLK_OFST (B)",            KER_BLK_OFFSET,		\
+	"block offset with respect to kernel binary")		\
+  macro("KER:BLK_EXEC_COUNT",            KER_BLK_EXECUTION_COUNT,		\
+	"count of number of dynamic executions of block")
+
 
 //******************************************************************************
 // interface operations
@@ -408,6 +415,17 @@ gpu_metrics_KINFO_enable
 );
 
 
+//--------------------------------------------------
+// enable metrics recorded by Intel GTPin kernel-block instrumentation
+//--------------------------------------------------
+
+void
+gpu_metrics_KER_BLKINFO_enable
+(
+ void
+);
+
+
 //--------------------------------------------------
 // record implicit copy metrics for unified memory
 //--------------------------------------------------
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
new file mode 100644
index 0000000000..d0af48e6c2
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
@@ -0,0 +1,125 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <string.h>
+#include <assert.h>
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <lib/prof-lean/splay-uint64.h>
+
+#include "gtpin-instrumentation-kernel-data-map.h"
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+// kdm_* are short names for the typed splay operations on kernel_data_map
+#define kdm_insert typed_splay_insert(kernel_data_map)
+
+#define kdm_lookup typed_splay_lookup(kernel_data_map)
+
+#define kdm_delete typed_splay_delete(kernel_data_map)
+
+#define kdm_forall typed_splay_forall(kernel_data_map)
+
+#define kdm_count typed_splay_count(kernel_data_map)
+
+// node allocation and release go through a caller-supplied free list
+#define kdm_alloc(free_list) \
+  typed_splay_alloc(free_list, kernel_data_map_t)
+
+#define kdm_free(free_list, node) \
+  typed_splay_free(free_list, node)
+
+// NOTE(review): presumably instantiates the typed splay routines that the kdm_*
+// macros name, for kernel_data_map nodes -- confirm against the splay headers
+
+typed_splay_impl(kernel_data_map);
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static kernel_data_map_t *kernel_data_map_root = NULL;
+static kernel_data_map_t *kernel_data_map_free_list = NULL;
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+// Obtain a node through kdm_alloc, passing the file-local free list.
+static kernel_data_map_t *
+kernel_data_alloc(void)
+{
+  return kdm_alloc(&kernel_data_map_free_list);
+}
+
+// Build a zeroed map node carrying the given key and payload.
+static kernel_data_map_t *
+kernel_data_new
+(
+	uint64_t GTPinKernel_id,
+	KernelData data
+)
+{
+  kernel_data_map_t *node = kernel_data_alloc();
+  memset(node, 0, sizeof *node);	// clears the left/right links as well
+  node->data = data;
+  node->GTPinKernel_id = GTPinKernel_id;
+  return node;
+}
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Look up GTPinKernel_id in the map and return whatever kdm_lookup yields.
+kernel_data_map_t*
+kernel_data_map_lookup1
+(
+	uint64_t GTPinKernel_id
+)
+{
+	return kdm_lookup(&kernel_data_map_root, GTPinKernel_id);
+}
+
+
+// Add a (GTPinKernel_id, data) entry; a given key may be inserted only once.
+void
+kernel_data_map_insert1
+(
+	uint64_t GTPinKernel_id,
+	KernelData data
+)
+{
+	if (kdm_lookup(&kernel_data_map_root, GTPinKernel_id) != NULL) {
+		assert(0);	// duplicate key
+		return;
+	}
+	kernel_data_map_t *fresh = kernel_data_new(GTPinKernel_id, data);
+	kdm_insert(&kernel_data_map_root, fresh);
+}
+
+// Remove the entry keyed by GTPinKernel_id and hand its node back to the free list.
+void
+kernel_data_map_delete1
+(
+	uint64_t GTPinKernel_id
+)
+{
+	kernel_data_map_t *node = kdm_delete(&kernel_data_map_root, GTPinKernel_id);
+	if (node != NULL) kdm_free(&kernel_data_map_free_list, node);	// nothing removed: nothing to release
+}
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
new file mode 100644
index 0000000000..5324466751
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
@@ -0,0 +1,62 @@
+#ifndef gtpin_instrumentation_kernel_data_map_h
+#define gtpin_instrumentation_kernel_data_map_h
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <stdint.h>
+//#include <gtpin.h>
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+// payload stored for one kernel; map entries are keyed by GTPinKernel_id
+typedef struct KernelData {
+	uint64_t kernel_cct_correlation_id;
+  char *name;
+  uint32_t call_count;
+	uint32_t loadmap_module_id;
+	//block_map_t *block_map_root;
+} KernelData;
+
+
+#undef typed_splay_node
+#define typed_splay_node(kernel_data_map) kernel_data_map_t
+
+typedef struct typed_splay_node(kernel_data_map) {
+  struct typed_splay_node(kernel_data_map) *left;
+  struct typed_splay_node(kernel_data_map) *right;
+  uint64_t GTPinKernel_id; // key
+	KernelData data;
+}typed_splay_node(kernel_data_map);
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+kernel_data_map_t*
+kernel_data_map_lookup1
+(
+	uint64_t GTPinKernel_id
+);
+
+
+void
+kernel_data_map_insert1
+(
+	uint64_t GTPinKernel_id,
+	KernelData data
+);
+
+
+void
+kernel_data_map_delete1
+(
+	uint64_t GTPinKernel_id
+);
+
+#endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
new file mode 100644
index 0000000000..003f087caa
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
@@ -0,0 +1,126 @@
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <string.h>
+#include <assert.h>
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <lib/prof-lean/splay-uint64.h>
+
+#include "gtpin-instrumentation-kernel-memory-map.h"
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+// kmm_* are short names for the typed splay operations on kernel_memory_map
+#define kmm_insert typed_splay_insert(kernel_memory_map)
+
+#define kmm_lookup typed_splay_lookup(kernel_memory_map)
+
+#define kmm_delete typed_splay_delete(kernel_memory_map)
+
+#define kmm_forall typed_splay_forall(kernel_memory_map)
+
+#define kmm_count typed_splay_count(kernel_memory_map)
+
+// node allocation and release go through a caller-supplied free list
+#define kmm_alloc(free_list) \
+  typed_splay_alloc(free_list, kernel_memory_map_t)
+
+#define kmm_free(free_list, node) \
+  typed_splay_free(free_list, node)
+
+// NOTE(review): presumably instantiates the typed splay routines that the kmm_*
+// macros name, for kernel_memory_map nodes -- confirm against the splay headers
+
+typed_splay_impl(kernel_memory_map);
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static kernel_memory_map_t *kernel_memory_map_root = NULL;
+static kernel_memory_map_t *kernel_memory_map_free_list = NULL;
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+// Obtain a node through kmm_alloc, passing the file-local free list.
+static kernel_memory_map_t *
+kernel_mem_alloc(void)
+{
+  return kmm_alloc(&kernel_memory_map_free_list);
+}
+
+// Build a zeroed map node carrying the given key and list head.
+static kernel_memory_map_t *
+kernel_mem_new
+(
+	uint64_t GTPinKernel_id,
+	mem_pair_node *head
+)
+{
+  kernel_memory_map_t *node = kernel_mem_alloc();
+  memset(node, 0, sizeof *node);	// clears the left/right links as well
+  node->head = head;
+  node->GTPinKernel_id = GTPinKernel_id;
+  return node;
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Look up GTPinKernel_id in the map and return whatever kmm_lookup yields.
+kernel_memory_map_t*
+kernel_memory_map_lookup1
+(
+	uint64_t GTPinKernel_id
+)
+{
+	return kmm_lookup(&kernel_memory_map_root, GTPinKernel_id);
+}
+
+
+// Add a (GTPinKernel_id, head) entry; a given key may be inserted only once.
+void
+kernel_memory_map_insert1
+(
+	uint64_t GTPinKernel_id,
+	mem_pair_node *head
+)
+{
+	if (kmm_lookup(&kernel_memory_map_root, GTPinKernel_id) != NULL) {
+		assert(0);	// duplicate key
+		return;
+	}
+	kernel_memory_map_t *fresh = kernel_mem_new(GTPinKernel_id, head);
+	kmm_insert(&kernel_memory_map_root, fresh);
+}
+
+// Remove the entry keyed by GTPinKernel_id and hand its node back to the free list.
+void
+kernel_memory_map_delete1
+(
+	uint64_t GTPinKernel_id
+)
+{
+	kernel_memory_map_t *node = kmm_delete(&kernel_memory_map_root, GTPinKernel_id);
+	if (node != NULL) kmm_free(&kernel_memory_map_free_list, node);	// nothing removed: nothing to release
+}
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
new file mode 100644
index 0000000000..c9f0b89686
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
@@ -0,0 +1,60 @@
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <gtpin.h>
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+// list node pairing the code offset of an instrumented basic block with a GTPin memory handle
+typedef struct mem_pair_node {
+	int32_t offset; // value of GTPin_InsOffset for the block's first instruction (set in onKernelBuild)
+	GTPinMem mem; // handle obtained from GTPin_MemClaim and passed to GTPin_OpcodeprofInstrument as the counter slot
+	struct mem_pair_node *next; // next pair of the same kernel
+} mem_pair_node;
+
+
+#undef typed_splay_node
+#define typed_splay_node(kernel_memory_map) kernel_memory_map_t
+
+// map entry (the macro above names the type kernel_memory_map_t): one node per GTPin kernel
+typedef struct typed_splay_node(kernel_memory_map) {
+  struct typed_splay_node(kernel_memory_map) *left;
+  struct typed_splay_node(kernel_memory_map) *right;
+  uint64_t GTPinKernel_id; // key
+
+	mem_pair_node *head; // first (offset, memory handle) pair recorded for the kernel
+
+} typed_splay_node(kernel_memory_map);
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// entry stored for GTPinKernel_id, as reported by the map lookup
+kernel_memory_map_t*
+kernel_memory_map_lookup1
+(
+	uint64_t GTPinKernel_id
+);
+
+// add an entry mapping GTPinKernel_id to the list 'head' (once per key)
+void
+kernel_memory_map_insert1
+(
+	uint64_t GTPinKernel_id,
+	mem_pair_node *head
+);
+
+// remove the entry for GTPinKernel_id from the map
+void
+kernel_memory_map_delete1
+(
+	uint64_t GTPinKernel_id
+);
+
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
new file mode 100644
index 0000000000..9396c891fe
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -0,0 +1,399 @@
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <assert.h>
+#include <gtpin.h>
+#include <stdlib.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/safe-sampling.h>
+#include <hpcrun/cct/cct.h>
+#include <hpcrun/gpu/gpu-activity-process.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-application-thread-api.h>
+#include <hpcrun/gpu/gpu-correlation.h>
+#include <hpcrun/gpu/gpu-correlation-channel.h>
+#include <hpcrun/gpu/gpu-correlation-id-map.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+#include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-monitoring-thread-api.h>
+#include <hpcrun/utilities/hpcrun-nanotime.h>
+
+#include "opencl-instrumentation.h"
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+#define MAX_STR_SIZE 1024 // capacity of the kernel-name buffer in onKernelBuild
+// counter read and advanced by getCorrelationId; seeded in opencl_instrumentation_initialize
+static atomic_long correlation_id;
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+// append the boolean 'value' to the GTPin knob 'name'; lookup and update are asserted to succeed
+static void
+knobAddBool
+(
+	const char *name,
+	bool value
+)
+{
+  KnobValue setting;
+  setting.type = KNOB_TYPE_BOOL;
+  setting.value._bool = value;
+  GTPinKnob target = KNOB_FindArg(name);
+  assert(target != NULL);
+  KNOB_STATUS outcome = KNOB_AddValue(target, &setting);
+  assert(outcome == KNOB_STATUS_SUCCESS);
+}
+
+// return the current value of the shared counter, narrowed to 32 bits, and advance the counter by one
+static uint32_t
+getCorrelationId
+(
+  void
+)
+{
+  return (uint32_t) atomic_fetch_add(&correlation_id, 1);
+}
+
+// insert the kernel placeholder for 'correlation_id' and announce it on the correlation channel
+static void
+createKernelNode
+(
+	uint64_t correlation_id
+)
+{
+	gpu_op_ccts_t ccts;
+	gpu_op_placeholder_flags_t kernel_only = 0;
+	// node returned by the application thread correlation callback
+	cct_node_t *caller = gpu_application_thread_correlation_callback(correlation_id);
+	gpu_correlation_id_map_insert(correlation_id, correlation_id);
+
+	// request only the kernel placeholder; the insertion runs between safe_enter and safe_exit
+	gpu_op_placeholder_flags_set(&kernel_only, gpu_placeholder_type_kernel);
+	hpcrun_safe_enter();
+	gpu_op_ccts_insert(caller, &ccts, kernel_only);
+	hpcrun_safe_exit();
+	gpu_activity_channel_consume(gpu_metrics_attribute);
+	uint64_t submitted_at = hpcrun_nanotime();
+	gpu_correlation_channel_produce(correlation_id, &ccts, submitted_at);
+}
+
+// return the load module id of 'bin_filename', adding the file to the load map when no module of that name is found
+static uint32_t
+add_opencl_binary_to_loadmap
+(
+	char *bin_filename
+)
+{
+	uint32_t id;
+	// lookup and insertion both happen between loadmap_lock and loadmap_unlock
+	hpcrun_loadmap_lock();
+	load_module_t *known = hpcrun_loadmap_findByName(bin_filename);
+	if (known != NULL) {
+		id = known->id;
+	} else {
+		id = hpcrun_loadModule_add(bin_filename);
+	}
+	hpcrun_loadmap_unlock();
+	return id;
+}
+
+// write the original binary of 'kernel' to "<bin_name>_kernel.bin" and return that file's load module id; bin_name is extended in place
+static uint32_t
+save_opencl_binary
+(
+	GTPinKernel kernel,
+	char *bin_name
+)
+{
+	static const char suffix[] = "_kernel.bin";
+
+	// GTPin_GetKernelBinary(kernel, buffer_size, buf, binary_size): buffer_size is
+	// ignored when buf is NULL; binary_size, when not NULL, receives the size in
+	// bytes including terminating NULL (GTPin lists OnKernelComplete as availability)
+	uint32_t kernel_binary_size = 0;	// first call: query the size only
+	GTPINTOOL_STATUS status = GTPin_GetKernelBinary(kernel, 0, NULL, &kernel_binary_size);
+	assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+	// second call: copy the binary into a buffer of exactly that size
+	uint8_t *binary = malloc(kernel_binary_size);
+	assert(binary != NULL);
+	status = GTPin_GetKernelBinary(kernel, kernel_binary_size, (char *)binary, NULL);
+	assert(status == GTPINTOOL_STATUS_SUCCESS);	// checked before the buffer is written out
+
+	// the caller in this file passes a buffer of MAX_STR_SIZE bytes; the suffix must still fit
+	assert(strlen(bin_name) + sizeof(suffix) <= MAX_STR_SIZE);
+	strcat(bin_name, suffix);
+
+	// dump the binary to a file so that hpcprof can use it
+	FILE *bin_ptr = fopen(bin_name, "wb");
+	assert(bin_ptr != NULL);
+	if (bin_ptr != NULL) {
+		fwrite(binary, kernel_binary_size, 1, bin_ptr);
+		fclose(bin_ptr);	// flush and release the stream before the file is registered
+	}
+	free(binary);
+	return add_opencl_binary_to_loadmap(bin_name);
+}
+
+// thin wrapper: forwards to gpu_monitoring_thread_activities_ready()
+static void
+opencl_activity_notify
+(
+  void
+)
+{
+  gpu_monitoring_thread_activities_ready();
+}
+
+// fill 'ga' as a GPU_ACTIVITY_KERNEL_BLOCK record for one instrumented basic block
+static void
+opencl_kernel_block_activity_translate
+(
+	gpu_activity_t *ga,
+	uint32_t correlation_id,
+	uint32_t loadmap_module_id,
+	uint64_t offset,
+	uint64_t execution_count
+)
+{
+	memset(&ga->details.kernel_block, 0, sizeof(gpu_kernel_block_t));
+	ga->details.kernel_block.execution_count = execution_count;
+	ga->details.kernel_block.offset = offset;
+	ga->details.kernel_block.pc.lm_ip = (uintptr_t)offset;
+	ga->details.kernel_block.pc.lm_id = (uint16_t)loadmap_module_id;	// narrowed to 16 bits
+	ga->details.kernel_block.correlation_id = correlation_id;
+	ga->kind = GPU_ACTIVITY_KERNEL_BLOCK;
+}
+
+// translate the block data into 'ga', then pass the record to gpu_activity_process
+static void
+opencl_kernel_block_activity_process
+(
+	gpu_activity_t *ga,
+	uint32_t correlation_id,
+	uint32_t loadmap_module_id,
+	uint64_t offset,
+	uint64_t execution_count
+)
+{
+	opencl_kernel_block_activity_translate(ga, correlation_id, loadmap_module_id, offset, execution_count);
+	gpu_activity_process(ga);
+}
+
+// GTPin build callback: attach an execution counter to every basic block of
+// 'kernel', record the (offset, counter slot) pairs in the kernel memory map,
+// and store the kernel's data (correlation id, name, load module id)
+static void
+onKernelBuild
+(
+	GTPinKernel kernel,
+	void *v
+)
+{
+  GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+
+  // neither map may already know this kernel
+  assert(kernel_memory_map_lookup1((uint64_t)kernel) == 0);
+  assert(kernel_data_map_lookup1((uint64_t)kernel) == 0);
+
+  // zero all fields first so that members not assigned below are defined
+  KernelData data;
+  memset(&data, 0, sizeof(data));
+
+  uint32_t correlation_id = getCorrelationId();
+  data.kernel_cct_correlation_id = correlation_id;
+  createKernelNode(correlation_id);
+
+  // list of (offset, counter slot) pairs, one per basic block, in block order;
+  // 'tail' is the last node of the list built so far
+  mem_pair_node *h = NULL;
+  mem_pair_node *tail = NULL;
+
+  for (GTPinBBL block = GTPin_BBLHead(kernel); GTPin_BBLValid(block); block = GTPin_BBLNext(block)) {
+    GTPinINS head = GTPin_InsHead(block);
+    assert(GTPin_InsValid(head));
+
+    // GTPin_InsOffset (available in OnKernelBuild): offset of the instruction
+    // relative to the beginning of the original kernel's binary; -1 is
+    // returned in case of an error
+    int32_t offset = GTPin_InsOffset(head);
+
+    // claim the memory slot that will hold this block's counter
+    GTPinMem mem = NULL;
+    status = GTPin_MemClaim(kernel, sizeof(uint32_t), &mem);
+    assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+    // GTPin_OpcodeprofInstrument (available in OnKernelBuild): insert
+    // instrumentation (Opcodeprof) BEFORE instruction 'head' that counts the
+    // dynamic executions of the basic block (*countSlot++); the slot must
+    // have been allocated by GTPin_MemClaim() prior to this call
+    status = GTPin_OpcodeprofInstrument(head, mem);
+    assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+    mem_pair_node *m = malloc(sizeof(*m));
+    assert(m != NULL);
+    m->offset = offset;
+    m->mem = mem;
+    m->next = NULL;
+
+    // append m and move the tail forward; if the tail stayed on the first
+    // node, every later block would overwrite the first node's link and all
+    // blocks between the first and the last would be lost
+    if (h == NULL) {
+      h = m;
+    } else {
+      tail->next = m;
+    }
+    tail = m;
+  }
+
+  // a kernel without basic blocks gets no entry in the memory map
+  if (h != NULL) {
+    kernel_memory_map_insert1((uint64_t)kernel, h);
+  }
+
+  gpu_activity_channel_consume(gpu_metrics_attribute);
+
+  char kernel_name[MAX_STR_SIZE];
+  status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+  // the entry stored in the data map outlives this call, so its name must not
+  // point into the stack buffer above; keep a heap copy, taken before
+  // save_opencl_binary appends the file suffix to kernel_name
+  // NOTE(review): nothing in the visible code frees this copy or the list nodes
+  size_t name_size = strlen(kernel_name) + 1;
+  char *name_copy = malloc(name_size);
+  assert(name_copy != NULL);
+  memcpy(name_copy, kernel_name, name_size);
+
+  // add these details to cct_node. If that is not needed, the kernel_cct can be created in onKernelComplete
+  data.name = name_copy;
+  data.call_count = 0;
+  data.loadmap_module_id = save_opencl_binary(kernel, kernel_name);
+
+  kernel_data_map_insert1((uint64_t)kernel, data);
+  ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
+}
+
+// GTPin run callback: calls GTPin_KernelProfilingActive(kernelExec, 1) for the kernel execution
+static void
+onKernelRun
+(
+	GTPinKernelExec kernelExec,
+	void *v
+)
+{
+	GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+  GTPin_KernelProfilingActive(kernelExec, 1);
+  assert(status == GTPINTOOL_STATUS_SUCCESS); // NOTE(review): status is never reassigned, so this assert cannot fail
+}
+
+// GTPin completion callback: read the counter of every instrumented basic block of the
+// finished kernel, emit one kernel-block activity per block, and count the invocation
+static void
+onKernelComplete
+(
+	GTPinKernelExec kernelExec,
+	void *v
+)
+{
+  GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+  GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
+  ETMSG(OPENCL, "onKernelComplete starting. Lookup: key: %"PRIu64 "",(uint64_t)kernel);
+
+  // both entries are created in onKernelBuild; look each one up only once
+  kernel_data_map_t *kernel_data_list = kernel_data_map_lookup1((uint64_t)kernel);
+  assert(kernel_data_list != 0);
+  kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
+  assert(kernel_memory_list != 0);
+
+  // work on the stored entry itself: incrementing call_count in a local copy
+  // of the KernelData would be lost when this function returns
+  KernelData *data = &kernel_data_list->data;
+
+  // get kernel cct root node from correlation_id
+  uint32_t correlation_id = data->kernel_cct_correlation_id;
+
+  for (mem_pair_node *block = kernel_memory_list->head; block != NULL; block = block->next) {
+    // GTPin_MemSampleLength (available in all callbacks): sampling size for the
+    // mem handle; GTPin_MemRead is called once for every index below that size
+    uint32_t thread_count = GTPin_MemSampleLength(block->mem);
+    assert(thread_count > 0);
+
+    // sum in 64 bits: each value read is a 32-bit counter and the sum is
+    // reported as a uint64_t execution count
+    uint64_t total = 0;
+    uint32_t value = 0;
+    for (uint32_t tid = 0; tid < thread_count; ++tid) {
+      status = GTPin_MemRead(block->mem, tid, sizeof(uint32_t), (char*)(&value), NULL);
+      assert(status == GTPINTOOL_STATUS_SUCCESS);
+      total += value;
+    }
+
+    // TODO: how to make offset the primary key within the cct and += the
+    // execution value for existing ccts? (accumulating through a per-kernel
+    // block map keyed by block->offset was sketched here but is not in use)
+    uint64_t execution_count = total;
+
+    opencl_activity_notify();
+    gpu_activity_t gpu_activity;
+    opencl_kernel_block_activity_process(&gpu_activity, correlation_id, data->loadmap_module_id, block->offset, execution_count);
+  }
+
+  // count this invocation in the map entry
+  ++(data->call_count);
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// seed the counter that getCorrelationId reads and advances
+void
+opencl_instrumentation_initialize
+(
+  void
+)
+{
+  atomic_store(&correlation_id, 5000);	// to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 5000 (TODO:FIX)
+}
+
+// set up GTPin instrumentation: seed the correlation ids, set the "silent_warnings" knob, register the three kernel callbacks, start GTPin
+void enableProfiling
+(
+  void
+)
+{
+  ETMSG(OPENCL, "inside enableProfiling");
+	opencl_instrumentation_initialize();
+	knobAddBool("silent_warnings", true);
+
+	// Experimental GEN12 mode is not enabled here. The snippet that used to be
+	// commented out at this point (C++) set the knob "gen12_1" to true when the
+	// environment variable PTI_GEN12 was defined; a C equivalent would be
+	// if (getenv("PTI_GEN12") != NULL) knobAddBool("gen12_1", true);
+
+  GTPin_OnKernelBuild(onKernelBuild, NULL);
+  GTPin_OnKernelRun(onKernelRun, NULL);
+  GTPin_OnKernelComplete(onKernelComplete, NULL);
+
+  GTPIN_Start();
+}
+
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
new file mode 100644
index 0000000000..1204ac9f61
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
@@ -0,0 +1,18 @@
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "gtpin-instrumentation-kernel-memory-map.h"
+#include "gtpin-instrumentation-kernel-data-map.h"
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// set up GTPin instrumentation: registers the kernel build/run/complete callbacks and starts GTPin
+void enableProfiling
+(
+  void
+);
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 25b0827698..32ae780480 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -54,6 +54,7 @@
 // local includes
 //******************************************************************************
 
+#include <hpcrun/gpu/instrumentation/opencl-instrumentation.h>
 #include <hpcrun/messages/messages.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
@@ -341,6 +342,8 @@ opencl_intercept_setup
 {
   #ifndef HPCRUN_STATIC_LINK
   ETMSG(OPENCL, "setting up opencl intercepts");
+	gpu_metrics_KER_BLKINFO_enable();
+  enableProfiling();
   gotcha_wrap(opencl_bindings, 4, "opencl_bindings");
   opencl_intercept_initialize();
   #endif

From fcf00689c31d3c99a337a7683a03f74d0a79b379 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Fri, 10 Jul 2020 00:22:30 +0000
Subject: [PATCH 008/177] updated imports in hpcrun makefile and opencl files

---
 src/lib/prof-lean/hpcrun-gotcha.h             | 2 ++
 src/tool/hpcrun/Makefile.am                   | 4 +++-
 src/tool/hpcrun/Makefile.in                   | 4 ++--
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 2 +-
 src/tool/hpcrun/sample-sources/opencl.c       | 5 +++++
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/lib/prof-lean/hpcrun-gotcha.h b/src/lib/prof-lean/hpcrun-gotcha.h
index aa4bfdddeb..140f7ca7a7 100644
--- a/src/lib/prof-lean/hpcrun-gotcha.h
+++ b/src/lib/prof-lean/hpcrun-gotcha.h
@@ -52,6 +52,8 @@
 //******************************************************************************
 #include <gotcha/gotcha.h>
 
+
+
 //******************************************************************************
 // macros
 //******************************************************************************
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 7966b690a0..92bbdd7dfc 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -552,7 +552,9 @@ MY_INCLUDE_DIRS =			\
         $(OPT_CUDA_IFLAGS)             \
         $(OPT_CUPTI_IFLAGS)             \
 	-I$(LIBELF_INC)			\
-	-I$(LIBMONITOR_INC)
+	-I$(LIBMONITOR_INC)	\
+	$(GOTCHA_IFLAGS)	\
+	$(OPENCL_IFLAGS)
 
 MY_MIPS_INCLUDE_DIRS = \
 	-I$(srcdir)/unwind/mips
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 4a88fa5b96..7e23a9b0de 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -1881,8 +1881,8 @@ MY_INCLUDE_DIRS = \
         $(OPT_CUPTI_IFLAGS)             \
 	-I$(LIBELF_INC)			\
 	-I$(LIBMONITOR_INC)	\
-	$(OPENCL_IFLAGS)	\
-	$(GOTCHA_IFLAGS)
+	$(GOTCHA_IFLAGS)	\
+	$(OPENCL_IFLAGS)
 
 MY_MIPS_INCLUDE_DIRS = \
 	-I$(srcdir)/unwind/mips
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 32ae780480..8d6eeb1723 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -45,7 +45,6 @@
 // system includes
 //******************************************************************************
 
-#include <gotcha/gotcha.h>
 #include <inttypes.h>
 
 
@@ -55,6 +54,7 @@
 //******************************************************************************
 
 #include <hpcrun/gpu/instrumentation/opencl-instrumentation.h>
+#include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/messages/messages.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index f314f2b0db..dfe1f6c5c0 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -47,12 +47,17 @@
 
 #include "common.h"
 
+#include <monitor.h> 
+
 #include <hpcrun/device-finalizers.h>
+#include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/opencl/opencl-api.h>
 #include <hpcrun/thread_data.h>
 
 #include <messages/messages.h>
 
+#include <utilities/tokenize.h>
+
 
 
 //******************************************************************************

From f794745bc8ef6402c1e703fc3157b681c773e054 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Fri, 10 Jul 2020 00:22:30 +0000
Subject: [PATCH 009/177] updated imports in hpcrun makefile and opencl files

---
 src/lib/prof-lean/hpcrun-gotcha.h             | 2 ++
 src/tool/hpcrun/Makefile.am                   | 4 +++-
 src/tool/hpcrun/Makefile.in                   | 4 +++-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 2 +-
 src/tool/hpcrun/sample-sources/opencl.c       | 5 +++++
 5 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/lib/prof-lean/hpcrun-gotcha.h b/src/lib/prof-lean/hpcrun-gotcha.h
index aa4bfdddeb..140f7ca7a7 100644
--- a/src/lib/prof-lean/hpcrun-gotcha.h
+++ b/src/lib/prof-lean/hpcrun-gotcha.h
@@ -52,6 +52,8 @@
 //******************************************************************************
 #include <gotcha/gotcha.h>
 
+
+
 //******************************************************************************
 // macros
 //******************************************************************************
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 790d87883d..10e118a366 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -549,7 +549,9 @@ MY_INCLUDE_DIRS =			\
         $(OPT_CUDA_IFLAGS)             \
         $(OPT_CUPTI_IFLAGS)             \
 	-I$(LIBELF_INC)			\
-	-I$(LIBMONITOR_INC)
+	-I$(LIBMONITOR_INC)	\
+	$(GOTCHA_IFLAGS)	\
+	$(OPENCL_IFLAGS)
 
 MY_MIPS_INCLUDE_DIRS = \
 	-I$(srcdir)/unwind/mips
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 82c3d3dcab..860c67da77 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -1866,7 +1866,9 @@ MY_INCLUDE_DIRS = \
         $(OPT_CUDA_IFLAGS)             \
         $(OPT_CUPTI_IFLAGS)             \
 	-I$(LIBELF_INC)			\
-	-I$(LIBMONITOR_INC)
+	-I$(LIBMONITOR_INC)	\
+	$(GOTCHA_IFLAGS)	\
+	$(OPENCL_IFLAGS)
 
 MY_MIPS_INCLUDE_DIRS = \
 	-I$(srcdir)/unwind/mips
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 25b0827698..4a47870bdc 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -45,7 +45,6 @@
 // system includes
 //******************************************************************************
 
-#include <gotcha/gotcha.h>
 #include <inttypes.h>
 
 
@@ -54,6 +53,7 @@
 // local includes
 //******************************************************************************
 
+#include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/messages/messages.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index f314f2b0db..dfe1f6c5c0 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -47,12 +47,17 @@
 
 #include "common.h"
 
+#include <monitor.h> 
+
 #include <hpcrun/device-finalizers.h>
+#include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/opencl/opencl-api.h>
 #include <hpcrun/thread_data.h>
 
 #include <messages/messages.h>
 
+#include <utilities/tokenize.h>
+
 
 
 //******************************************************************************

From c2cc83178b2740e3554680df3629864c8122c9e0 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Fri, 10 Jul 2020 00:50:55 +0000
Subject: [PATCH 010/177] added static_link guard inside opencl-intercept.c

---
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 4a47870bdc..2d901f95e8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -69,11 +69,13 @@
 // local data
 //******************************************************************************
 
+#ifndef HPCRUN_STATIC_LINK
 static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
 static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
 static gotcha_wrappee_handle_t clEnqueueReadBuffer_handle;
 static gotcha_wrappee_handle_t clEnqueueWriteBuffer_handle;
 static atomic_long correlation_id;
+#endif
 
 
 

From b7760cef24b7a0ca5cfe71053bcc8a863c2d7b23 Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Tue, 14 Jul 2020 21:48:13 -0500
Subject: [PATCH 011/177] started MetricAPI XMIT-nvlink and able to collect
 papi metric from subscriber callback

---
 src/tool/hpcrun/gpu/gpu-activity.h            |   3 +-
 src/tool/hpcrun/gpu/gpu-metrics.c             |  59 +++++++++-
 src/tool/hpcrun/gpu/gpu-metrics.h             |  30 +++++
 .../gpu/nvidia/cupti-activity-translate.c     |   2 +-
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        |  10 ++
 src/tool/hpcrun/sample-sources/amd.c          |   4 +
 .../blame-shift/blame-sample-source.h         |   5 +
 src/tool/hpcrun/sample-sources/cuda.c         |   5 +
 src/tool/hpcrun/sample-sources/ga.c           |   6 +
 src/tool/hpcrun/sample-sources/generic.c      |   7 ++
 src/tool/hpcrun/sample-sources/gpu_blame.c    |   7 ++
 src/tool/hpcrun/sample-sources/idle.c         |   5 +
 src/tool/hpcrun/sample-sources/io.c           |   5 +
 src/tool/hpcrun/sample-sources/itimer.c       |   5 +
 src/tool/hpcrun/sample-sources/memleak.c      |   7 ++
 src/tool/hpcrun/sample-sources/mpi.c          |   6 +
 src/tool/hpcrun/sample-sources/none.c         |   6 +
 src/tool/hpcrun/sample-sources/nvidia.c       |  39 ++++---
 src/tool/hpcrun/sample-sources/papi-c.c       |  33 +++++-
 src/tool/hpcrun/sample-sources/papi.c         | 110 ++++++++++++++----
 .../hpcrun/sample-sources/perf/linux_perf.c   |   5 +
 .../hpcrun/sample-sources/pthread-blame.c     |   5 +
 src/tool/hpcrun/sample-sources/retcnt.c       |   6 +
 .../hpcrun/sample-sources/sample_source_obj.h |   1 +
 src/tool/hpcrun/sample-sources/ss_obj.h       |   1 +
 src/tool/hpcrun/sample-sources/sync.c         |   6 +
 src/tool/hpcrun/sample-sources/tst.c          |   6 +
 src/tool/hpcrun/sample-sources/upc.c          |   6 +
 28 files changed, 343 insertions(+), 47 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 0b672c5ae8..263f2c3e90 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -93,7 +93,8 @@ typedef enum {
   GPU_ACTIVITY_PC_SAMPLING_INFO        = 12, 
   GPU_ACTIVITY_EXTERNAL_CORRELATION    = 13,
   GPU_ACTIVITY_EVENT                   = 14,
-  GPU_ACTIVITY_FUNCTION                = 15
+  GPU_ACTIVITY_FUNCTION                = 15,
+	GPU_ACTIVITY_LINK
 } gpu_activity_kind_t;
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index a0ab7d6e2e..4b5fe6c613 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -84,7 +84,8 @@
   macro(GPU_INST, 9)				\
   macro(GTIMES, 10)				\
   macro(KINFO, 12)				\
-  macro(GSAMP, 13)			
+  macro(GSAMP, 13)			\
+  macro(GXFER, 14)
 
 
 #define FORALL_METRIC_KINDS(macro)	\
@@ -546,6 +547,41 @@ gpu_metrics_attribute_branch
 }
 
 
+static void
+gpu_metrics_attribute_link
+(
+ gpu_activity_t *activity
+)
+{
+	// work in progress: a GPU_ACTIVITY_LINK record is only announced on stdout; the attribution sketched below is disabled
+	printf("Attribute NVLINK\n\n");
+//	gpu_link_t *m = &(activity->details.memcpy);
+//	cct_node_t *cct_node = activity->cct_node;
+
+//	metric_data_list_t *metrics =
+//	hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_KINFO_STMEM_ACUMU));
+//
+//	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT),
+//																	 m->staticSharedMemory);
+//
+//	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_RCV),
+//																	 m->dynamicSharedMemory);
+//
+//	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT_TP),
+//																	 m->localMemoryTotal);
+//
+//	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XRCV_TP),
+//																	 m->activeWarpsPerSM);
+//
+//	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT_COUNT),
+//																	 m->activeWarpsPerSM);
+//
+//	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XRCV_COUNT),
+//																	 m->activeWarpsPerSM);
+	// NOTE(review): the sketch reads details.memcpy and fields such as staticSharedMemory, and names GPU_XFER_RCV where the enum defines GPU_XFER_XRCV
+
+}
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -601,6 +637,10 @@ gpu_metrics_attribute
     gpu_metrics_attribute_branch(activity);
     break;
 
+	case GPU_ACTIVITY_LINK:
+		gpu_metrics_attribute_link(activity);
+		break;
+
   default:
     break;
   }
@@ -837,3 +877,20 @@ gpu_metrics_GPU_INST_STALL_enable
 
   FINALIZE_METRIC_KIND();
 }
+
+// enable the GXFER metric kind: runs INITIALIZE_SCALAR_METRIC_INT over every FORALL_GXFER entry between INITIALIZE_METRIC_KIND and FINALIZE_METRIC_KIND
+void
+gpu_metrics_GXFER_enable
+(
+void
+)
+{
+#undef CURRENT_METRIC
+#define CURRENT_METRIC GXFER
+
+	INITIALIZE_METRIC_KIND();
+
+	FORALL_GXFER(INITIALIZE_SCALAR_METRIC_INT)
+
+	FINALIZE_METRIC_KIND();
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 0da2a7f34f..50459facd6 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -89,6 +89,15 @@ typedef enum {
 } gpu_lmem_ops_t;
 
 
+typedef enum {
+  GPU_XFER_XMIT                        = 0, // "GXFER:XMIT (B)"
+  GPU_XFER_XRCV                        = 1, // "GXFER:XRCV (B)"
+  GPU_XFER_XMIT_TP                     = 2, // "GXFER:XMIT_TP (GB)"
+  GPU_XFER_XRCV_TP                     = 3, // "GXFER:XRCV_TP (GB)"
+  GPU_XFER_XMIT_COUNT                  = 4, // "GXFER:XMIT_COUNT"
+  GPU_XFER_XRCV_COUNT                  = 5  // "GXFER:XRCV_COUNT"
+} gpu_xfer_ops_t;
+
 
 //--------------------------------------------------------------------------
 // indexed metrics
@@ -382,6 +391,22 @@ typedef enum {
   FORALL_GSAMP_REAL(macro)				
 
 
+// gpu link transfer metrics; each entry is macro(metric name, enum constant, description)
+#define FORALL_GXFER(macro)					\
+  macro("GXFER:XMIT (B)", GPU_XFER_XMIT,			\
+	"GPU link total data transmitted")			\
+  macro("GXFER:XRCV (B)", GPU_XFER_XRCV,			\
+	"GPU link total data received")				\
+  macro("GXFER:XMIT_TP (GB)", GPU_XFER_XMIT_TP,			\
+	"GPU link total transmit throughput")			\
+  macro("GXFER:XRCV_TP (GB)", GPU_XFER_XRCV_TP,			\
+	"GPU link total received throughput")			\
+  macro("GXFER:XMIT_COUNT", GPU_XFER_XMIT_COUNT,		\
+	"GPU link launch count transmitted")			\
+  macro("GXFER:XRCV_COUNT", GPU_XFER_XRCV_COUNT,		\
+	"GPU link launch count received")
+
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -446,6 +471,11 @@ gpu_metrics_GSAMP_enable
  void
 );
 
+void
+gpu_metrics_GXFER_enable
+(
+ void
+);
 
 //--------------------------------------------------
 // record global memory access statistics
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c b/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c
index b4691bdd17..f23e318f6d 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c
@@ -579,7 +579,7 @@ cupti_activity_translate
 
   case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO:
     convert_pcsampling_record_info
-      (ga, (CUpti_ActivityPCSamplingRecordInfo *)activity);
+				(ga, (CUpti_ActivityPCSamplingRecordInfo *)activity);
     break;
 
   case CUPTI_ACTIVITY_KIND_MEMCPY2:
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index dfb2272b42..da38043d57 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -105,6 +105,8 @@
 
 #include "tool_state.h"
 
+
+#include "sample_sources_all.h"
 //******************************************************************************
 // macros
 //******************************************************************************
@@ -952,6 +954,10 @@ cupti_subscriber_callback
         uint64_t correlation_id = gpu_correlation_id();
         cupti_correlation_id_push(correlation_id);
 
+        printf("Driver API: enter -----------------\n");
+				hpcrun_all_sources_stop();
+				hpcrun_all_sources_start();
+
         cct_node_t *api_node = cupti_correlation_callback(correlation_id);
 
         gpu_op_ccts_t gpu_op_ccts;
@@ -1098,6 +1104,10 @@ cupti_subscriber_callback
         cupti_runtime_api_flag_set();
         uint64_t correlation_id = gpu_correlation_id();
         cupti_correlation_id_push(correlation_id);
+
+				printf("Runtime API: enter -----------------\n");
+				hpcrun_all_sources_stop();
+				hpcrun_all_sources_start();
         // We should make notification records in the api enter callback.
         // A runtime API must be implemented by driver APIs.
         // Though unlikely in most cases,
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index 17500e3219..9823a531cb 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -193,6 +193,10 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{ // empty implementation of the print_counters method: 'values' is not used
+}
 
 //**************************************************************************
 // object
diff --git a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
index 9ee674392e..48a7f1dcf1 100644
--- a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
+++ b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
@@ -192,6 +192,11 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{ // empty implementation of the print_counters method: 'values' is not used
+}
+
 
 /*--------------------------------------------------------------------------
  | sample source object
diff --git a/src/tool/hpcrun/sample-sources/cuda.c b/src/tool/hpcrun/sample-sources/cuda.c
index 569c708235..fdaaaf8953 100644
--- a/src/tool/hpcrun/sample-sources/cuda.c
+++ b/src/tool/hpcrun/sample-sources/cuda.c
@@ -422,6 +422,11 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/ga.c b/src/tool/hpcrun/sample-sources/ga.c
index a8490d6d41..b7bfb8e9fe 100644
--- a/src/tool/hpcrun/sample-sources/ga.c
+++ b/src/tool/hpcrun/sample-sources/ga.c
@@ -237,6 +237,12 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
+
 //***************************************************************************
 // object
 //***************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/generic.c b/src/tool/hpcrun/sample-sources/generic.c
index a265d87d99..d9173c0335 100644
--- a/src/tool/hpcrun/sample-sources/generic.c
+++ b/src/tool/hpcrun/sample-sources/generic.c
@@ -446,6 +446,13 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
+
 //***************************************************************************
 // object
 //***************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/gpu_blame.c b/src/tool/hpcrun/sample-sources/gpu_blame.c
index 9030df17db..be19d2bb9c 100644
--- a/src/tool/hpcrun/sample-sources/gpu_blame.c
+++ b/src/tool/hpcrun/sample-sources/gpu_blame.c
@@ -257,6 +257,13 @@ static void METHOD_FN(display_events)
     printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/idle.c b/src/tool/hpcrun/sample-sources/idle.c
index 78287da975..c12f8dd9f8 100644
--- a/src/tool/hpcrun/sample-sources/idle.c
+++ b/src/tool/hpcrun/sample-sources/idle.c
@@ -231,6 +231,11 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 
 /***************************************************************************
  * object
diff --git a/src/tool/hpcrun/sample-sources/io.c b/src/tool/hpcrun/sample-sources/io.c
index 7c91344211..ccd6b9c854 100644
--- a/src/tool/hpcrun/sample-sources/io.c
+++ b/src/tool/hpcrun/sample-sources/io.c
@@ -184,6 +184,11 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/itimer.c b/src/tool/hpcrun/sample-sources/itimer.c
index 3a2eb67e4b..c887b76a46 100644
--- a/src/tool/hpcrun/sample-sources/itimer.c
+++ b/src/tool/hpcrun/sample-sources/itimer.c
@@ -662,6 +662,11 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/memleak.c b/src/tool/hpcrun/sample-sources/memleak.c
index 343a5e74fc..63ca5036d1 100644
--- a/src/tool/hpcrun/sample-sources/memleak.c
+++ b/src/tool/hpcrun/sample-sources/memleak.c
@@ -204,6 +204,13 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/mpi.c b/src/tool/hpcrun/sample-sources/mpi.c
index 92aaaa1654..d4e2ba5494 100644
--- a/src/tool/hpcrun/sample-sources/mpi.c
+++ b/src/tool/hpcrun/sample-sources/mpi.c
@@ -186,6 +186,12 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/none.c b/src/tool/hpcrun/sample-sources/none.c
index ae0f84b17d..151a8344d2 100644
--- a/src/tool/hpcrun/sample-sources/none.c
+++ b/src/tool/hpcrun/sample-sources/none.c
@@ -187,6 +187,12 @@ METHOD_FN(display_events)
 {
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c
index a79e94d58f..c33dd97e9a 100644
--- a/src/tool/hpcrun/sample-sources/nvidia.c
+++ b/src/tool/hpcrun/sample-sources/nvidia.c
@@ -98,7 +98,7 @@
 
 #define NVIDIA_CUDA "gpu=nvidia"
 #define NVIDIA_CUDA_PC_SAMPLING "gpu=nvidia,pc"
-
+#define NVIDIA_CUDA_NV_LINK "nvlink"
 
 
 /******************************************************************************
@@ -311,7 +311,8 @@ static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
 #ifndef HPCRUN_STATIC_LINK
-  return hpcrun_ev_is(ev_str, NVIDIA_CUDA) || hpcrun_ev_is(ev_str, NVIDIA_CUDA_PC_SAMPLING);
+  return hpcrun_ev_is(ev_str, NVIDIA_CUDA) || hpcrun_ev_is(ev_str, NVIDIA_CUDA_PC_SAMPLING)
+																							|| hpcrun_ev_is(ev_str, NVIDIA_CUDA_NV_LINK);
 #else
   return false;
 #endif
@@ -330,25 +331,30 @@ METHOD_FN(process_event_list, int lush_metrics)
   char* event = start_tok(evlist);
   long int frequency = 0;
   int frequency_default = -1;
+
   hpcrun_extract_ev_thresh(event, sizeof(nvidia_name), nvidia_name,
     &frequency, frequency_default);
 
-  if (hpcrun_ev_is(nvidia_name, NVIDIA_CUDA)) {
-    trace_frequency =
-      (frequency == frequency_default) ? trace_frequency_default : frequency;
-    gpu_monitoring_trace_sample_frequency_set(trace_frequency);
-  } else if (hpcrun_ev_is(nvidia_name, NVIDIA_CUDA_PC_SAMPLING)) {
-    pc_sampling_frequency = (frequency == frequency_default) ?
-      pc_sampling_frequency_default : frequency;
+	for (; event != NULL; event = next_tok()) {
+		if (hpcrun_ev_is(event, NVIDIA_CUDA)) {
+			trace_frequency =
+			(frequency == frequency_default) ? trace_frequency_default : frequency;
+			gpu_monitoring_trace_sample_frequency_set(trace_frequency);
+		} else if (hpcrun_ev_is(event, NVIDIA_CUDA_PC_SAMPLING)) {
+			pc_sampling_frequency = (frequency == frequency_default) ?
+															pc_sampling_frequency_default : frequency;
 
-    gpu_monitoring_instruction_sample_frequency_set(pc_sampling_frequency);
+			gpu_monitoring_instruction_sample_frequency_set(pc_sampling_frequency);
 
-    gpu_metrics_GPU_INST_enable(); // instruction counts
+			gpu_metrics_GPU_INST_enable(); // instruction counts
 
-    gpu_metrics_GPU_INST_STALL_enable(); // stall metrics
+			gpu_metrics_GPU_INST_STALL_enable(); // stall metrics
 
-    gpu_metrics_GSAMP_enable(); // GPU utilization from sampling
-  }
+			gpu_metrics_GSAMP_enable(); // GPU utilization from sampling
+		}else if (hpcrun_ev_is(event, NVIDIA_CUDA_NV_LINK)) {
+			gpu_metrics_GXFER_enable();
+		}
+	}
 
   gpu_metrics_default_enable();
   gpu_metrics_KINFO_enable();
@@ -451,7 +457,10 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
-
+static void
+METHOD_FN(print_counters, const long long *values)
+{ // intentionally empty; parameter type matches the print_counters slot in sample_source_t
+}
 
 //******************************************************************************
 // object
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index c4d10a3c9b..456850f2c8 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -100,13 +100,14 @@
 
 #include "tool_state.h"
 
+
 /******************************************************************************
  * macros
  *****************************************************************************/
 
 #define OVERFLOW_MODE 0
 #define WEIGHT_METRIC 0
-#define DEFAULT_THRESHOLD  2000000L
+#define DEFAULT_THRESHOLD  1 //2000000L
 
 #include "papi-c.h"
 
@@ -389,6 +390,7 @@ METHOD_FN(thread_fini_action)
 	tool_exit();
 }
 
+
 static void
 METHOD_FN(stop)
 {
@@ -423,12 +425,18 @@ METHOD_FN(stop)
       else {
 	TMSG(PAPI,"stop w event set = %d", ci->eventSet);
 	long_long values[nevents+2];
+//for (int i = 0; i < nevents+2; ++i) {
+//	values[i] = 0;
+//	printf("values_prev[%d] = %llu\n", i, values[i]);
+//}
 	//	long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
 	int ret = PAPI_stop(ci->eventSet, values);
 	if (ret != PAPI_OK){
 	  EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
 	       ci->eventSet, ret, PAPI_strerror(ret));
 	}
+
+	METHOD_CALL(self, print_counters, values);
       }
     }
   }
@@ -438,6 +446,7 @@ METHOD_FN(stop)
 	tool_exit();
 }
 
+
 static void
 METHOD_FN(shutdown)
 {
@@ -445,7 +454,9 @@ METHOD_FN(shutdown)
   TMSG(PAPI, "shutdown");
   if (papi_unavail) { goto finish; }
 
-  METHOD_CALL(self, stop); // make sure stop has been called
+  do{
+		METHOD_CALL(self, stop); // make sure stop has been called
+	}while(0);
   // FIXME: add component shutdown code here
   PAPI_shutdown();
 
@@ -624,7 +635,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
 
   int num_components = PAPI_num_components();
   int ss_info_size = sizeof(papi_source_info_t) + 
-    num_components * sizeof(papi_component_info_t);
+	num_components * sizeof(papi_component_info_t);
 
   TMSG(PAPI, "Num components = %d", num_components);
   papi_source_info_t* psi = hpcrun_malloc(ss_info_size);
@@ -796,11 +807,25 @@ METHOD_FN(display_events)
 
   printf( "Total events reported: %d\n", num_total);
   printf("\n\n");
-
 finish:
 	tool_exit();
 }
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+	char* evlist = METHOD_CALL(self, get_event_str);
+	char *event;
+	int evcode;
+	int i;
+
+	for (i = 0, event = start_tok(evlist); more_tok(); i++, event = next_tok()) {
+		PAPI_event_name_to_code(event, &evcode);
+
+		printf("event %s \t-> event code = %x, value = %llu\n", event, evcode, values[i]);
+	}
+
+}
 
 /***************************************************************************
  * object
diff --git a/src/tool/hpcrun/sample-sources/papi.c b/src/tool/hpcrun/sample-sources/papi.c
index 0ff8238ea3..729a885f19 100644
--- a/src/tool/hpcrun/sample-sources/papi.c
+++ b/src/tool/hpcrun/sample-sources/papi.c
@@ -96,6 +96,8 @@
 #include <lush/lush-backtrace.h>
 #include <lib/prof-lean/hpcrun-fmt.h>
 
+#include "tool_state.h"
+
 
 /******************************************************************************
  * macros
@@ -165,6 +167,7 @@ strip_papi_prefix(const char *str)
 static void
 METHOD_FN(init)
 {
+	tool_enter();
   PAPI_set_debug(0x3ff);
 
   // **NOTE: some papi components may start threads, so
@@ -201,13 +204,15 @@ METHOD_FN(init)
   }
 
   self->state = INIT;
+	tool_exit();
 }
 
 static void
 METHOD_FN(thread_init)
 {
+	tool_enter();
   TMSG(PAPI, "thread init");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int retval = PAPI_thread_init(pthread_self);
   if (retval != PAPI_OK) {
@@ -215,13 +220,16 @@ METHOD_FN(thread_init)
     monitor_real_abort();
   }
   TMSG(PAPI, "thread init OK");
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(thread_init_action)
 {
+	tool_enter();
   TMSG(PAPI, "register thread");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int retval = PAPI_register_thread();
   if (retval != PAPI_OK) {
@@ -229,13 +237,16 @@ METHOD_FN(thread_init_action)
     monitor_real_abort();
   }
   TMSG(PAPI, "register thread ok");
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(start)
 {
+	tool_enter();
   TMSG(PAPI, "start");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   thread_data_t *td = hpcrun_get_thread_data();
   papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr;
@@ -247,7 +258,7 @@ METHOD_FN(start)
   // state PAPI is in.
 
   if (my_state == START) {
-    return;
+		goto finish;
   }
 
   TMSG(PAPI,"starting PAPI w event set %d",eventSet);
@@ -269,25 +280,33 @@ METHOD_FN(start)
   }
 
   TD_GET(ss_state)[self->sel_idx] = START;
+
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(thread_fini_action)
 {
-  TMSG(PAPI, "unregister thread");
-  if (papi_unavail) { return; }
+	tool_enter();
+	TMSG(PAPI, "unregister thread");
+  if (papi_unavail) { goto finish; }
 
   int retval = PAPI_unregister_thread();
   char msg[] = "!!NOT PAPI_OK!! (code = -9999999)\n";
   snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval);
   TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg);
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(stop)
 {
-  TMSG(PAPI, "stop");
-  if (papi_unavail) { return; }
+	tool_enter();
+
+	TMSG(PAPI, "stop");
+  if (papi_unavail) { goto finish; }
 
   thread_data_t *td = hpcrun_get_thread_data();
   papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr;
@@ -297,12 +316,12 @@ METHOD_FN(stop)
 
   if (my_state == STOP) {
     TMSG(PAPI,"--stop called on an already stopped event set %d",eventSet);
-    return;
+		goto finish;
   }
 
   if (my_state != START) {
     TMSG(PAPI,"*WARNING* Stop called on event set that has not been started");
-    return;
+		goto finish;
   }
 
   TMSG(PAPI,"stop w event set = %d",eventSet);
@@ -314,18 +333,23 @@ METHOD_FN(stop)
   }
 
   TD_GET(ss_state)[self->sel_idx] = STOP;
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(shutdown)
 {
-  TMSG(PAPI, "shutdown");
-  if (papi_unavail) { return; }
+	tool_enter();
+	TMSG(PAPI, "shutdown");
+  if (papi_unavail) { goto finish; }
 
   METHOD_CALL(self, stop); // make sure stop has been called
   PAPI_shutdown();
 
   self->state = UNINIT;
+finish:
+	tool_exit();
 }
 
 // Return true if PAPI recognizes the name, whether supported or not.
@@ -333,10 +357,12 @@ METHOD_FN(shutdown)
 static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
+	tool_enter();
+	bool ret;
   ev_str = strip_papi_prefix(ev_str);
 
   TMSG(PAPI, "supports event");
-  if (papi_unavail) { return false; }
+  if (papi_unavail) { ret = false; goto finish; }
 
   if (self->state == UNINIT){
     METHOD_CALL(self, init);
@@ -347,14 +373,19 @@ METHOD_FN(supports_event, const char *ev_str)
   long th;
 
   hpcrun_extract_ev_thresh(ev_str, sizeof(evtmp), evtmp, &th, DEFAULT_THRESHOLD);
-  return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK;
+  ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK);
+
+finish:
+	tool_exit();
+	return ret;
 }
  
 static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
+	tool_enter();
   TMSG(PAPI, "process event list");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   char *event;
   int i, ret;
@@ -456,17 +487,20 @@ METHOD_FN(process_event_list, int lush_metrics)
   if (! some_overflow) {
     hpcrun_ssfail_all_derived("PAPI");
   }
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(gen_event_set,int lush_metrics)
 {
-  int i;
+	tool_enter();
+	int i;
   int ret;
   int eventSet;
 
   TMSG(PAPI, "gen event set");
-  if (papi_unavail) { return; }
+  if (papi_unavail) { goto finish; }
 
   int ss_info_size = sizeof(papi_source_info_t);
   papi_source_info_t *psi = hpcrun_malloc(ss_info_size);
@@ -520,11 +554,14 @@ METHOD_FN(gen_event_set,int lush_metrics)
     }
   }
   psi->eventSet= eventSet;
+finish:
+	tool_exit();
 }
 
 static void
 METHOD_FN(display_events)
 {
+	tool_enter();
   PAPI_event_info_t info;
   char name[200], *prof;
   int ev, ret, num_total, num_prof;
@@ -538,7 +575,7 @@ METHOD_FN(display_events)
   if (papi_unavail) {
     printf("PAPI is not available.  Probably, the kernel doesn't support PAPI,\n"
 	   "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
-    return;
+		goto finish;
   }
 
   num_total = 0;
@@ -592,8 +629,17 @@ METHOD_FN(display_events)
   }
   printf("Total native events: %d\n", num_total);
   printf("\n");
+finish:
+	tool_exit();
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
+
 /***************************************************************************
  * object
  ***************************************************************************/
@@ -626,26 +672,35 @@ hpcrun_disable_papi_cuda(void)
 static int
 event_is_derived(int ev_code)
 {
-  PAPI_event_info_t info;
+	tool_enter();
+	int ret;
+	PAPI_event_info_t info;
 
   // "Is derived" is kind of a bad thing, so if any unexpected failure
   // occurs, we'll return the "bad" answer.
   if (PAPI_get_event_info(ev_code, &info) != PAPI_OK
       || info.derived == NULL) {
-    return 1;
+		ret = 1;
+		goto finish;
   }
   if (info.count == 1
       || strlen(info.derived) == 0
       || strcmp(info.derived, "NOT_DERIVED") == 0
       || strcmp(info.derived, "DERIVED_CMPD") == 0) {
-    return 0;
+		ret = 0;
+		goto finish;
   }
-  return 1;
+	ret = 1;
+
+finish:
+	tool_exit();
+	return ret;
 }
 
 static void
 event_fatal_error(int ev_code, int papi_ret)
 {
+	tool_enter();
   char name[1024];
 
   PAPI_event_code_to_name(ev_code, name);
@@ -659,12 +714,15 @@ event_fatal_error(int ev_code, int papi_ret)
     hpcrun_ssfail_conflict("PAPI", name);
   }
   hpcrun_ssfail_unsupported("PAPI", name);
+
+  tool_exit();
 }
 
 static void
 papi_event_handler(int event_set, void *pc, long long ovec,
                    void *context)
 {
+	tool_enter();
   sample_source_t *self = &_papi_obj;
   long long values[MAX_EVENTS];
   int my_events[MAX_EVENTS];
@@ -673,14 +731,14 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   int i, ret;
 
   // if sampling disabled explicitly for this thread, skip all processing
-  if (hpcrun_suppress_sample()) return;
+  if (hpcrun_suppress_sample()) goto finish;
 
 
   // If the interrupt came from inside our code, then drop the sample
   // and return and avoid any MSG.
   if (! hpcrun_safe_enter_async(pc)) {
     hpcrun_stats_num_samples_blocked_async_inc();
-    return;
+		goto finish;
   }
 
   TMSG(PAPI_SAMPLE,"papi event happened, ovec = %ld",ovec);
@@ -734,5 +792,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
     }
   }
 
-  hpcrun_safe_exit();
+  hpcrun_safe_exit(); // pairs with the successful hpcrun_safe_enter_async(); early exits jump past it
+finish:
+	tool_exit();
 }
diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c
index ae6e11be97..67aa4e8c2f 100644
--- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c
+++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c
@@ -982,6 +982,11 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 // --------------------------------------------------------------------------
 // read a counter from the file descriptor,
 //  and returns the value of the counter
diff --git a/src/tool/hpcrun/sample-sources/pthread-blame.c b/src/tool/hpcrun/sample-sources/pthread-blame.c
index 0c7491915a..1187d4d3b8 100644
--- a/src/tool/hpcrun/sample-sources/pthread-blame.c
+++ b/src/tool/hpcrun/sample-sources/pthread-blame.c
@@ -393,6 +393,11 @@ METHOD_FN(display_events)
 }
 
 
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /*--------------------------------------------------------------------------
  | sample source object
  --------------------------------------------------------------------------*/
diff --git a/src/tool/hpcrun/sample-sources/retcnt.c b/src/tool/hpcrun/sample-sources/retcnt.c
index 2745a65f7f..5aaa90972e 100644
--- a/src/tool/hpcrun/sample-sources/retcnt.c
+++ b/src/tool/hpcrun/sample-sources/retcnt.c
@@ -208,6 +208,12 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 #define ss_name retcnt
 #define ss_cls SS_SOFTWARE
 #define ss_sort_order 100
diff --git a/src/tool/hpcrun/sample-sources/sample_source_obj.h b/src/tool/hpcrun/sample-sources/sample_source_obj.h
index d74c9db766..c497898d80 100644
--- a/src/tool/hpcrun/sample-sources/sample_source_obj.h
+++ b/src/tool/hpcrun/sample-sources/sample_source_obj.h
@@ -106,6 +106,7 @@ typedef struct sample_source_t {
   VMETHOD_DEF(finalize_event_list);
   VMETHOD_DEF(gen_event_set, int lush_agents);
   VMETHOD_DEF(display_events);
+	VMETHOD_DEF(print_counters, const long long *values);
 
   // data
   evlist_t       	  evl;       	 // event list
diff --git a/src/tool/hpcrun/sample-sources/ss_obj.h b/src/tool/hpcrun/sample-sources/ss_obj.h
index 137e49afb7..12d8a7c0db 100644
--- a/src/tool/hpcrun/sample-sources/ss_obj.h
+++ b/src/tool/hpcrun/sample-sources/ss_obj.h
@@ -97,6 +97,7 @@ sample_source_t obj_name() = {
   .finalize_event_list = finalize_event_list,
   .gen_event_set = gen_event_set,
   .display_events = display_events,
+	.print_counters = print_counters,
 
   // data
   .evl = {
diff --git a/src/tool/hpcrun/sample-sources/sync.c b/src/tool/hpcrun/sample-sources/sync.c
index 3e8d65bc01..44c3c7fe76 100644
--- a/src/tool/hpcrun/sample-sources/sync.c
+++ b/src/tool/hpcrun/sample-sources/sync.c
@@ -199,6 +199,12 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/tst.c b/src/tool/hpcrun/sample-sources/tst.c
index 46926fbf55..b6f5438305 100644
--- a/src/tool/hpcrun/sample-sources/tst.c
+++ b/src/tool/hpcrun/sample-sources/tst.c
@@ -366,6 +366,12 @@ METHOD_FN(display_events)
 #endif
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/upc.c b/src/tool/hpcrun/sample-sources/upc.c
index 2a33c138a5..5fcac6d541 100644
--- a/src/tool/hpcrun/sample-sources/upc.c
+++ b/src/tool/hpcrun/sample-sources/upc.c
@@ -447,6 +447,12 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(print_counters, const long long *values)
+{
+}
+
 #define ss_name upc
 #define ss_cls SS_HARDWARE
 

From 3507328708acb309dccfca562e533c3c4cdc07e5 Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Mon, 20 Jul 2020 15:09:04 -0500
Subject: [PATCH 012/177] added support for gpu-monitors register

---
 src/tool/hpcrun/Makefile.am             |   1 +
 src/tool/hpcrun/Makefile.in             |  97 +++++++-----
 src/tool/hpcrun/gpu-monitors.c          |  34 +++++
 src/tool/hpcrun/gpu-monitors.h          |  42 ++++++
 src/tool/hpcrun/gpu/amd/roctracer-api.c |  12 +-
 src/tool/hpcrun/gpu/nvidia/cupti-api.c  |  40 +++--
 src/tool/hpcrun/sample-sources/papi-c.c | 190 ++++++++++++++++++++----
 src/tool/hpcrun/sample-sources/papi-c.h |   1 -
 8 files changed, 336 insertions(+), 81 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu-monitors.c
 create mode 100644 src/tool/hpcrun/gpu-monitors.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index dd41b52788..bd0a0be241 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -319,6 +319,7 @@ MY_BASE_FILES =				\
 	control-knob.c   \
 	control-knob.h   \
 	device-finalizers.c \
+	gpu-monitors.c		\
 	device-initializers.c \
 	module-ignore-map.c \
 	threadmgr.c			\
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 1e358b3164..32a37ea9c3 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -443,18 +443,18 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/sample-filters.c segv_handler.c start-stop.c \
 	term_handler.c thread_data.c thread_use.c thread_finalize.c \
 	control-knob.c control-knob.h device-finalizers.c \
-	device-initializers.c module-ignore-map.c threadmgr.c trace.c \
-	weak.c write_data.c cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \
-	cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \
-	lush/lush-backtrace.c lush/lush.h lush/lush.c \
-	lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \
-	lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \
-	lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \
-	memory/mem.c memory/mmap.c messages/debug-flag.c \
-	messages/messages-sync.c messages/messages-async.c \
-	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
-	gpu/gpu-application-thread-api.c \
+	gpu-monitors.c device-initializers.c module-ignore-map.c \
+	threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \
+	cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \
+	lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \
+	lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \
+	lush/lush-pthread.c lush/lush-support-rt.h \
+	lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \
+	lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \
+	memory/mmap.c messages/debug-flag.c messages/messages-sync.c \
+	messages/messages-async.c messages/fmt.c hpcrun-placeholders.c \
+	gpu/gpu-activity.c gpu/gpu-activity-channel.c \
+	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -582,6 +582,7 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-term_handler.lo libhpcrun_la-thread_data.lo \
 	libhpcrun_la-thread_use.lo libhpcrun_la-thread_finalize.lo \
 	libhpcrun_la-control-knob.lo libhpcrun_la-device-finalizers.lo \
+	libhpcrun_la-gpu-monitors.lo \
 	libhpcrun_la-device-initializers.lo \
 	libhpcrun_la-module-ignore-map.lo libhpcrun_la-threadmgr.lo \
 	libhpcrun_la-trace.lo libhpcrun_la-weak.lo \
@@ -820,18 +821,18 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/sample-filters.c segv_handler.c start-stop.c \
 	term_handler.c thread_data.c thread_use.c thread_finalize.c \
 	control-knob.c control-knob.h device-finalizers.c \
-	device-initializers.c module-ignore-map.c threadmgr.c trace.c \
-	weak.c write_data.c cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \
-	cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \
-	lush/lush-backtrace.c lush/lush.h lush/lush.c \
-	lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \
-	lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \
-	lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \
-	memory/mem.c memory/mmap.c messages/debug-flag.c \
-	messages/messages-sync.c messages/messages-async.c \
-	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
-	gpu/gpu-application-thread-api.c \
+	gpu-monitors.c device-initializers.c module-ignore-map.c \
+	threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \
+	cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \
+	lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \
+	lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \
+	lush/lush-pthread.c lush/lush-support-rt.h \
+	lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \
+	lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \
+	memory/mmap.c messages/debug-flag.c messages/messages-sync.c \
+	messages/messages-async.c messages/fmt.c hpcrun-placeholders.c \
+	gpu/gpu-activity.c gpu/gpu-activity-channel.c \
+	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -964,6 +965,7 @@ am__objects_51 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-thread_finalize.$(OBJEXT) \
 	libhpcrun_o-control-knob.$(OBJEXT) \
 	libhpcrun_o-device-finalizers.$(OBJEXT) \
+	libhpcrun_o-gpu-monitors.$(OBJEXT) \
 	libhpcrun_o-device-initializers.$(OBJEXT) \
 	libhpcrun_o-module-ignore-map.$(OBJEXT) \
 	libhpcrun_o-threadmgr.$(OBJEXT) libhpcrun_o-trace.$(OBJEXT) \
@@ -1702,18 +1704,18 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	sample-sources/sample-filters.c segv_handler.c start-stop.c \
 	term_handler.c thread_data.c thread_use.c thread_finalize.c \
 	control-knob.c control-knob.h device-finalizers.c \
-	device-initializers.c module-ignore-map.c threadmgr.c trace.c \
-	weak.c write_data.c cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \
-	cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \
-	lush/lush-backtrace.c lush/lush.h lush/lush.c \
-	lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \
-	lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \
-	lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \
-	memory/mem.c memory/mmap.c messages/debug-flag.c \
-	messages/messages-sync.c messages/messages-async.c \
-	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
-	gpu/gpu-application-thread-api.c \
+	gpu-monitors.c device-initializers.c module-ignore-map.c \
+	threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \
+	cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \
+	lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \
+	lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \
+	lush/lush-pthread.c lush/lush-support-rt.h \
+	lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \
+	lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \
+	memory/mmap.c messages/debug-flag.c messages/messages-sync.c \
+	messages/messages-async.c messages/fmt.c hpcrun-placeholders.c \
+	gpu/gpu-activity.c gpu/gpu-activity-channel.c \
+	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -3402,6 +3404,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-env.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-epoch.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-files.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-gpu-monitors.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-handling_sample.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-hpcrun-initializers.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-hpcrun-placeholders.Plo@am__quote@
@@ -3442,6 +3445,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-env.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-epoch.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-files.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-gpu-monitors.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-handling_sample.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-hpcrun-initializers.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-hpcrun-placeholders.Po@am__quote@
@@ -4315,6 +4319,13 @@ libhpcrun_la-device-finalizers.lo: device-finalizers.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-device-finalizers.lo `test -f 'device-finalizers.c' || echo '$(srcdir)/'`device-finalizers.c
 
+libhpcrun_la-gpu-monitors.lo: gpu-monitors.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-gpu-monitors.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-gpu-monitors.Tpo -c -o libhpcrun_la-gpu-monitors.lo `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_la-gpu-monitors.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_la-gpu-monitors.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-gpu-monitors.lo `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+
 libhpcrun_la-device-initializers.lo: device-initializers.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-device-initializers.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-device-initializers.Tpo -c -o libhpcrun_la-device-initializers.lo `test -f 'device-initializers.c' || echo '$(srcdir)/'`device-initializers.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-device-initializers.Tpo $(DEPDIR)/libhpcrun_la-device-initializers.Plo
@@ -6023,6 +6034,20 @@ libhpcrun_o-device-finalizers.obj: device-finalizers.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-device-finalizers.obj `if test -f 'device-finalizers.c'; then $(CYGPATH_W) 'device-finalizers.c'; else $(CYGPATH_W) '$(srcdir)/device-finalizers.c'; fi`
 
+libhpcrun_o-gpu-monitors.o: gpu-monitors.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-gpu-monitors.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo -c -o libhpcrun_o-gpu-monitors.o `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_o-gpu-monitors.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_o-gpu-monitors.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-gpu-monitors.o `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+
+libhpcrun_o-gpu-monitors.obj: gpu-monitors.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-gpu-monitors.obj -MD -MP -MF $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo -c -o libhpcrun_o-gpu-monitors.obj `if test -f 'gpu-monitors.c'; then $(CYGPATH_W) 'gpu-monitors.c'; else $(CYGPATH_W) '$(srcdir)/gpu-monitors.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_o-gpu-monitors.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_o-gpu-monitors.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-gpu-monitors.obj `if test -f 'gpu-monitors.c'; then $(CYGPATH_W) 'gpu-monitors.c'; else $(CYGPATH_W) '$(srcdir)/gpu-monitors.c'; fi`
+
 libhpcrun_o-device-initializers.o: device-initializers.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-device-initializers.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-device-initializers.Tpo -c -o libhpcrun_o-device-initializers.o `test -f 'device-initializers.c' || echo '$(srcdir)/'`device-initializers.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-device-initializers.Tpo $(DEPDIR)/libhpcrun_o-device-initializers.Po
diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
new file mode 100644
index 0000000000..13ca174a4b
--- /dev/null
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -0,0 +1,34 @@
+//
+// Created by dejan on 7/15/20.
+//
+
+#include "gpu-monitors.h"
+
+static gpu_monitor_fn_entry_t *kinds[2] = {0, 0};
+static const char *gpu_name[] = {"unknown", "nvidia", "amd", "intel"};
+
+
+void
+gpu_monitor_register(gpu_monitor_type_t type, gpu_monitor_fn_entry_t *entry)
+{
+	gpu_monitor_fn_entry_t* device_fn = kinds[type];
+	entry->next = device_fn;
+	kinds[type] = entry;
+}
+
+
+void
+gpu_monitors_apply(void *args_in, gpu_monitor_type_t type)
+{
+	gpu_monitor_fn_entry_t* fn = kinds[type];
+	while (fn != 0) {
+		fn->fn(fn->reg_info, args_in);
+		fn = fn->next;
+	}
+}
+
+char *
+gpu_monitors_get_gpu_name(gpu_type_t t)
+{
+	return gpu_name[t];
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
new file mode 100644
index 0000000000..42454f3e0c
--- /dev/null
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -0,0 +1,42 @@
+//
+// Created by dejan on 7/15/20.
+//
+
+#ifndef HPCTOOLKIT_GPU_MONITORS_H
+#define HPCTOOLKIT_GPU_MONITORS_H
+
+#include "cct.h"
+
+typedef void (*gpu_monitor_fn_t)(void* reg_info, void* args_in);
+
+typedef enum {
+	gpu_monitor_type_enter,
+	gpu_monitor_type_exit
+} gpu_monitor_type_t;
+
+typedef enum {
+	unknown,
+	nvidia,
+	amd,
+	intel
+} gpu_type_t;
+
+
+typedef struct gpu_monitors_apply_t {
+	cct_node_t *cct_node;
+	gpu_type_t gpu_type;
+} gpu_monitors_apply_t;
+
+
+typedef struct gpu_monitor_fn_entry_t {
+	struct gpu_monitor_fn_entry_t* next;
+	gpu_monitor_fn_t fn;
+	void* reg_info;
+} gpu_monitor_fn_entry_t;
+
+
+extern void gpu_monitor_register(gpu_monitor_type_t type, gpu_monitor_fn_entry_t* entry);
+extern void gpu_monitors_apply(void *args, gpu_monitor_type_t type);
+char * gpu_monitors_get_gpu_name(gpu_type_t t);
+
+#endif //HPCTOOLKIT_GPU_MONITORS_H
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index d668a485db..bb35d99fcc 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -50,6 +50,8 @@
 
 #include <roctracer_hip.h>
 
+#include <hpcrun/gpu-monitors.h>
+
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-activity-process.h>
 #include <hpcrun/gpu/gpu-correlation-channel.h>
@@ -99,6 +101,7 @@
 //******************************************************************************
 // local variables
 //******************************************************************************
+static __thread cct_node_t *cupti_kernel_ph = NULL;
 
 //----------------------------------------------------------
 // roctracer function pointers for late binding
@@ -368,7 +371,14 @@ roctracer_subscriber_callback
 
     // Generate notification entry
     uint64_t cpu_submit_time = hpcrun_nanotime();
-    gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+
+		printf("\nRuntime API: enter -----------------| cct = %p | gpu = %d\n", api_node, amd );
+		cupti_kernel_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel); //dejan: added
+		gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=amd}, gpu_monitor_type_enter);
+
+		gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+  }else{
+		printf("\nRuntime API_PHASE = %d\n", data->phase);
   }
 }
 
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index da38043d57..9cdafeed20 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -82,6 +82,7 @@
 #include <hpcrun/hpcrun_stats.h>
 #include <hpcrun/main.h> // hpcrun_force_dlopen
 #include <hpcrun/safe-sampling.h>
+#include <hpcrun/gpu-monitors.h>
 
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-application-thread-api.h>
@@ -105,8 +106,9 @@
 
 #include "tool_state.h"
 
+//#include "sample_sources_all.h"
+
 
-#include "sample_sources_all.h"
 //******************************************************************************
 // macros
 //******************************************************************************
@@ -954,10 +956,6 @@ cupti_subscriber_callback
         uint64_t correlation_id = gpu_correlation_id();
         cupti_correlation_id_push(correlation_id);
 
-        printf("Driver API: enter -----------------\n");
-				hpcrun_all_sources_stop();
-				hpcrun_all_sources_start();
-
         cct_node_t *api_node = cupti_correlation_callback(correlation_id);
 
         gpu_op_ccts_t gpu_op_ccts;
@@ -971,13 +969,18 @@ cupti_subscriber_callback
 	    gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel);
 
 	  ensure_kernel_ip_present(kernel_ph, kernel_ip);
-        }
+
+					printf("\nDriver API: enter -----------------\n" );
+					gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph,.gpu_type=nvidia}, gpu_monitor_type_enter);
+
+				}
 
         hpcrun_safe_exit();
 
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
-        gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
+
+				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
 
         TMSG(CUPTI_TRACE, "Driver push externalId %lu (cb_id = %u)", correlation_id, cb_id);
@@ -985,7 +988,12 @@ cupti_subscriber_callback
         uint64_t correlation_id __attribute__((unused)); // not used if PRINT omitted
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
-      }
+
+        printf("\nDriver API: exit -----------------\n" );
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph,.gpu_type=nvidia}, gpu_monitor_type_exit);
+
+
+			}
     } else if (is_kernel_op && cupti_runtime_api_flag && cd->callbackSite ==
       CUPTI_API_ENTER) {
       if (cupti_kernel_ph != NULL) {
@@ -1105,9 +1113,6 @@ cupti_subscriber_callback
         uint64_t correlation_id = gpu_correlation_id();
         cupti_correlation_id_push(correlation_id);
 
-				printf("Runtime API: enter -----------------\n");
-				hpcrun_all_sources_stop();
-				hpcrun_all_sources_start();
         // We should make notification records in the api enter callback.
         // A runtime API must be implemented by driver APIs.
         // Though unlikely in most cases,
@@ -1127,7 +1132,11 @@ cupti_subscriber_callback
 
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
-        gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
+
+				printf("\nRuntime API: enter -----------------\n" );
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=nvidia}, gpu_monitor_type_enter);
+
+				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
 
         TMSG(CUPTI_TRACE, "Runtime push externalId %lu (cb_id = %u)", correlation_id, cb_id);
@@ -1139,7 +1148,10 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Runtime pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-        cupti_kernel_ph = NULL;
+				printf("\nRuntime API: exit -----------------\n" );
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=nvidia}, gpu_monitor_type_exit);
+
+				cupti_kernel_ph = NULL;
       }
     } else {
       TMSG(CUPTI_TRACE, "Go through runtime with kernel_op %d, valid_op %d, "
@@ -1236,7 +1248,7 @@ cupti_buffer_completion_callback
     do {
       status = cupti_buffer_cursor_advance(buffer, validSize, &cupti_activity);
       if (status) {
-        cupti_activity_process(cupti_activity);
+				cupti_activity_process(cupti_activity);
         ++processed;
       }
     } while (status);
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 456850f2c8..93873b2417 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -86,6 +86,7 @@
 #include <hpcrun/hpcrun_options.h>
 #include <hpcrun/hpcrun_stats.h>
 #include <hpcrun/metrics.h>
+#include <hpcrun/gpu-monitors.h>
 #include <hpcrun/safe-sampling.h>
 #include <hpcrun/sample_sources_registered.h>
 #include <hpcrun/sample_event.h>
@@ -100,7 +101,6 @@
 
 #include "tool_state.h"
 
-
 /******************************************************************************
  * macros
  *****************************************************************************/
@@ -136,9 +136,15 @@ static bool disable_papi_cuda = false;
 
 static kind_info_t *papi_kind;
 
+// gpu monitor
+static __thread gpu_monitor_fn_entry_t gpu_monitor_enter;
+static __thread gpu_monitor_fn_entry_t gpu_monitor_exit;
+
 /******************************************************************************
  * private operations 
  *****************************************************************************/
+static void papi_monitor_enter(void *reg_info, void *args_in);
+static void papi_monitor_exit(void *reg_info, void *args_in);
 
 static int
 get_event_index(sample_source_t *self, int event_code)
@@ -155,7 +161,7 @@ get_event_index(sample_source_t *self, int event_code)
 //
 // fetch a given component's event set. Create one if need be
 //
-int 
+int
 get_component_event_set(papi_source_info_t* psi, int cidx)
 {
    if (cidx < 0 || cidx >= psi->num_components) {
@@ -354,7 +360,7 @@ METHOD_FN(start)
 	  EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx);
 	}
 	else if (ret != PAPI_OK) {
-	  EMSG("PAPI_start failed with %s (%d) for event set %d component %d ", 
+	  EMSG("PAPI_start failed with %s (%d) for event set %d component %d ",
 	       PAPI_strerror(ret), ret, ci->eventSet, cidx);
 	  hpcrun_ssfail_start("PAPI");
 	}
@@ -362,7 +368,7 @@ METHOD_FN(start)
 	if (ci->some_derived) {
 	  ret = PAPI_read(ci->eventSet, ci->prev_values);
 	  if (ret != PAPI_OK) {
-	    EMSG("PAPI_read of event set %d for component %d failed with %s (%d)", 
+	    EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
 		 ci->eventSet, cidx, PAPI_strerror(ret), ret);
 	  }
 	}
@@ -425,10 +431,6 @@ METHOD_FN(stop)
       else {
 	TMSG(PAPI,"stop w event set = %d", ci->eventSet);
 	long_long values[nevents+2];
-//for (int i = 0; i < nevents+2; ++i) {
-//	values[i] = 0;
-//	printf("values_prev[%d] = %llu\n", i, values[i]);
-//}
 	//	long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
 	int ret = PAPI_stop(ci->eventSet, values);
 	if (ret != PAPI_OK){
@@ -436,7 +438,7 @@ METHOD_FN(stop)
 	       ci->eventSet, ret, PAPI_strerror(ret));
 	}
 
-	METHOD_CALL(self, print_counters, values);
+//	METHOD_CALL(self, print_counters, values);
       }
     }
   }
@@ -473,14 +475,14 @@ METHOD_FN(supports_event, const char *ev_str)
 	tool_enter();
 	bool ret;
   ev_str = strip_papi_prefix(ev_str);
-  
+
   TMSG(PAPI, "supports event");
   if (papi_unavail) { ret = false; goto finish;}
 
   if (self->state == UNINIT){
     METHOD_CALL(self, init);
   }
-  
+
   char evtmp[1024];
   int ec;
   long th;
@@ -493,7 +495,7 @@ METHOD_FN(supports_event, const char *ev_str)
 	tool_exit();
 	return ret;
 }
- 
+
 static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
@@ -634,7 +636,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
   if (papi_unavail) { goto finish; }
 
   int num_components = PAPI_num_components();
-  int ss_info_size = sizeof(papi_source_info_t) + 
+  int ss_info_size = sizeof(papi_source_info_t) +
 	num_components * sizeof(papi_component_info_t);
 
   TMSG(PAPI, "Num components = %d", num_components);
@@ -647,7 +649,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
   psi->num_components = num_components;
   for (i = 0; i < num_components; i++) {
     papi_component_info_t *ci = &(psi->component_info[i]);
-    ci->inUse = false;  
+    ci->inUse = false;
     ci->eventSet = PAPI_NULL;
     ci->state = INIT;
     ci->some_derived = 0;
@@ -670,15 +672,15 @@ METHOD_FN(gen_event_set, int lush_metrics)
   for (i = 0; i < nevents; i++) {
     int evcode = self->evl.events[i].event;
     int cidx = PAPI_get_event_component(evcode);
-    
+
     ret = component_add_event(psi, cidx, evcode);
     psi->component_info[cidx].some_derived |= event_is_derived(evcode);
     TMSG(PAPI, "Added event code %x to component %d", evcode, cidx);
     {
       char buffer[PAPI_MAX_STR_LEN];
       PAPI_event_code_to_name(evcode, buffer);
-      TMSG(PAPI, 
-	   "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d", 
+      TMSG(PAPI,
+	   "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d",
 	   /* eventSet, */ evcode, buffer, cidx);
     }
     if (ret != PAPI_OK) {
@@ -701,7 +703,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
     long thresh = self->evl.events[i].thresh;
     int cidx = PAPI_get_event_component(evcode);
     int eventSet = get_component_event_set(psi, cidx);
-    
+
     // **** No overflow for synchronous events ****
     // **** Use component-specific setup for synchronous events ****
     if (component_uses_sync_samples(cidx)) {
@@ -715,7 +717,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
     if (! derived[i]) {
       ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE,
 			  papi_event_handler);
-      TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d", 
+      TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d",
 	   eventSet, evcode, thresh, ret);
       if (ret != PAPI_OK) {
 	EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
@@ -725,6 +727,17 @@ METHOD_FN(gen_event_set, int lush_metrics)
     }
   }
 
+  /// Register papi handler callbacks
+  gpu_monitor_enter.reg_info = psi;
+	gpu_monitor_enter.fn = papi_monitor_enter;
+	gpu_monitor_register(gpu_monitor_type_enter,
+														&gpu_monitor_enter);
+
+	gpu_monitor_exit.reg_info = psi;
+	gpu_monitor_exit.fn = papi_monitor_exit;
+	gpu_monitor_register(gpu_monitor_type_exit,
+														&gpu_monitor_exit);
+
 finish:
 	tool_exit();
 }
@@ -776,7 +789,7 @@ METHOD_FN(display_events)
     printf("\n\n");
   }
 
-  num_components = PAPI_num_components(); 
+  num_components = PAPI_num_components();
   for(cidx = 0; cidx < num_components; cidx++) {
     const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
     int cmp_event_count = 0;
@@ -788,7 +801,7 @@ METHOD_FN(display_events)
     printf("\n");
     printf("Name  Description\n");
     printf("===========================================================================\n");
-    
+
     ev = 0 | PAPI_NATIVE_MASK;
     ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_FIRST, cidx);
     while (ret == PAPI_OK) {
@@ -811,7 +824,7 @@ METHOD_FN(display_events)
 	tool_exit();
 }
 
-static void
+void
 METHOD_FN(print_counters, const long long *values)
 {
 	char* evlist = METHOD_CALL(self, get_event_str);
@@ -838,7 +851,7 @@ METHOD_FN(print_counters, const long long *values)
 #include "ss_obj.h"
 
 // **************************************************************************
-// * public operations 
+// * public operations
 // **************************************************************************
 void
 hpcrun_disable_papi_cuda(void)
@@ -849,7 +862,7 @@ hpcrun_disable_papi_cuda(void)
 }
 
 /******************************************************************************
- * private operations 
+ * private operations
  *****************************************************************************/
 
 // Returns: 1 if the event code is a derived event.
@@ -946,11 +959,11 @@ papi_event_handler(int event_set, void *pc, long long ovec,
     }
   }
 
-  ret = PAPI_get_overflow_event_index(event_set, ovec, my_events, 
+  ret = PAPI_get_overflow_event_index(event_set, ovec, my_events,
 				      &my_event_count);
   if (ret != PAPI_OK) {
     TMSG(PAPI_SAMPLE, "papi_event_handler: event set %d ovec %ld "
-	 "get_overflow_event_index return code = %d ==> %s", 
+	 "get_overflow_event_index return code = %d ==> %s",
 	 event_set, ovec, ret, PAPI_strerror(ret));
 #ifdef DEBUG_PAPI_OVERFLOW
     ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
@@ -959,7 +972,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
 	   "Return code = %d ==> %s", ret, PAPI_strerror(ret));
     } else {
       for (i = 0; i < my_event_codes_count; i++) {
-        TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n", 
+        TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n",
 	     event_set, i, my_event_codes[i]);
       }
     }
@@ -978,7 +991,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
     // This means lush's 'time' metric should be *last*
 
     TMSG(PAPI_SAMPLE,"handling papi overflow event: "
-	"event set %d event index = %d event code = 0x%x", 
+	"event set %d event index = %d event code = 0x%x",
 	event_set, my_events[i], my_event_codes[my_events[i]]);
 
     int event_index = get_event_index(self, my_event_codes[my_events[i]]);
@@ -996,7 +1009,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
       metricIncrement = 1;
     }
 
-    sample_val_t sv = hpcrun_sample_callpath(context, metric_id, 
+    sample_val_t sv = hpcrun_sample_callpath(context, metric_id,
 			(hpcrun_metricVal_t) {.i=metricIncrement},
 			0/*skipInner*/, 0/*isSync*/, NULL);
 
@@ -1027,3 +1040,122 @@ papi_event_handler(int event_set, void *pc, long long ovec,
 	tool_exit();
   hpcrun_safe_exit();
 }
+
+//static __thread cct_node_t *cct_node;
+static __thread long long prev_values[MAX_EVENTS];
+
+static void
+papi_monitor_enter(void *reg_info, void *args_in)
+{
+	tool_enter();
+	printf("|------->PAPI_MONITOR_ENTER\n");
+	papi_source_info_t *psi = (papi_source_info_t *) reg_info;
+	gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
+
+	sample_source_t *self = &obj_name(); /// just for debug
+	int ret;
+
+
+	// if sampling disabled explicitly for this thread, skip all processing
+	if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
+
+	if (args->gpu_type == nvidia)
+		cudaDeviceSynchronize();
+
+	// Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
+	for (int cid = 0; cid < psi->num_components; ++cid) {
+		papi_component_info_t *ci = &(psi->component_info[cid]);
+		if (ci->inUse) {
+			printf("Self = %p | Component %d ---> %p \t | cct = %p | gpu = %s\n\n", self, cid, &ci, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
+
+//			ret = PAPI_read(ci->eventSet, ci->prev_values);
+			ret = PAPI_read(ci->eventSet, prev_values);
+//			ret = PAPI_start(ci->eventSet);
+//			ret = PAPI_start(ci->eventSet);
+
+			if (ret != PAPI_OK) {
+				EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
+						 ci->eventSet, cid, PAPI_strerror(ret), ret);
+			}
+
+//			printf("%d Event value = %llu\n", 0, ci->prev_values[0]);
+//			printf("%d Event value = %llu\n", 1, ci->prev_values[1]);
+//
+//			ret = PAPI_read(ci->eventSet, ci->prev_values);
+//			printf("%d Event value = %llu\n", 0, ci->prev_values[0]);
+//			printf("%d Event value = %llu\n", 1, ci->prev_values[1]);
+//
+//			ret = PAPI_read(ci->eventSet, ci->prev_values);
+//			printf("%d Event value = %llu\n", 0, ci->prev_values[0]);
+//			printf("%d Event value = %llu\n", 1, ci->prev_values[1]);
+
+		}
+	}
+
+finish:
+	tool_exit();
+}
+
+static void
+papi_monitor_exit(void *reg_info, void *args_in)
+{
+	tool_enter();
+	printf("|------->PAPI_MONITOR_EXIT\n");
+	papi_source_info_t *psi = (papi_source_info_t *) reg_info;
+	gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
+
+	sample_source_t *self = &obj_name(); /// just for debug
+	int my_event_codes[MAX_EVENTS];
+	long long my_event_values[MAX_EVENTS];
+	int my_event_count = MAX_EVENTS;
+	int ret;
+
+
+	// if sampling disabled explicitly for this thread, skip all processing
+	if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
+
+	if (args->gpu_type == nvidia)
+		cudaDeviceSynchronize();
+
+
+	// Collect counters for components in use
+	for (int cid = 0; cid < psi->num_components; ++cid) {
+		papi_component_info_t *ci = &(psi->component_info[cid]);
+		if (ci->inUse){
+//			ret = PAPI_read(ci->eventSet, my_event_values);
+			ret = PAPI_read(ci->eventSet, my_event_values);
+//			ret = PAPI_start(ci->eventSet);
+			if (ret != PAPI_OK) {
+				EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
+						 ci->eventSet, cid, PAPI_strerror(ret), ret);
+			}
+		}
+	}
+
+	// Attribute collected metric to cct nodes
+	for (int cid = 0; cid < psi->num_components; ++cid) {
+		papi_component_info_t* ci = &(psi->component_info[cid]);
+		if (ci->inUse){
+			printf("Self = %p | Component %d ---> %p \t | cct = %p | gpu = %s\n\n", self, cid, &ci, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
+
+			ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
+			if (ret != PAPI_OK) {
+				hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
+										 "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+			}
+
+			for (int eid = 0; eid < my_event_count; ++eid) {
+				int event_index = get_event_index(self, my_event_codes[eid]);
+				int metric_id = hpcrun_event2metric(self, event_index);
+
+				printf("%d Event = %x, event_index = %d, metric_id = %d || value = %llu -> %llu\n",
+							 eid, my_event_codes[eid], event_index, metric_id, ci->prev_values[eid], my_event_values[eid]);
+
+				blame_shift_apply(metric_id, args->cct_node, my_event_values[eid] - ci->prev_values[eid] /*metricIncr*/);
+			}
+		}
+	}
+
+finish:
+	tool_exit();
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index 2387765f94..df4cfbe101 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -84,7 +84,6 @@ typedef struct {
 } papi_source_info_t;
 
 
-
 /******************************************************************************
  * external declarations 
  *****************************************************************************/

From d5f6b2ee172704cb32e3cde71edaf8b138e4159e Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 4 Aug 2020 19:21:04 -0500
Subject: [PATCH 013/177] module-ignore-map includes rocm, deleted
 print_counter function

---
 src/tool/hpcrun/gpu/amd/roctracer-api.c       | 36 +++++----
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        | 42 +++++-----
 src/tool/hpcrun/module-ignore-map.c           |  5 +-
 src/tool/hpcrun/sample-sources/amd.c          |  5 --
 .../blame-shift/blame-sample-source.h         |  6 --
 src/tool/hpcrun/sample-sources/cuda.c         |  4 -
 src/tool/hpcrun/sample-sources/ga.c           |  6 --
 src/tool/hpcrun/sample-sources/generic.c      |  6 --
 src/tool/hpcrun/sample-sources/gpu_blame.c    |  6 --
 src/tool/hpcrun/sample-sources/idle.c         |  6 --
 src/tool/hpcrun/sample-sources/io.c           |  5 --
 src/tool/hpcrun/sample-sources/itimer.c       |  5 --
 src/tool/hpcrun/sample-sources/memleak.c      |  6 --
 src/tool/hpcrun/sample-sources/mpi.c          |  5 --
 src/tool/hpcrun/sample-sources/none.c         |  5 --
 src/tool/hpcrun/sample-sources/nvidia.c       |  4 -
 src/tool/hpcrun/sample-sources/papi-c.c       | 78 ++++++-------------
 src/tool/hpcrun/sample-sources/papi.c         |  6 --
 .../hpcrun/sample-sources/perf/linux_perf.c   |  5 --
 .../hpcrun/sample-sources/pthread-blame.c     |  5 --
 src/tool/hpcrun/sample-sources/retcnt.c       |  5 --
 .../hpcrun/sample-sources/sample_source_obj.h |  1 -
 src/tool/hpcrun/sample-sources/ss_obj.h       |  1 -
 src/tool/hpcrun/sample-sources/sync.c         |  5 --
 src/tool/hpcrun/sample-sources/tst.c          |  5 --
 src/tool/hpcrun/sample-sources/upc.c          |  5 --
 src/tool/hpcrun/tool_state.c                  |  2 +-
 27 files changed, 72 insertions(+), 198 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index bb35d99fcc..ad617008f4 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -260,8 +260,8 @@ roctracer_subscriber_callback
 )
 {
   if (is_tool_active()) {
-		TMSG(ROCM, "PAPI correlation callback");
-		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
+//		TMSG(ROCM, "PAPI correlation callback");
+//		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
 		return;
   }
 
@@ -358,28 +358,34 @@ roctracer_subscriber_callback
 
 
   if (data->phase == ACTIVITY_API_PHASE_ENTER) {
-    uint64_t correlation_id = data->correlation_id;
-    cct_node_t *api_node =
-      gpu_application_thread_correlation_callback(correlation_id);
+		uint64_t correlation_id = data->correlation_id;
+		cct_node_t *api_node =
+		gpu_application_thread_correlation_callback(correlation_id);
 
-    gpu_op_ccts_t gpu_op_ccts;
-    hpcrun_safe_enter();
-    gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
-    hpcrun_safe_exit();
+		gpu_op_ccts_t gpu_op_ccts;
+		hpcrun_safe_enter();
+		gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
+		hpcrun_safe_exit();
 
-    gpu_activity_channel_consume(gpu_metrics_attribute);
+		gpu_activity_channel_consume(gpu_metrics_attribute);
 
-    // Generate notification entry
-    uint64_t cpu_submit_time = hpcrun_nanotime();
+		// Generate notification entry
+		uint64_t cpu_submit_time = hpcrun_nanotime();
 
-		printf("\nRuntime API: enter -----------------| cct = %p | gpu = %d\n", api_node, amd );
 		cupti_kernel_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel); //dejan: added
-		gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=amd}, gpu_monitor_type_enter);
+
+		printf("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd);
+		gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=cupti_kernel_ph, .gpu_type=amd}, gpu_monitor_type_enter);
 
 		gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+  }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
+		printf("\nACTIVITY_API_PHASE_EXIT -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd );
+		cupti_kernel_ph = NULL;
   }else{
-		printf("\nRuntime API_PHASE = %d\n", data->phase);
+  	;
   }
+
+
 }
 
 
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 9cdafeed20..5852a588b3 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -112,6 +112,8 @@
 //******************************************************************************
 // macros
 //******************************************************************************
+#define DEBUG 1
+#include <hpcrun/gpu/gpu-print.h>
 
 #define CUPTI_LIBRARY_LOCATION "/lib64/libcupti.so"
 #define CUPTI_PATH_FROM_CUDA "extras/CUPTI"
@@ -763,8 +765,11 @@ cupti_subscriber_callback
 {
 
 	if (is_tool_active()) {
-		TMSG(CUPTI, "PAPI correlation callback");
-		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
+//		const CUpti_CallbackData *cd = (const CUpti_CallbackData *) cb_info;
+//		PRINT("\nTOOL callback: -----------------%s\n", cd->functionName );
+
+//		TMSG(CUPTI, "PAPI correlation callback");
+//		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
 		return;
 	}
 
@@ -795,6 +800,7 @@ cupti_subscriber_callback
     cupti_stop_flag_set();
 
     const CUpti_CallbackData *cd = (const CUpti_CallbackData *) cb_info;
+		printf("\nDriver API:  -----------------%s\n", cd->functionName );
 
     bool ompt_runtime_api_flag = ompt_runtime_status_get();
 
@@ -945,11 +951,14 @@ cupti_subscriber_callback
         break;
     }
 
-    bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags,
-      gpu_placeholder_type_kernel);
+    bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags,gpu_placeholder_type_kernel);
+
+//		PRINT("DRIVER: is_valid_op = %d \t is_kernel = %d \t cupti_runtime_api_flag = %d \t ompt_runtime_api_flag = %d | callback_site = %d\n",
+//					 is_valid_op, is_kernel_op, cupti_runtime_api_flag, ompt_runtime_api_flag, cd->callbackSite);
+
     // If we have a valid operation and is not in the interval of a cuda/ompt runtime api
     if (is_valid_op && !cupti_runtime_api_flag && !ompt_runtime_api_flag) {
-      if (cd->callbackSite == CUPTI_API_ENTER) {
+			if (cd->callbackSite == CUPTI_API_ENTER) {
         // A driver API cannot be implemented by other driver APIs, so we get an id
         // and unwind when the API is entered
 
@@ -965,21 +974,17 @@ cupti_subscriber_callback
         gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
 
         if (is_kernel_op) {
-          cct_node_t *kernel_ph = 
-	    gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel);
-
-	  ensure_kernel_ip_present(kernel_ph, kernel_ip);
-
-					printf("\nDriver API: enter -----------------\n" );
-					gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph,.gpu_type=nvidia}, gpu_monitor_type_enter);
-
+          cct_node_t *kernel_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel);
+				  ensure_kernel_ip_present(kernel_ph, kernel_ip);
 				}
-
         hpcrun_safe_exit();
 
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
+//				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=api_node,.gpu_type=nvidia}, gpu_monitor_type_enter);
+
+
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
 
@@ -989,8 +994,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-        printf("\nDriver API: exit -----------------\n" );
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph,.gpu_type=nvidia}, gpu_monitor_type_exit);
+//				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=NULL,.gpu_type=nvidia}, gpu_monitor_type_exit);
 
 
 			}
@@ -1011,6 +1015,7 @@ cupti_subscriber_callback
     cupti_stop_flag_set();
 
     const CUpti_CallbackData *cd = (const CUpti_CallbackData *)cb_info;
+		printf("\nRuntime API:  -----------------%s\n", cd->functionName );
 
     bool is_valid_op = false;
     bool is_kernel_op __attribute__((unused)) = false; // used only by PRINT when debugging
@@ -1106,6 +1111,9 @@ cupti_subscriber_callback
         break;
     }
 
+//		PRINT("RUNTIME: is_valid_op = %d \t is_kernel = %d \t cupti_runtime_api_flag = %d \t ompt_runtime_api_flag = %d | callback_site = %d\n",
+//					 is_valid_op, is_kernel_op, cupti_runtime_api_flag, ompt_runtime_status_get(), cd->callbackSite);
+
     if (is_valid_op) {
       if (cd->callbackSite == CUPTI_API_ENTER) {
         // Enter a CUDA runtime api
@@ -1133,7 +1141,6 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				printf("\nRuntime API: enter -----------------\n" );
 				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=nvidia}, gpu_monitor_type_enter);
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
@@ -1148,7 +1155,6 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Runtime pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				printf("\nRuntime API: exit -----------------\n" );
 				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=nvidia}, gpu_monitor_type_exit);
 
 				cupti_kernel_ph = NULL;
diff --git a/src/tool/hpcrun/module-ignore-map.c b/src/tool/hpcrun/module-ignore-map.c
index 0a6d272a93..7a801bc676 100644
--- a/src/tool/hpcrun/module-ignore-map.c
+++ b/src/tool/hpcrun/module-ignore-map.c
@@ -102,7 +102,7 @@
 #define PRINT(...)
 #endif
 
-#define NUM_FNS 3
+#define NUM_FNS 4
 
 
 
@@ -122,7 +122,8 @@ typedef struct module_ignore_entry {
 //***************************************************************************
 
 static const char *NVIDIA_FNS[NUM_FNS] = {
-  "cuLaunchKernel", "cudaLaunchKernel", "cuptiActivityEnable"
+  "cuLaunchKernel", "cudaLaunchKernel", "cuptiActivityEnable",
+  "roctracer_start"
 };
 static module_ignore_entry_t modules[NUM_FNS];
 static pfq_rwlock_t modules_lock;
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index 9823a531cb..a5d8a492a0 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -193,11 +193,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 //**************************************************************************
 // object
 //**************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
index 48a7f1dcf1..c533c59826 100644
--- a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
+++ b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
@@ -192,12 +192,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 /*--------------------------------------------------------------------------
  | sample source object
  --------------------------------------------------------------------------*/
diff --git a/src/tool/hpcrun/sample-sources/cuda.c b/src/tool/hpcrun/sample-sources/cuda.c
index fdaaaf8953..81bbb5a252 100644
--- a/src/tool/hpcrun/sample-sources/cuda.c
+++ b/src/tool/hpcrun/sample-sources/cuda.c
@@ -422,10 +422,6 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
 
 /***************************************************************************
  * object
diff --git a/src/tool/hpcrun/sample-sources/ga.c b/src/tool/hpcrun/sample-sources/ga.c
index b7bfb8e9fe..a8490d6d41 100644
--- a/src/tool/hpcrun/sample-sources/ga.c
+++ b/src/tool/hpcrun/sample-sources/ga.c
@@ -237,12 +237,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 //***************************************************************************
 // object
 //***************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/generic.c b/src/tool/hpcrun/sample-sources/generic.c
index d9173c0335..997365d4eb 100644
--- a/src/tool/hpcrun/sample-sources/generic.c
+++ b/src/tool/hpcrun/sample-sources/generic.c
@@ -447,12 +447,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 //***************************************************************************
 // object
 //***************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/gpu_blame.c b/src/tool/hpcrun/sample-sources/gpu_blame.c
index be19d2bb9c..b7e79f7672 100644
--- a/src/tool/hpcrun/sample-sources/gpu_blame.c
+++ b/src/tool/hpcrun/sample-sources/gpu_blame.c
@@ -258,12 +258,6 @@ static void METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/idle.c b/src/tool/hpcrun/sample-sources/idle.c
index c12f8dd9f8..f43588fa34 100644
--- a/src/tool/hpcrun/sample-sources/idle.c
+++ b/src/tool/hpcrun/sample-sources/idle.c
@@ -231,12 +231,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/io.c b/src/tool/hpcrun/sample-sources/io.c
index ccd6b9c854..7c91344211 100644
--- a/src/tool/hpcrun/sample-sources/io.c
+++ b/src/tool/hpcrun/sample-sources/io.c
@@ -184,11 +184,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/itimer.c b/src/tool/hpcrun/sample-sources/itimer.c
index c887b76a46..3a2eb67e4b 100644
--- a/src/tool/hpcrun/sample-sources/itimer.c
+++ b/src/tool/hpcrun/sample-sources/itimer.c
@@ -662,11 +662,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/memleak.c b/src/tool/hpcrun/sample-sources/memleak.c
index 63ca5036d1..ff8df2b253 100644
--- a/src/tool/hpcrun/sample-sources/memleak.c
+++ b/src/tool/hpcrun/sample-sources/memleak.c
@@ -205,12 +205,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/mpi.c b/src/tool/hpcrun/sample-sources/mpi.c
index d4e2ba5494..aa43fe41bc 100644
--- a/src/tool/hpcrun/sample-sources/mpi.c
+++ b/src/tool/hpcrun/sample-sources/mpi.c
@@ -187,11 +187,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/none.c b/src/tool/hpcrun/sample-sources/none.c
index 151a8344d2..51a89c2a53 100644
--- a/src/tool/hpcrun/sample-sources/none.c
+++ b/src/tool/hpcrun/sample-sources/none.c
@@ -188,11 +188,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c
index c33dd97e9a..9de7326c43 100644
--- a/src/tool/hpcrun/sample-sources/nvidia.c
+++ b/src/tool/hpcrun/sample-sources/nvidia.c
@@ -457,10 +457,6 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
-static void
-METHOD_FN(print_counters, const int *values)
-{
-}
 
 //******************************************************************************
 // object
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 93873b2417..31dc8041a3 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -279,8 +279,8 @@ METHOD_FN(init)
       EMSG("warning: PAPI_set_domain(PAPI_DOM_ALL) failed: %d", ret);
     }
   }
-
-  self->state = INIT;
+  
+	self->state = INIT;
   tool_exit();
 }
 
@@ -432,13 +432,13 @@ METHOD_FN(stop)
 	TMSG(PAPI,"stop w event set = %d", ci->eventSet);
 	long_long values[nevents+2];
 	//	long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
+
 	int ret = PAPI_stop(ci->eventSet, values);
 	if (ret != PAPI_OK){
 	  EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
 	       ci->eventSet, ret, PAPI_strerror(ret));
 	}
 
-//	METHOD_CALL(self, print_counters, values);
       }
     }
   }
@@ -824,21 +824,6 @@ METHOD_FN(display_events)
 	tool_exit();
 }
 
-void
-METHOD_FN(print_counters, const long long *values)
-{
-	char* evlist = METHOD_CALL(self, get_event_str);
-	char *event;
-	int evcode;
-	int i;
-
-	for (i = 0, event = start_tok(evlist); more_tok(); i++, event = next_tok()) {
-		PAPI_event_name_to_code(event, &evcode);
-
-		printf("event %s \t-> event code = %x, value = %llu\n", event, evcode, values[i]);
-	}
-
-}
 
 /***************************************************************************
  * object
@@ -1041,54 +1026,44 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   hpcrun_safe_exit();
 }
 
-//static __thread cct_node_t *cct_node;
+
+static __thread cct_node_t *cct_node;
 static __thread long long prev_values[MAX_EVENTS];
 
 static void
 papi_monitor_enter(void *reg_info, void *args_in)
 {
 	tool_enter();
-	printf("|------->PAPI_MONITOR_ENTER\n");
 	papi_source_info_t *psi = (papi_source_info_t *) reg_info;
 	gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
 
 	sample_source_t *self = &obj_name(); /// just for debug
 	int ret;
 
+	printf("|------->PAPI_MONITOR_ENTER | running? %d\n", METHOD_CALL(self, started));
 
 	// if sampling disabled explicitly for this thread, skip all processing
 	if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-	if (args->gpu_type == nvidia)
-		cudaDeviceSynchronize();
+	cct_node = args->cct_node;
+
+  if (args->gpu_type == amd)
+    hipDeviceSynchronize();
 
 	// Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
 	for (int cid = 0; cid < psi->num_components; ++cid) {
 		papi_component_info_t *ci = &(psi->component_info[cid]);
 		if (ci->inUse) {
-			printf("Self = %p | Component %d ---> %p \t | cct = %p | gpu = %s\n\n", self, cid, &ci, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
+			printf("Self = %p | Component %d \t | cct = %p | gpu = %s\n\n", self, cid, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
 
-//			ret = PAPI_read(ci->eventSet, ci->prev_values);
 			ret = PAPI_read(ci->eventSet, prev_values);
-//			ret = PAPI_start(ci->eventSet);
-//			ret = PAPI_start(ci->eventSet);
+			//			ret = PAPI_start(ci->eventSet);
 
 			if (ret != PAPI_OK) {
 				EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
 						 ci->eventSet, cid, PAPI_strerror(ret), ret);
 			}
 
-//			printf("%d Event value = %llu\n", 0, ci->prev_values[0]);
-//			printf("%d Event value = %llu\n", 1, ci->prev_values[1]);
-//
-//			ret = PAPI_read(ci->eventSet, ci->prev_values);
-//			printf("%d Event value = %llu\n", 0, ci->prev_values[0]);
-//			printf("%d Event value = %llu\n", 1, ci->prev_values[1]);
-//
-//			ret = PAPI_read(ci->eventSet, ci->prev_values);
-//			printf("%d Event value = %llu\n", 0, ci->prev_values[0]);
-//			printf("%d Event value = %llu\n", 1, ci->prev_values[1]);
-
 		}
 	}
 
@@ -1100,7 +1075,6 @@ static void
 papi_monitor_exit(void *reg_info, void *args_in)
 {
 	tool_enter();
-	printf("|------->PAPI_MONITOR_EXIT\n");
 	papi_source_info_t *psi = (papi_source_info_t *) reg_info;
 	gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
 
@@ -1110,34 +1084,26 @@ papi_monitor_exit(void *reg_info, void *args_in)
 	int my_event_count = MAX_EVENTS;
 	int ret;
 
+	printf("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
+
+	if (args->gpu_type == amd)
+    hipDeviceSynchronize();
 
 	// if sampling disabled explicitly for this thread, skip all processing
 	if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-	if (args->gpu_type == nvidia)
-		cudaDeviceSynchronize();
-
-
 	// Collect counters for components in use
 	for (int cid = 0; cid < psi->num_components; ++cid) {
 		papi_component_info_t *ci = &(psi->component_info[cid]);
 		if (ci->inUse){
-//			ret = PAPI_read(ci->eventSet, my_event_values);
 			ret = PAPI_read(ci->eventSet, my_event_values);
-//			ret = PAPI_start(ci->eventSet);
+
 			if (ret != PAPI_OK) {
 				EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
 						 ci->eventSet, cid, PAPI_strerror(ret), ret);
 			}
-		}
-	}
-
-	// Attribute collected metric to cct nodes
-	for (int cid = 0; cid < psi->num_components; ++cid) {
-		papi_component_info_t* ci = &(psi->component_info[cid]);
-		if (ci->inUse){
-			printf("Self = %p | Component %d ---> %p \t | cct = %p | gpu = %s\n\n", self, cid, &ci, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
 
+			// Attribute collected metric to cct nodes
 			ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
 			if (ret != PAPI_OK) {
 				hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
@@ -1148,13 +1114,15 @@ papi_monitor_exit(void *reg_info, void *args_in)
 				int event_index = get_event_index(self, my_event_codes[eid]);
 				int metric_id = hpcrun_event2metric(self, event_index);
 
-				printf("%d Event = %x, event_index = %d, metric_id = %d || value = %llu -> %llu\n",
-							 eid, my_event_codes[eid], event_index, metric_id, ci->prev_values[eid], my_event_values[eid]);
+				printf("%d Event = %x, event_index = %d, metric_id = %d || value = %llu ---> %llu\n",
+							 eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
 
-				blame_shift_apply(metric_id, args->cct_node, my_event_values[eid] - ci->prev_values[eid] /*metricIncr*/);
+				blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
 			}
+
 		}
 	}
+	cct_node = NULL;
 
 finish:
 	tool_exit();
diff --git a/src/tool/hpcrun/sample-sources/papi.c b/src/tool/hpcrun/sample-sources/papi.c
index 729a885f19..7fea3ee109 100644
--- a/src/tool/hpcrun/sample-sources/papi.c
+++ b/src/tool/hpcrun/sample-sources/papi.c
@@ -634,12 +634,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c
index 67aa4e8c2f..ae6e11be97 100644
--- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c
+++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c
@@ -982,11 +982,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 // --------------------------------------------------------------------------
 // read a counter from the file descriptor,
 //  and returns the value of the counter
diff --git a/src/tool/hpcrun/sample-sources/pthread-blame.c b/src/tool/hpcrun/sample-sources/pthread-blame.c
index 1187d4d3b8..0c7491915a 100644
--- a/src/tool/hpcrun/sample-sources/pthread-blame.c
+++ b/src/tool/hpcrun/sample-sources/pthread-blame.c
@@ -393,11 +393,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /*--------------------------------------------------------------------------
  | sample source object
  --------------------------------------------------------------------------*/
diff --git a/src/tool/hpcrun/sample-sources/retcnt.c b/src/tool/hpcrun/sample-sources/retcnt.c
index 5aaa90972e..dea5321b4c 100644
--- a/src/tool/hpcrun/sample-sources/retcnt.c
+++ b/src/tool/hpcrun/sample-sources/retcnt.c
@@ -209,11 +209,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 #define ss_name retcnt
 #define ss_cls SS_SOFTWARE
 #define ss_sort_order 100
diff --git a/src/tool/hpcrun/sample-sources/sample_source_obj.h b/src/tool/hpcrun/sample-sources/sample_source_obj.h
index c497898d80..d74c9db766 100644
--- a/src/tool/hpcrun/sample-sources/sample_source_obj.h
+++ b/src/tool/hpcrun/sample-sources/sample_source_obj.h
@@ -106,7 +106,6 @@ typedef struct sample_source_t {
   VMETHOD_DEF(finalize_event_list);
   VMETHOD_DEF(gen_event_set, int lush_agents);
   VMETHOD_DEF(display_events);
-	VMETHOD_DEF(print_counters, const long long *values);
 
   // data
   evlist_t       	  evl;       	 // event list
diff --git a/src/tool/hpcrun/sample-sources/ss_obj.h b/src/tool/hpcrun/sample-sources/ss_obj.h
index 12d8a7c0db..137e49afb7 100644
--- a/src/tool/hpcrun/sample-sources/ss_obj.h
+++ b/src/tool/hpcrun/sample-sources/ss_obj.h
@@ -97,7 +97,6 @@ sample_source_t obj_name() = {
   .finalize_event_list = finalize_event_list,
   .gen_event_set = gen_event_set,
   .display_events = display_events,
-	.print_counters = print_counters,
 
   // data
   .evl = {
diff --git a/src/tool/hpcrun/sample-sources/sync.c b/src/tool/hpcrun/sample-sources/sync.c
index 44c3c7fe76..3c80fd253a 100644
--- a/src/tool/hpcrun/sample-sources/sync.c
+++ b/src/tool/hpcrun/sample-sources/sync.c
@@ -200,11 +200,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/tst.c b/src/tool/hpcrun/sample-sources/tst.c
index b6f5438305..76153b044b 100644
--- a/src/tool/hpcrun/sample-sources/tst.c
+++ b/src/tool/hpcrun/sample-sources/tst.c
@@ -367,11 +367,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/upc.c b/src/tool/hpcrun/sample-sources/upc.c
index 5fcac6d541..e9ccec9daf 100644
--- a/src/tool/hpcrun/sample-sources/upc.c
+++ b/src/tool/hpcrun/sample-sources/upc.c
@@ -448,11 +448,6 @@ METHOD_FN(display_events)
 }
 
 
-static void
-METHOD_FN(print_counters, const long long *values)
-{
-}
-
 #define ss_name upc
 #define ss_cls SS_HARDWARE
 
diff --git a/src/tool/hpcrun/tool_state.c b/src/tool/hpcrun/tool_state.c
index 3afaef4a46..03673c6a02 100644
--- a/src/tool/hpcrun/tool_state.c
+++ b/src/tool/hpcrun/tool_state.c
@@ -15,4 +15,4 @@ void tool_exit(){
 
 bool is_tool_active(){
 	return tool_active;
-}
+}
\ No newline at end of file

From 689599eb89359a087cf20125a7842f72d2d80d44 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 4 Aug 2020 22:26:57 -0500
Subject: [PATCH 014/177] indentation

---
 src/tool/hpcrun/gpu/amd/roctracer-api.c |  94 +++----
 src/tool/hpcrun/sample-sources/papi-c.c | 360 ++++++++++++------------
 2 files changed, 227 insertions(+), 227 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index ad617008f4..1d9751ba5f 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -72,7 +72,7 @@
 // macros
 //******************************************************************************
 
-#define FORALL_ROCTRACER_ROUTINES(macro)			\
+#define FORALL_ROCTRACER_ROUTINES(macro)      \
   macro(roctracer_open_pool_expl)   \
   macro(roctracer_enable_callback)  \
   macro(roctracer_enable_activity_expl)  \
@@ -90,10 +90,10 @@
 
 #define HPCRUN_ROCTRACER_CALL(fn, args) \
 {      \
-  roctracer_status_t status = ROCTRACER_FN_NAME(fn) args;	\
-  if (status != ROCTRACER_STATUS_SUCCESS) {		\
+  roctracer_status_t status = ROCTRACER_FN_NAME(fn) args;  \
+  if (status != ROCTRACER_STATUS_SUCCESS) {    \
     /* use roctracer_error_string() */ \
-  }						\
+  }            \
 }
 
 
@@ -214,35 +214,35 @@ roctracer_kernel_data_set
     {
     case HIP_API_ID_hipModuleLaunchKernel:
       entry_data->kernel.blockSharedMemory =
-	data->args.hipModuleLaunchKernel.sharedMemBytes;
+  data->args.hipModuleLaunchKernel.sharedMemBytes;
 
       entry_data->kernel.blockThreads =
-	data->args.hipModuleLaunchKernel.blockDimX *
-	data->args.hipModuleLaunchKernel.blockDimY *
-	data->args.hipModuleLaunchKernel.blockDimZ;
+  data->args.hipModuleLaunchKernel.blockDimX *
+  data->args.hipModuleLaunchKernel.blockDimY *
+  data->args.hipModuleLaunchKernel.blockDimZ;
       break;
 
     case HIP_API_ID_hipLaunchCooperativeKernel:
       entry_data->kernel.blockSharedMemory =
-	data->args.hipLaunchCooperativeKernel.sharedMemBytes;
+  data->args.hipLaunchCooperativeKernel.sharedMemBytes;
 
       entry_data->kernel.blockThreads =
-	data->args.hipLaunchCooperativeKernel.blockDimX.x *
-	data->args.hipLaunchCooperativeKernel.blockDimX.y *
-	data->args.hipLaunchCooperativeKernel.blockDimX.z;
+  data->args.hipLaunchCooperativeKernel.blockDimX.x *
+  data->args.hipLaunchCooperativeKernel.blockDimX.y *
+  data->args.hipLaunchCooperativeKernel.blockDimX.z;
       break;
 
     case HIP_API_ID_hipHccModuleLaunchKernel:
       entry_data->kernel.blockSharedMemory =
-	data->args.hipHccModuleLaunchKernel.sharedMemBytes;
+  data->args.hipHccModuleLaunchKernel.sharedMemBytes;
 
       entry_data->kernel.blockThreads =
-	(data->args.hipHccModuleLaunchKernel.globalWorkSizeX *
-	 data->args.hipHccModuleLaunchKernel.globalWorkSizeY *
-	 data->args.hipHccModuleLaunchKernel.globalWorkSizeZ) +
-	(data->args.hipHccModuleLaunchKernel.localWorkSizeX *
-	 data->args.hipHccModuleLaunchKernel.localWorkSizeY *
-	 data->args.hipHccModuleLaunchKernel.localWorkSizeZ);
+  (data->args.hipHccModuleLaunchKernel.globalWorkSizeX *
+   data->args.hipHccModuleLaunchKernel.globalWorkSizeY *
+   data->args.hipHccModuleLaunchKernel.globalWorkSizeZ) +
+  (data->args.hipHccModuleLaunchKernel.localWorkSizeX *
+   data->args.hipHccModuleLaunchKernel.localWorkSizeY *
+   data->args.hipHccModuleLaunchKernel.localWorkSizeZ);
       break;
     }
 }
@@ -260,9 +260,9 @@ roctracer_subscriber_callback
 )
 {
   if (is_tool_active()) {
-//		TMSG(ROCM, "PAPI correlation callback");
-//		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
-		return;
+//    TMSG(ROCM, "PAPI correlation callback");
+//    gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
+    return;
   }
 
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
@@ -294,7 +294,7 @@ roctracer_subscriber_callback
   case HIP_API_ID_hipMemcpyDtoHAsync:
   case HIP_API_ID_hipMemcpyParam2D:
     gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-				 gpu_placeholder_type_copy);
+         gpu_placeholder_type_copy);
     is_valid_op = true;
     break;
 
@@ -307,7 +307,7 @@ roctracer_subscriber_callback
   case HIP_API_ID_hipMalloc3D:
   case HIP_API_ID_hipExtMallocWithFlags:
     gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-				 gpu_placeholder_type_alloc);
+         gpu_placeholder_type_alloc);
     is_valid_op = true;
     break;
 
@@ -320,14 +320,14 @@ roctracer_subscriber_callback
   case HIP_API_ID_hipMemsetAsync:
   case HIP_API_ID_hipMemsetD32Async:
     gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-				 gpu_placeholder_type_memset);
+         gpu_placeholder_type_memset);
     is_valid_op = true;
     break;
 
   case HIP_API_ID_hipFree:
   case HIP_API_ID_hipFreeArray:
     gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-				 gpu_placeholder_type_delete);
+         gpu_placeholder_type_delete);
     is_valid_op = true;
     break;
 
@@ -337,7 +337,7 @@ roctracer_subscriber_callback
     //case HIP_API_ID_hipExtModuleLaunchKernel:
 
     gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-				 gpu_placeholder_type_kernel);
+         gpu_placeholder_type_kernel);
     is_valid_op = true;
     break;
 
@@ -346,7 +346,7 @@ roctracer_subscriber_callback
   case HIP_API_ID_hipDeviceSynchronize:
   case HIP_API_ID_hipEventSynchronize:
     gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-				 gpu_placeholder_type_sync);
+         gpu_placeholder_type_sync);
     is_valid_op = true;
     break;
 
@@ -358,31 +358,31 @@ roctracer_subscriber_callback
 
 
   if (data->phase == ACTIVITY_API_PHASE_ENTER) {
-		uint64_t correlation_id = data->correlation_id;
-		cct_node_t *api_node =
-		gpu_application_thread_correlation_callback(correlation_id);
+    uint64_t correlation_id = data->correlation_id;
+    cct_node_t *api_node =
+    gpu_application_thread_correlation_callback(correlation_id);
 
-		gpu_op_ccts_t gpu_op_ccts;
-		hpcrun_safe_enter();
-		gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
-		hpcrun_safe_exit();
+    gpu_op_ccts_t gpu_op_ccts;
+    hpcrun_safe_enter();
+    gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
+    hpcrun_safe_exit();
 
-		gpu_activity_channel_consume(gpu_metrics_attribute);
+    gpu_activity_channel_consume(gpu_metrics_attribute);
 
-		// Generate notification entry
-		uint64_t cpu_submit_time = hpcrun_nanotime();
+    // Generate notification entry
+    uint64_t cpu_submit_time = hpcrun_nanotime();
 
-		cupti_kernel_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel); //dejan: added
+    cupti_kernel_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel); //dejan: added
 
-		printf("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd);
-		gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=cupti_kernel_ph, .gpu_type=amd}, gpu_monitor_type_enter);
+    printf("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd);
+    gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=cupti_kernel_ph, .gpu_type=amd}, gpu_monitor_type_enter);
 
-		gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+    gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
-		printf("\nACTIVITY_API_PHASE_EXIT -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd );
-		cupti_kernel_ph = NULL;
+    printf("\nACTIVITY_API_PHASE_EXIT -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd );
+    cupti_kernel_ph = NULL;
   }else{
-  	;
+    ;
   }
 
 
@@ -409,7 +409,7 @@ roctracer_activity_process
   roctracer_activity_translate(&gpu_activity, roctracer_record);
   if (gpu_correlation_id_map_lookup(roctracer_record->correlation_id) == NULL) {
     gpu_correlation_id_map_insert(roctracer_record->correlation_id,
-				  roctracer_record->correlation_id);
+          roctracer_record->correlation_id);
   }
   gpu_activity_process(&gpu_activity);
 }
@@ -494,7 +494,7 @@ roctracer_init
   properties.buffer_callback_arg = 0;
   HPCRUN_ROCTRACER_CALL(roctracer_open_pool_expl,(&properties, NULL));
   HPCRUN_ROCTRACER_CALL(roctracer_enable_callback,
-			(roctracer_subscriber_callback, NULL));
+      (roctracer_subscriber_callback, NULL));
   HPCRUN_ROCTRACER_CALL(roctracer_enable_activity_expl, (NULL));
 }
 
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 31dc8041a3..2f950d8246 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -229,7 +229,7 @@ strip_papi_prefix(const char *str)
 static void
 METHOD_FN(init)
 {
-	tool_enter();
+  tool_enter();
   // PAPI_set_debug(0x3ff);
 
   // **NOTE: some papi components may start threads, so
@@ -244,10 +244,10 @@ METHOD_FN(init)
     if (cidx) {
       int res = PAPI_disable_component(cidx);
       if (res == PAPI_OK) {
-	TMSG(PAPI, "PAPI cuda component disabled");
+  TMSG(PAPI, "PAPI cuda component disabled");
       }
       else {
-	EMSG("*** PAPI cuda component could not be disabled!!!");
+  EMSG("*** PAPI cuda component could not be disabled!!!");
       }
     }
   }
@@ -279,15 +279,15 @@ METHOD_FN(init)
       EMSG("warning: PAPI_set_domain(PAPI_DOM_ALL) failed: %d", ret);
     }
   }
-  
-	self->state = INIT;
+
+  self->state = INIT;
   tool_exit();
 }
 
 static void
 METHOD_FN(thread_init)
 {
-	tool_enter();
+  tool_enter();
   TMSG(PAPI, "thread init");
   if (papi_unavail) { goto finish; }
 
@@ -305,7 +305,7 @@ METHOD_FN(thread_init)
 static void
 METHOD_FN(thread_init_action)
 {
-	tool_enter();
+  tool_enter();
   TMSG(PAPI, "register thread");
   if (papi_unavail) { goto finish; }
 
@@ -317,13 +317,13 @@ METHOD_FN(thread_init_action)
   TMSG(PAPI, "register thread ok");
 
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 static void
 METHOD_FN(start)
 {
-	tool_enter();
+  tool_enter();
   int cidx;
   TMSG(PAPI, "start");
 
@@ -340,7 +340,7 @@ METHOD_FN(start)
 
   if (my_state == START) {
     TMSG(PAPI,"*NOTE* PAPI start called when already in state START");
-		goto finish;
+    goto finish;
   }
 
   // for each active component, start its event set
@@ -349,42 +349,42 @@ METHOD_FN(start)
     papi_component_info_t* ci = &(psi->component_info[cidx]);
     if (ci->inUse) {
       if (component_uses_sync_samples(cidx)) {
-	TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx);
-	ci->sync_start();
+  TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx);
+  ci->sync_start();
       }
       else {
-	TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx);
-	int ret = PAPI_start(ci->eventSet);
-	if (ret == PAPI_EISRUN) {
-	  // this case should not happen, but maybe it's not fatal
-	  EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx);
-	}
-	else if (ret != PAPI_OK) {
-	  EMSG("PAPI_start failed with %s (%d) for event set %d component %d ",
-	       PAPI_strerror(ret), ret, ci->eventSet, cidx);
-	  hpcrun_ssfail_start("PAPI");
-	}
-
-	if (ci->some_derived) {
-	  ret = PAPI_read(ci->eventSet, ci->prev_values);
-	  if (ret != PAPI_OK) {
-	    EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
-		 ci->eventSet, cidx, PAPI_strerror(ret), ret);
-	  }
-	}
+  TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx);
+  int ret = PAPI_start(ci->eventSet);
+  if (ret == PAPI_EISRUN) {
+    // this case should not happen, but maybe it's not fatal
+    EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx);
+  }
+  else if (ret != PAPI_OK) {
+    EMSG("PAPI_start failed with %s (%d) for event set %d component %d ",
+         PAPI_strerror(ret), ret, ci->eventSet, cidx);
+    hpcrun_ssfail_start("PAPI");
+  }
+
+  if (ci->some_derived) {
+    ret = PAPI_read(ci->eventSet, ci->prev_values);
+    if (ret != PAPI_OK) {
+      EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
+     ci->eventSet, cidx, PAPI_strerror(ret), ret);
+    }
+  }
       }
     }
   }
   td->ss_state[self->sel_idx] = START;
 
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 static void
 METHOD_FN(thread_fini_action)
 {
-	tool_enter();
+  tool_enter();
   TMSG(PAPI, "unregister thread");
   if (papi_unavail) { goto finish; }
 
@@ -393,16 +393,16 @@ METHOD_FN(thread_fini_action)
   snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval);
   TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg);
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 
 static void
 METHOD_FN(stop)
 {
-	tool_enter();
+  tool_enter();
 
-	int cidx;
+  int cidx;
 
   TMSG(PAPI, "stop");
   if (papi_unavail) { goto finish; }
@@ -426,18 +426,18 @@ METHOD_FN(stop)
     papi_component_info_t *ci = &(psi->component_info[cidx]);
     if (ci->inUse) {
       if (component_uses_sync_samples(cidx)) {
-	TMSG(PAPI, "component %d is synchronous, stop is trivial", cidx);
+  TMSG(PAPI, "component %d is synchronous, stop is trivial", cidx);
       }
       else {
-	TMSG(PAPI,"stop w event set = %d", ci->eventSet);
-	long_long values[nevents+2];
-	//	long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
-
-	int ret = PAPI_stop(ci->eventSet, values);
-	if (ret != PAPI_OK){
-	  EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
-	       ci->eventSet, ret, PAPI_strerror(ret));
-	}
+  TMSG(PAPI,"stop w event set = %d", ci->eventSet);
+  long_long values[nevents+2];
+  //  long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
+
+  int ret = PAPI_stop(ci->eventSet, values);
+  if (ret != PAPI_OK){
+    EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
+         ci->eventSet, ret, PAPI_strerror(ret));
+  }
 
       }
     }
@@ -445,26 +445,26 @@ METHOD_FN(stop)
 
   TD_GET(ss_state)[self->sel_idx] = STOP;
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 
 static void
 METHOD_FN(shutdown)
 {
-	tool_enter();
+  tool_enter();
   TMSG(PAPI, "shutdown");
   if (papi_unavail) { goto finish; }
 
   do{
-		METHOD_CALL(self, stop); // make sure stop has been called
-	}while(0);
+    METHOD_CALL(self, stop); // make sure stop has been called
+  }while(0);
   // FIXME: add component shutdown code here
   PAPI_shutdown();
 
   self->state = UNINIT;
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 // Return true if PAPI recognizes the name, whether supported or not.
@@ -472,8 +472,8 @@ METHOD_FN(shutdown)
 static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
-	tool_enter();
-	bool ret;
+  tool_enter();
+  bool ret;
   ev_str = strip_papi_prefix(ev_str);
 
   TMSG(PAPI, "supports event");
@@ -492,14 +492,14 @@ METHOD_FN(supports_event, const char *ev_str)
   ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK);
 
 finish:
-	tool_exit();
-	return ret;
+  tool_exit();
+  return ret;
 }
 
 static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
-	tool_enter();
+  tool_enter();
   TMSG(PAPI, "process event list");
   if (papi_unavail) { goto finish; }
 
@@ -522,7 +522,7 @@ METHOD_FN(process_event_list, int lush_metrics)
 #ifdef USE_PAPI_CHECKING
     if (! hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh, DEFAULT_THRESHOLD)) {
       AMSG("WARNING: %s using default threshold %ld, "
-	   "better to use an explicit threshold.", name, DEFAULT_THRESHOLD);
+     "better to use an explicit threshold.", name, DEFAULT_THRESHOLD);
     }
 #else
     hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh, DEFAULT_THRESHOLD);
@@ -530,8 +530,8 @@ METHOD_FN(process_event_list, int lush_metrics)
     ret = PAPI_event_name_to_code(name, &evcode);
     if (ret != PAPI_OK) {
       EMSG("unexpected failure in PAPI process_event_list(): "
-	   "PAPI_event_name_to_code() returned %s (%d)",
-	   PAPI_strerror(ret), ret);
+     "PAPI_event_name_to_code() returned %s (%d)",
+     PAPI_strerror(ret), ret);
       hpcrun_ssfail_unsupported("PAPI", name);
     }
     if (PAPI_query_event(evcode) != PAPI_OK) {
@@ -568,7 +568,7 @@ METHOD_FN(process_event_list, int lush_metrics)
     // supports hardware overflow.  use threshold = 0 to force proxy
     // sampling (for testing).
     if (event_is_derived(self->evl.events[i].event)
-	|| self->evl.events[i].thresh == 0) {
+  || self->evl.events[i].thresh == 0) {
       TMSG(PAPI, "using proxy sampling for event %s", buffer);
       strcat(buffer, " (proxy)");
       self->evl.events[i].thresh = 1;
@@ -592,17 +592,17 @@ METHOD_FN(process_event_list, int lush_metrics)
       TMSG(PAPI, "Event %s from synchronous component", buffer);
     int metric_id = /* weight */
       hpcrun_set_new_metric_info_and_period(papi_kind, strdup(buffer),
-					    MetricFlags_ValFmt_Int,
-					    threshold, prop);
+              MetricFlags_ValFmt_Int,
+              threshold, prop);
     METHOD_CALL(self, store_metric_id, i, metric_id);
 
     // FIXME:LUSH: need a more flexible metric interface
     if (num_lush_metrics > 0 && strcmp(buffer, "PAPI_TOT_CYC") == 0) {
       // there should be one lush metric; its source is the last event
       int mid_idleness =
-	hpcrun_set_new_metric_info_and_period(papi_kind, "idleness",
-					      MetricFlags_ValFmt_Real,
-					      self->evl.events[i].thresh, prop);
+  hpcrun_set_new_metric_info_and_period(papi_kind, "idleness",
+                MetricFlags_ValFmt_Real,
+                self->evl.events[i].thresh, prop);
       assert(num_lush_metrics == 1 && (i == (nevents - 1)));
       lush_agents->metric_time = metric_id;
       lush_agents->metric_idleness = mid_idleness;
@@ -616,7 +616,7 @@ METHOD_FN(process_event_list, int lush_metrics)
   }
 
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 static void
@@ -627,7 +627,7 @@ METHOD_FN(finalize_event_list)
 static void
 METHOD_FN(gen_event_set, int lush_metrics)
 {
-	tool_enter();
+  tool_enter();
   thread_data_t *td = hpcrun_get_thread_data();
   int i;
   int ret;
@@ -637,7 +637,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
 
   int num_components = PAPI_num_components();
   int ss_info_size = sizeof(papi_source_info_t) +
-	num_components * sizeof(papi_component_info_t);
+  num_components * sizeof(papi_component_info_t);
 
   TMSG(PAPI, "Num components = %d", num_components);
   papi_source_info_t* psi = hpcrun_malloc(ss_info_size);
@@ -680,12 +680,12 @@ METHOD_FN(gen_event_set, int lush_metrics)
       char buffer[PAPI_MAX_STR_LEN];
       PAPI_event_code_to_name(evcode, buffer);
       TMSG(PAPI,
-	   "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d",
-	   /* eventSet, */ evcode, buffer, cidx);
+     "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d",
+     /* eventSet, */ evcode, buffer, cidx);
     }
     if (ret != PAPI_OK) {
       EMSG("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
-	   PAPI_strerror(ret), ret);
+     PAPI_strerror(ret), ret);
       event_fatal_error(evcode, ret);
     }
   }
@@ -716,43 +716,43 @@ METHOD_FN(gen_event_set, int lush_metrics)
     // ***** Only set overflow if NOT derived event *****
     if (! derived[i]) {
       ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE,
-			  papi_event_handler);
+        papi_event_handler);
       TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d",
-	   eventSet, evcode, thresh, ret);
+     eventSet, evcode, thresh, ret);
       if (ret != PAPI_OK) {
-	EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
-	     PAPI_strerror(ret), ret);
-	event_fatal_error(evcode, ret);
+  EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
+       PAPI_strerror(ret), ret);
+  event_fatal_error(evcode, ret);
       }
     }
   }
 
   /// Register papi handler callbacks
   gpu_monitor_enter.reg_info = psi;
-	gpu_monitor_enter.fn = papi_monitor_enter;
-	gpu_monitor_register(gpu_monitor_type_enter,
-														&gpu_monitor_enter);
+  gpu_monitor_enter.fn = papi_monitor_enter;
+  gpu_monitor_register(gpu_monitor_type_enter,
+                            &gpu_monitor_enter);
 
-	gpu_monitor_exit.reg_info = psi;
-	gpu_monitor_exit.fn = papi_monitor_exit;
-	gpu_monitor_register(gpu_monitor_type_exit,
-														&gpu_monitor_exit);
+  gpu_monitor_exit.reg_info = psi;
+  gpu_monitor_exit.fn = papi_monitor_exit;
+  gpu_monitor_register(gpu_monitor_type_exit,
+                            &gpu_monitor_exit);
 
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 static void
 METHOD_FN(display_events)
 {
-	tool_enter();
+  tool_enter();
   PAPI_event_info_t info;
   int ev, ret, num_total, num_prof;
   int num_components, cidx;
 
   if (papi_unavail) {
     printf("PAPI is not available.  Probably, the kernel doesn't support PAPI,\n"
-	   "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
+     "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
     goto finish;
   }
 
@@ -773,14 +773,14 @@ METHOD_FN(display_events)
       char *prof;
       memset(&info, 0, sizeof(info));
       if (PAPI_get_event_info(ev, &info) == PAPI_OK && info.count != 0) {
-	if (event_is_derived(ev)) {
-	  prof = "No";
-	} else {
-	  prof = "Yes";
-	  num_prof++;
-	}
-	num_total++;
-	printf("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr);
+  if (event_is_derived(ev)) {
+    prof = "No";
+  } else {
+    prof = "Yes";
+    num_prof++;
+  }
+  num_total++;
+  printf("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr);
       }
       ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx);
     }
@@ -807,7 +807,7 @@ METHOD_FN(display_events)
     while (ret == PAPI_OK) {
       memset(&info, 0, sizeof(info));
       if (PAPI_get_event_info(ev, &info) == PAPI_OK) {
-	cmp_event_count++;
+  cmp_event_count++;
         display_event_info(stdout, info.symbol, info.long_descr);
         printf("---------------------------------------------------------------------------\n");
       }
@@ -821,7 +821,7 @@ METHOD_FN(display_events)
   printf( "Total events reported: %d\n", num_total);
   printf("\n\n");
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 
@@ -841,7 +841,7 @@ METHOD_FN(display_events)
 void
 hpcrun_disable_papi_cuda(void)
 {
-	tool_enter();
+  tool_enter();
   disable_papi_cuda = true;
   tool_exit();
 }
@@ -855,8 +855,8 @@ hpcrun_disable_papi_cuda(void)
 static int
 event_is_derived(int ev_code)
 {
-	tool_enter();
-	int ret;
+  tool_enter();
+  int ret;
   PAPI_event_info_t info;
 
   // "Is derived" is kind of a bad thing, so if any unexpected failure
@@ -876,14 +876,14 @@ event_is_derived(int ev_code)
   ret = 1;
 
 finish:
-	tool_exit();
-	return ret;
+  tool_exit();
+  return ret;
 }
 
 static void
 event_fatal_error(int ev_code, int papi_ret)
 {
-	tool_enter();
+  tool_enter();
   char name[1024];
 
   PAPI_event_code_to_name(ev_code, name);
@@ -905,7 +905,7 @@ static void
 papi_event_handler(int event_set, void *pc, long long ovec,
                    void *context)
 {
-	tool_enter();
+  tool_enter();
   sample_source_t *self = &obj_name();
   long long values[MAX_EVENTS];
   int my_events[MAX_EVENTS];
@@ -921,8 +921,8 @@ papi_event_handler(int event_set, void *pc, long long ovec,
 
   if (!ovec) {
     TMSG(PAPI_SAMPLE, "papi overflow event: event set %d ovec = %ld",
-	 event_set, ovec);
-		goto finish;
+   event_set, ovec);
+    goto finish;
   }
 
   // If the interrupt came from inside our code, then drop the sample
@@ -945,20 +945,20 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   }
 
   ret = PAPI_get_overflow_event_index(event_set, ovec, my_events,
-				      &my_event_count);
+              &my_event_count);
   if (ret != PAPI_OK) {
     TMSG(PAPI_SAMPLE, "papi_event_handler: event set %d ovec %ld "
-	 "get_overflow_event_index return code = %d ==> %s",
-	 event_set, ovec, ret, PAPI_strerror(ret));
+   "get_overflow_event_index return code = %d ==> %s",
+   event_set, ovec, ret, PAPI_strerror(ret));
 #ifdef DEBUG_PAPI_OVERFLOW
     ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
     if (ret != PAPI_OK) {
       TMSG(PAPI_SAMPLE, "PAPI_list_events failed inside papi_event_handler."
-	   "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+     "Return code = %d ==> %s", ret, PAPI_strerror(ret));
     } else {
       for (i = 0; i < my_event_codes_count; i++) {
         TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n",
-	     event_set, i, my_event_codes[i]);
+       event_set, i, my_event_codes[i]);
       }
     }
     TMSG(PAPI_SAMPLE, "get_overflow_event_index failure in papi_event_handler");
@@ -968,7 +968,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
   if (ret != PAPI_OK) {
     hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
-		 "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+     "Return code = %d ==> %s", ret, PAPI_strerror(ret));
   }
 
   for (i = 0; i < my_event_count; i++) {
@@ -976,8 +976,8 @@ papi_event_handler(int event_set, void *pc, long long ovec,
     // This means lush's 'time' metric should be *last*
 
     TMSG(PAPI_SAMPLE,"handling papi overflow event: "
-	"event set %d event index = %d event code = 0x%x",
-	event_set, my_events[i], my_event_codes[my_events[i]]);
+  "event set %d event index = %d event code = 0x%x",
+  event_set, my_events[i], my_event_codes[my_events[i]]);
 
     int event_index = get_event_index(self, my_event_codes[my_events[i]]);
 
@@ -995,8 +995,8 @@ papi_event_handler(int event_set, void *pc, long long ovec,
     }
 
     sample_val_t sv = hpcrun_sample_callpath(context, metric_id,
-			(hpcrun_metricVal_t) {.i=metricIncrement},
-			0/*skipInner*/, 0/*isSync*/, NULL);
+      (hpcrun_metricVal_t) {.i=metricIncrement},
+      0/*skipInner*/, 0/*isSync*/, NULL);
 
     blame_shift_apply(metric_id, sv.sample_node, 1 /*metricIncr*/);
   }
@@ -1009,9 +1009,9 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   if (ci->some_derived) {
     for (i = 0; i < nevents; i++) {
       if (derived[i]) {
-	      hpcrun_sample_callpath(context, hpcrun_event2metric(self, i),
-			(hpcrun_metricVal_t) {.i=values[i] - ci->prev_values[i]},
-			0, 0, NULL);
+        hpcrun_sample_callpath(context, hpcrun_event2metric(self, i),
+      (hpcrun_metricVal_t) {.i=values[i] - ci->prev_values[i]},
+      0, 0, NULL);
       }
     }
 
@@ -1022,7 +1022,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   }
 
 finish:
-	tool_exit();
+  tool_exit();
   hpcrun_safe_exit();
 }
 
@@ -1033,97 +1033,97 @@ static __thread long long prev_values[MAX_EVENTS];
 static void
 papi_monitor_enter(void *reg_info, void *args_in)
 {
-	tool_enter();
-	papi_source_info_t *psi = (papi_source_info_t *) reg_info;
-	gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
+  tool_enter();
+  papi_source_info_t *psi = (papi_source_info_t *) reg_info;
+  gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
 
-	sample_source_t *self = &obj_name(); /// just for debug
-	int ret;
+  sample_source_t *self = &obj_name(); /// just for debug
+  int ret;
 
-	printf("|------->PAPI_MONITOR_ENTER | running? %d\n", METHOD_CALL(self, started));
+  printf("|------->PAPI_MONITOR_ENTER | running? %d\n", METHOD_CALL(self, started));
 
-	// if sampling disabled explicitly for this thread, skip all processing
-	if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
+  // if sampling disabled explicitly for this thread, skip all processing
+  if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-	cct_node = args->cct_node;
+  cct_node = args->cct_node;
 
   if (args->gpu_type == amd)
     hipDeviceSynchronize();
 
-	// Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
-	for (int cid = 0; cid < psi->num_components; ++cid) {
-		papi_component_info_t *ci = &(psi->component_info[cid]);
-		if (ci->inUse) {
-			printf("Self = %p | Component %d \t | cct = %p | gpu = %s\n\n", self, cid, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
+  // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
+  for (int cid = 0; cid < psi->num_components; ++cid) {
+    papi_component_info_t *ci = &(psi->component_info[cid]);
+    if (ci->inUse) {
+      printf("Self = %p | Component %d \t | cct = %p | gpu = %s\n\n", self, cid, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
 
-			ret = PAPI_read(ci->eventSet, prev_values);
-			//			ret = PAPI_start(ci->eventSet);
+      ret = PAPI_read(ci->eventSet, prev_values);
+      //      ret = PAPI_start(ci->eventSet);
 
-			if (ret != PAPI_OK) {
-				EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
-						 ci->eventSet, cid, PAPI_strerror(ret), ret);
-			}
+      if (ret != PAPI_OK) {
+        EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
+             ci->eventSet, cid, PAPI_strerror(ret), ret);
+      }
 
-		}
-	}
+    }
+  }
 
 finish:
-	tool_exit();
+  tool_exit();
 }
 
 static void
 papi_monitor_exit(void *reg_info, void *args_in)
 {
-	tool_enter();
-	papi_source_info_t *psi = (papi_source_info_t *) reg_info;
-	gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
+  tool_enter();
+  papi_source_info_t *psi = (papi_source_info_t *) reg_info;
+  gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
 
-	sample_source_t *self = &obj_name(); /// just for debug
-	int my_event_codes[MAX_EVENTS];
-	long long my_event_values[MAX_EVENTS];
-	int my_event_count = MAX_EVENTS;
-	int ret;
+  sample_source_t *self = &obj_name(); /// just for debug
+  int my_event_codes[MAX_EVENTS];
+  long long my_event_values[MAX_EVENTS];
+  int my_event_count = MAX_EVENTS;
+  int ret;
 
-	printf("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
+  printf("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
 
-	if (args->gpu_type == amd)
+  if (args->gpu_type == amd)
     hipDeviceSynchronize();
 
-	// if sampling disabled explicitly for this thread, skip all processing
-	if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
+  // if sampling disabled explicitly for this thread, skip all processing
+  if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-	// Collect counters for components in use
-	for (int cid = 0; cid < psi->num_components; ++cid) {
-		papi_component_info_t *ci = &(psi->component_info[cid]);
-		if (ci->inUse){
-			ret = PAPI_read(ci->eventSet, my_event_values);
+  // Collect counters for components in use
+  for (int cid = 0; cid < psi->num_components; ++cid) {
+    papi_component_info_t *ci = &(psi->component_info[cid]);
+    if (ci->inUse){
+      ret = PAPI_read(ci->eventSet, my_event_values);
 
-			if (ret != PAPI_OK) {
-				EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
-						 ci->eventSet, cid, PAPI_strerror(ret), ret);
-			}
+      if (ret != PAPI_OK) {
+        EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
+             ci->eventSet, cid, PAPI_strerror(ret), ret);
+      }
 
-			// Attribute collected metric to cct nodes
-			ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
-			if (ret != PAPI_OK) {
-				hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
-										 "Return code = %d ==> %s", ret, PAPI_strerror(ret));
-			}
+      // Attribute collected metric to cct nodes
+      ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
+      if (ret != PAPI_OK) {
+        hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
+                     "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+      }
 
-			for (int eid = 0; eid < my_event_count; ++eid) {
-				int event_index = get_event_index(self, my_event_codes[eid]);
-				int metric_id = hpcrun_event2metric(self, event_index);
+      for (int eid = 0; eid < my_event_count; ++eid) {
+        int event_index = get_event_index(self, my_event_codes[eid]);
+        int metric_id = hpcrun_event2metric(self, event_index);
 
-				printf("%d Event = %x, event_index = %d, metric_id = %d || value = %llu ---> %llu\n",
-							 eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
+        printf("%d Event = %x, event_index = %d, metric_id = %d || value = %llu ---> %llu\n",
+               eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
 
-				blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
-			}
+        blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
+      }
 
-		}
-	}
-	cct_node = NULL;
+    }
+  }
+  cct_node = NULL;
 
 finish:
-	tool_exit();
+  tool_exit();
 }
\ No newline at end of file

From 7c889f1108a346708c0e3d172a67c955a16d7820 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 17 Aug 2020 08:21:24 -0500
Subject: [PATCH 015/177] unwind problem solved, papi gets gpu_sync with
 pointer

---
 src/tool/hpcrun/Makefile.am             |   3 +-
 src/tool/hpcrun/Makefile.in             |  26 ++-
 src/tool/hpcrun/gpu-monitors.c          |   8 -
 src/tool/hpcrun/gpu-monitors.h          |  11 +-
 src/tool/hpcrun/gpu/amd/hip-api.c       | 252 ++++++++++++++++++++++++
 src/tool/hpcrun/gpu/amd/hip-api.h       | 111 +++++++++++
 src/tool/hpcrun/gpu/amd/roctracer-api.c |  15 +-
 src/tool/hpcrun/gpu/nvidia/cuda-api.c   |  39 ++--
 src/tool/hpcrun/gpu/nvidia/cupti-api.c  |   8 +-
 src/tool/hpcrun/main.c                  |  14 +-
 src/tool/hpcrun/metrics.c               |   2 +
 src/tool/hpcrun/sample-sources/amd.c    |   9 +
 src/tool/hpcrun/sample-sources/nvidia.c |  20 +-
 src/tool/hpcrun/sample-sources/papi-c.c |  40 +++-
 src/tool/hpcrun/sample_event.c          |   2 +-
 15 files changed, 481 insertions(+), 79 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/amd/hip-api.c
 create mode 100644 src/tool/hpcrun/gpu/amd/hip-api.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index bd0a0be241..25eec2011d 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -508,7 +508,8 @@ if OPT_ENABLE_ROCM
 MY_ROCM_FILES=\
 	sample-sources/amd.c \
 	gpu/amd/roctracer-activity-translate.c \
-	gpu/amd/roctracer-api.c 	
+	gpu/amd/roctracer-api.c 	\
+	gpu/amd/hip-api.c
 endif
 
 MY_UPC_FILES = sample-sources/upc.c
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 32a37ea9c3..6838a7d4d3 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -507,11 +507,11 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
 	sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \
-	gpu/amd/roctracer-api.c unwind/common/backtrace.c \
-	unwind/common/unw-throw.c unwind/common/binarytree_uwi.c \
-	unwind/common/interval_t.c unwind/common/libunw_intervals.c \
-	unwind/common/stack_troll.c unwind/common/uw_hash.c \
-	unwind/common/uw_recipe_map.c \
+	gpu/amd/roctracer-api.c gpu/amd/hip-api.c \
+	unwind/common/backtrace.c unwind/common/unw-throw.c \
+	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
+	unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
+	unwind/common/uw_hash.c unwind/common/uw_recipe_map.c \
 	unwind/generic-libunwind/libunw-unwind.c \
 	unwind/ppc64/ppc64-unwind.c \
 	unwind/ppc64/ppc64-unwind-interval.c \
@@ -694,7 +694,8 @@ am__objects_35 =
 @OPT_ENABLE_ROCM_TRUE@am__objects_36 =  \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-hip-api.lo
 @OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
 am__objects_38 = unwind/common/libhpcrun_la-backtrace.lo \
 	unwind/common/libhpcrun_la-unw-throw.lo
@@ -1792,7 +1793,8 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c 	
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c 	\
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/hip-api.c
 
 MY_UPC_FILES = sample-sources/upc.c
 MY_INCLUDE_DIRS = \
@@ -2654,6 +2656,8 @@ gpu/amd/libhpcrun_la-roctracer-activity-translate.lo:  \
 	gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/$(am__dirstamp) \
 	gpu/amd/$(DEPDIR)/$(am__dirstamp)
+gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/$(am__dirstamp) \
+	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_la-backtrace.lo:  \
 	unwind/common/$(am__dirstamp) \
 	unwind/common/$(DEPDIR)/$(am__dirstamp)
@@ -3533,6 +3537,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-channel.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_la-cubin-hash-map.Plo@am__quote@
@@ -5103,6 +5108,13 @@ gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/roctracer-api.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-roctracer-api.lo `test -f 'gpu/amd/roctracer-api.c' || echo '$(srcdir)/'`gpu/amd/roctracer-api.c
 
+gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/hip-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-hip-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/amd/hip-api.c' object='gpu/amd/libhpcrun_la-hip-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
+
 unwind/common/libhpcrun_la-backtrace.lo: unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT unwind/common/libhpcrun_la-backtrace.lo -MD -MP -MF unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo -c -o unwind/common/libhpcrun_la-backtrace.lo `test -f 'unwind/common/backtrace.c' || echo '$(srcdir)/'`unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Plo
diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
index 13ca174a4b..3ffd5e8d55 100644
--- a/src/tool/hpcrun/gpu-monitors.c
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -5,8 +5,6 @@
 #include "gpu-monitors.h"
 
 static gpu_monitor_fn_entry_t *kinds[2] = {0, 0};
-static const char *gpu_name[] = {"unknown", "nvidia", "amd", "intel"};
-
 
 void
 gpu_monitor_register(gpu_monitor_type_t type, gpu_monitor_fn_entry_t *entry)
@@ -25,10 +23,4 @@ gpu_monitors_apply(void *args_in, gpu_monitor_type_t type)
 		fn->fn(fn->reg_info, args_in);
 		fn = fn->next;
 	}
-}
-
-char *
-gpu_monitors_get_gpu_name(gpu_type_t t)
-{
-	return gpu_name[t];
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
index 42454f3e0c..15a63542d1 100644
--- a/src/tool/hpcrun/gpu-monitors.h
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -14,17 +14,10 @@ typedef enum {
 	gpu_monitor_type_exit
 } gpu_monitor_type_t;
 
-typedef enum {
-	unknown,
-	nvidia,
-	amd,
-	intel
-} gpu_type_t;
-
 
 typedef struct gpu_monitors_apply_t {
 	cct_node_t *cct_node;
-	gpu_type_t gpu_type;
+  int (*gpu_sync_ptr)(void);
 } gpu_monitors_apply_t;
 
 
@@ -37,6 +30,6 @@ typedef struct gpu_monitor_fn_entry_t {
 
 extern void gpu_monitor_register(gpu_monitor_type_t type, gpu_monitor_fn_entry_t* entry);
 extern void gpu_monitors_apply(void *args, gpu_monitor_type_t type);
-char * gpu_monitors_get_gpu_name(gpu_type_t t);
+
 
 #endif //HPCTOOLKIT_GPU_MONITORS_H
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.c b/src/tool/hpcrun/gpu/amd/hip-api.c
new file mode 100644
index 0000000000..f7ac18a832
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/hip-api.c
@@ -0,0 +1,252 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//***************************************************************************
+//
+// File:
+//   hip-api.c
+//
+// Purpose:
+//   wrapper around AMD HIP layer
+//
+//***************************************************************************
+
+
+//*****************************************************************************
+// system include files
+//*****************************************************************************
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <string.h>    // memset
+
+#include <roctracer_hip.h>
+
+
+
+//*****************************************************************************
+// local include files
+//*****************************************************************************
+
+#include <hpcrun/sample-sources/libdl.h>
+#include <hpcrun/messages/messages.h>
+
+#include "hip-api.h"
+
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define HIP_FN_NAME(f) DYN_FN_NAME(f)
+
+#define HIP_FN(fn, args) \
+  static hipError_t (*HIP_FN_NAME(fn)) args
+
+#define HPCRUN_HIP_API_CALL(fn, args)                              \
+{                                                                   \
+  hipError_t error_result = HIP_FN_NAME(fn) args;		    \
+  if (error_result != hipSuccess) {				    \
+    ETMSG(CUDA, "hip api %s returned %d", #fn, (int) error_result);    \
+    exit(-1);							    \
+  }								    \
+}
+
+#define FORALL_HIP_ROUTINES(macro)             \
+  macro(hipDeviceSynchronize)                  \
+  macro(hipDeviceGetAttribute)                 \
+  macro(hipCtxGetCurrent)
+
+//******************************************************************************
+// static data
+//******************************************************************************
+
+#ifndef HPCRUN_STATIC_LINK  // the function pointers below are only declared for non-static builds
+HIP_FN  // expands to: static hipError_t (*HIP_FN_NAME(fn)) args
+(
+ hipDeviceSynchronize,
+( void )
+);
+
+HIP_FN  // pointer used by hip_device_property_query
+(
+ hipDeviceGetAttribute,
+ (
+ int *pi,
+ hipDeviceAttribute_t attrib,
+ int dev
+ )
+);
+
+HIP_FN  // pointer used by hip_context
+(
+ hipCtxGetCurrent,
+ (
+ hipCtx_t *ctx
+ )
+);
+
+#endif
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+//TODO: Copied from cuda-api.c - check if works for hip
+#ifndef HPCRUN_STATIC_LINK
+static int
+hip_device_sm_blocks_query  // maps a device's major version to a block count
+(
+ int major,
+ int minor  // currently unused: only 'major' selects the result
+)
+{  // NOTE(review): table was copied from cuda-api.c; confirm the values hold for AMD devices
+  switch(major) {
+    case 7:
+    case 6:
+      return 32;  // major versions 6 and 7 share the same value
+    default:
+      // TODO(Keren): add more devices
+      return 8;   // fallback for every other major version
+  }
+}
+#endif
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+int
+hip_bind
+(
+ void
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+  // dynamic libraries only available in non-static case
+  CHK_DLOPEN(hip, "libhip_hcc.so", RTLD_NOW | RTLD_GLOBAL);
+
+#define HIP_BIND(fn) \
+  CHK_DLSYM(hip, fn);
+
+  FORALL_HIP_ROUTINES(HIP_BIND)  // one CHK_DLSYM per routine in the list
+#undef HIP_BIND  // keep the helper macro local to this function
+
+  return 0;   // reached only when every CHK_* step above fell through
+#else
+  return -1;  // static link: nothing can be bound at run time
+#endif // ! HPCRUN_STATIC_LINK
+}
+
+int
+hip_context
+(
+ hipCtx_t *ctx  // passed through unchanged to hipCtxGetCurrent
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+  HPCRUN_HIP_API_CALL(hipCtxGetCurrent, (ctx));  // macro logs and calls exit(-1) on a non-hipSuccess result
+  return 0;
+#else
+  return -1;  // static link: the HIP function pointers are not declared
+#endif
+}
+
+int
+hip_device_property_query
+(
+ int device_id,
+ hip_device_property_t *property
+)
+{  // fills every field of *property for device_id; returns 0, or -1 when statically linked
+#ifndef HPCRUN_STATIC_LINK  // each call below logs and exits the process if HIP reports an error
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&property->sm_count, hipDeviceAttributeMultiprocessorCount, device_id));
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&property->sm_clock_rate, hipDeviceAttributeClockRate, device_id));
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&property->sm_shared_memory,
+                       hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, device_id));
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&property->sm_registers,
+                       hipDeviceAttributeMaxRegistersPerBlock, device_id));// NOTE(review): per-block limit stored here; the CUDA counterpart presumably used CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR - confirm
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&property->sm_threads, hipDeviceAttributeMaxThreadsPerMultiProcessor,
+                       device_id));
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&property->num_threads_per_warp, hipDeviceAttributeWarpSize,
+                       device_id));
+
+  int major = 0, minor = 0;  // only used to derive sm_blocks below
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&major, hipDeviceAttributeComputeCapabilityMajor, device_id));
+
+  HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+                       (&minor, hipDeviceAttributeComputeCapabilityMinor, device_id));
+
+  property->sm_blocks = hip_device_sm_blocks_query(major, minor);  // table lookup, not a device query
+
+  return 0;
+#else
+  return -1;  // static link: *property is left untouched
+#endif
+}
+
+int
+hip_dev_sync
+(void)  // explicit prototype; matches the int (*)(void) pointer type callers store it in
+{
+#ifndef HPCRUN_STATIC_LINK
+  HPCRUN_HIP_API_CALL(hipDeviceSynchronize, () );  // macro logs and calls exit(-1) on failure
+  return 0;
+#else
+  return -1;  // static link: no HIP function pointer to call
+#endif
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.h b/src/tool/hpcrun/gpu/amd/hip-api.h
new file mode 100644
index 0000000000..459499e638
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/hip-api.h
@@ -0,0 +1,111 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//***************************************************************************
+//
+// File:
+//   hip-api.h
+//
+// Purpose:
+//   interface definitions for wrapper around AMD HIP layer
+//
+//***************************************************************************
+
+#ifndef hip_api_h
+#define hip_api_h
+
+
+
+//*****************************************************************************
+// amd includes
+//*****************************************************************************
+
+#include <roctracer_hip.h>
+
+
+
+//*****************************************************************************
+// type definitions
+//*****************************************************************************
+
+typedef struct hip_device_property {  // filled by hip_device_property_query
+ int sm_count;              // from hipDeviceAttributeMultiprocessorCount
+ int sm_clock_rate;         // from hipDeviceAttributeClockRate
+ int sm_shared_memory;      // from hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
+ int sm_registers;          // from hipDeviceAttributeMaxRegistersPerBlock
+ int sm_threads;            // from hipDeviceAttributeMaxThreadsPerMultiProcessor
+ int sm_blocks;             // derived from the device's major version by a lookup table
+ int num_threads_per_warp;  // from hipDeviceAttributeWarpSize
+} hip_device_property_t;
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+// returns 0 on success
+int
+hip_bind
+(
+ void
+);
+
+// returns 0 on success
+int
+hip_context
+(
+ hipCtx_t *ctx
+);
+
+// returns 0 on success
+int
+hip_device_property_query
+(
+ int device_id,
+ hip_device_property_t *property
+);
+
+int
+hip_dev_sync (void);  // returns 0 on success
+
+#endif //hip_api_h
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 1d9751ba5f..29ea75ab65 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -47,6 +47,7 @@
 
 #include "roctracer-api.h"
 #include "roctracer-activity-translate.h"
+#include "hip-api.h"
 
 #include <roctracer_hip.h>
 
@@ -101,7 +102,6 @@
 //******************************************************************************
 // local variables
 //******************************************************************************
-static __thread cct_node_t *cupti_kernel_ph = NULL;
 
 //----------------------------------------------------------
 // roctracer function pointers for late binding
@@ -372,15 +372,16 @@ roctracer_subscriber_callback
     // Generate notification entry
     uint64_t cpu_submit_time = hpcrun_nanotime();
 
-    cupti_kernel_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel); //dejan: added
-
-    printf("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd);
-    gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=cupti_kernel_ph, .gpu_type=amd}, gpu_monitor_type_enter);
+    printf("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p \n", api_node);
+    int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
+    gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=api_node, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_enter);
 
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
-    printf("\nACTIVITY_API_PHASE_EXIT -----------------| cct = %p | gpu = %d\n", cupti_kernel_ph, amd );
-    cupti_kernel_ph = NULL;
+    printf("\nACTIVITY_API_PHASE_EXIT -----------------| \n");
+    int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
+    gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=NULL, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_exit);
+
   }else{
     ;
   }
diff --git a/src/tool/hpcrun/gpu/nvidia/cuda-api.c b/src/tool/hpcrun/gpu/nvidia/cuda-api.c
index 1c20b6dda8..0b7f3815da 100644
--- a/src/tool/hpcrun/gpu/nvidia/cuda-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cuda-api.c
@@ -125,26 +125,6 @@ CUDA_FN
 // private operations
 //******************************************************************************
 
-int
-cuda_bind
-(
-  void
-)
-{
-#ifndef HPCRUN_STATIC_LINK
-  // dynamic libraries only availabile in non-static case
-  CHK_DLOPEN(cuda, "libcuda.so", RTLD_NOW | RTLD_GLOBAL);
-
-  CHK_DLSYM(cuda, cuDeviceGetAttribute);
-  CHK_DLSYM(cuda, cuCtxGetCurrent);
-
-  return 0;
-#else
-  return -1;
-#endif // ! HPCRUN_STATIC_LINK
-}
-
-
 #ifndef HPCRUN_STATIC_LINK
 static int
 cuda_device_sm_blocks_query
@@ -169,6 +149,25 @@ cuda_device_sm_blocks_query
 // interface operations
 //******************************************************************************
 
+int
+cuda_bind
+(
+ void
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+  // dynamic libraries only available in non-static case
+  CHK_DLOPEN(cuda, "libcuda.so", RTLD_NOW | RTLD_GLOBAL);
+
+  CHK_DLSYM(cuda, cuDeviceGetAttribute);
+  CHK_DLSYM(cuda, cuCtxGetCurrent);
+
+  return 0;   // reached only when every CHK_* step above fell through
+#else
+  return -1;  // static link: nothing can be bound at run time
+#endif // ! HPCRUN_STATIC_LINK
+}
+
 int
 cuda_context
 (
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 5852a588b3..367c763e96 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -982,7 +982,7 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-//				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=api_node,.gpu_type=nvidia}, gpu_monitor_type_enter);
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=api_node,.gpu_sync_ptr=NULL}, gpu_monitor_type_enter);
 
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
@@ -994,7 +994,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-//				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=NULL,.gpu_type=nvidia}, gpu_monitor_type_exit);
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=NULL,.gpu_sync_ptr=NULL}, gpu_monitor_type_exit);
 
 
 			}
@@ -1141,7 +1141,7 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=nvidia}, gpu_monitor_type_enter);
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_sync_ptr=NULL}, gpu_monitor_type_enter);
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
@@ -1155,7 +1155,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Runtime pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_type=nvidia}, gpu_monitor_type_exit);
+				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_sync_ptr=NULL}, gpu_monitor_type_exit);
 
 				cupti_kernel_ph = NULL;
       }
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index a0679949e5..cfce1bb0cc 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -474,9 +474,9 @@ hpcrun_init_internal(bool is_child)
   hpcrun_mmap_init();
   hpcrun_thread_data_init(0, NULL, is_child, hpcrun_get_num_sample_sources());
 
-  // must initialize unwind recipe map before initializing fnbounds
-  // because mapping of load modules affects the recipe map.
-  hpcrun_unw_init();
+//  // must initialize unwind recipe map before initializing fnbounds
+//  // because mapping of load modules affects the recipe map.
+//  hpcrun_unw_init();
 
 //  // init callbacks for each device
 //  hpcrun_initializer_init();
@@ -551,7 +551,7 @@ hpcrun_init_internal(bool is_child)
   //
 
   if (! is_child) {
-  	SAMPLE_SOURCES(process_event_list, lush_metrics);
+    SAMPLE_SOURCES(process_event_list, lush_metrics);
     SAMPLE_SOURCES(finalize_event_list);
     hpcrun_metrics_data_finalize();
   }
@@ -937,6 +937,10 @@ monitor_init_process(int *argc, char **argv, void* data)
   }
   messages_logfile_create();
 
+  // must initialize unwind recipe map before initializing fnbounds
+  // because mapping of load modules affects the recipe map.
+  hpcrun_unw_init();
+
   // We need to save vdso before initializing fnbounds this
   // is because fnbounds_init will iterate over the load map 
   // and will invoke analysis on vdso
@@ -946,7 +950,7 @@ monitor_init_process(int *argc, char **argv, void* data)
   hpcrun_initializer_init();
 
   // fnbounds must be after module_ignore_map
-	fnbounds_init();
+  fnbounds_init();
 
   hpcrun_registered_sources_init();
 
diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c
index 7e903aec78..035fd58947 100644
--- a/src/tool/hpcrun/metrics.c
+++ b/src/tool/hpcrun/metrics.c
@@ -365,6 +365,7 @@ hpcrun_set_new_metric_desc(kind_info_t *kind, const char* name,
   metric_desc_list_t* n = NULL;
 
   // if there are pre-allocated metrics, use them
+  // (default metrics - not alloc, added metrics - prealloc)
   if (pre_alloc) {
     n = pre_alloc;
     pre_alloc = pre_alloc->next;
@@ -372,6 +373,7 @@ hpcrun_set_new_metric_desc(kind_info_t *kind, const char* name,
   else {
     n = (metric_desc_list_t*) hpcrun_malloc(sizeof(metric_desc_list_t));
   }
+  // Add n into the list of metric description - kind->metric_data
   n->next = kind->metric_data;
   kind->metric_data = n;
   n->proc = upd_fn;
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index a5d8a492a0..429aaa7cbd 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -41,6 +41,7 @@
 #include <hpcrun/control-knob.h>
 #include <hpcrun/device-finalizers.h>
 #include <hpcrun/gpu/amd/roctracer-api.h>
+#include <hpcrun/gpu/amd/hip-api.h>
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/hpcrun_options.h>
@@ -143,6 +144,14 @@ METHOD_FN(process_event_list, int lush_metrics)
     int nevents = (self->evl).nevents;
     gpu_metrics_default_enable();
     TMSG(CUDA,"nevents = %d", nevents);
+
+
+#ifndef HPCRUN_STATIC_LINK
+  if (hip_bind()) {
+    EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror());
+    monitor_real_exit(-1);
+  }
+#endif
 }
 
 static void
diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c
index 9de7326c43..2e8a3d9baa 100644
--- a/src/tool/hpcrun/sample-sources/nvidia.c
+++ b/src/tool/hpcrun/sample-sources/nvidia.c
@@ -359,17 +359,17 @@ METHOD_FN(process_event_list, int lush_metrics)
   gpu_metrics_default_enable();
   gpu_metrics_KINFO_enable();
 
-#ifndef HPCRUN_STATIC_LINK
-  if (cuda_bind()) {
-    EEMSG("hpcrun: unable to bind to NVIDIA CUDA library %s\n", dlerror());
-    monitor_real_exit(-1);
-  }
+  #ifndef HPCRUN_STATIC_LINK
+    if (cuda_bind()) {
+      EEMSG("hpcrun: unable to bind to NVIDIA CUDA library %s\n", dlerror());
+      monitor_real_exit(-1);
+    }
 
-  if (cupti_bind()) {
-    EEMSG("hpcrun: unable to bind to NVIDIA CUPTI library %s\n", dlerror());
-    monitor_real_exit(-1);
-  }
-#endif
+    if (cupti_bind()) {
+      EEMSG("hpcrun: unable to bind to NVIDIA CUPTI library %s\n", dlerror());
+      monitor_real_exit(-1);
+    }
+  #endif
 
   // Register hpcrun callbacks
   device_finalizer_flush.fn = cupti_device_flush;
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 2f950d8246..45decffc0d 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -107,7 +107,7 @@
 
 #define OVERFLOW_MODE 0
 #define WEIGHT_METRIC 0
-#define DEFAULT_THRESHOLD  1 //2000000L
+#define DEFAULT_THRESHOLD  2000000L
 
 #include "papi-c.h"
 
@@ -145,6 +145,8 @@ static __thread gpu_monitor_fn_entry_t gpu_monitor_exit;
  *****************************************************************************/
 static void papi_monitor_enter(void *reg_info, void *args_in);
 static void papi_monitor_exit(void *reg_info, void *args_in);
+static void
+gpu_metrics_attribute_papi(int metric_id, cct_node_t *cct_node, long long value);
 
 static int
 get_event_index(sample_source_t *self, int event_code)
@@ -590,6 +592,7 @@ METHOD_FN(process_event_list, int lush_metrics)
 
     if (component_uses_sync_samples(cidx))
       TMSG(PAPI, "Event %s from synchronous component", buffer);
+
     int metric_id = /* weight */
       hpcrun_set_new_metric_info_and_period(papi_kind, strdup(buffer),
               MetricFlags_ValFmt_Int,
@@ -1040,21 +1043,21 @@ papi_monitor_enter(void *reg_info, void *args_in)
   sample_source_t *self = &obj_name(); /// just for debug
   int ret;
 
-  printf("|------->PAPI_MONITOR_ENTER | running? %d\n", METHOD_CALL(self, started));
+  printf("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
   cct_node = args->cct_node;
 
-  if (args->gpu_type == amd)
-    hipDeviceSynchronize();
+  if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
+    args->gpu_sync_ptr();
 
   // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
   for (int cid = 0; cid < psi->num_components; ++cid) {
     papi_component_info_t *ci = &(psi->component_info[cid]);
     if (ci->inUse) {
-      printf("Self = %p | Component %d \t | cct = %p | gpu = %s\n\n", self, cid, args->cct_node, gpu_monitors_get_gpu_name(args->gpu_type) );
+      printf("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
 
       ret = PAPI_read(ci->eventSet, prev_values);
       //      ret = PAPI_start(ci->eventSet);
@@ -1071,6 +1074,7 @@ papi_monitor_enter(void *reg_info, void *args_in)
   tool_exit();
 }
 
+
 static void
 papi_monitor_exit(void *reg_info, void *args_in)
 {
@@ -1086,8 +1090,8 @@ papi_monitor_exit(void *reg_info, void *args_in)
 
   printf("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
 
-  if (args->gpu_type == amd)
-    hipDeviceSynchronize();
+  if (args->gpu_sync_ptr)
+    args->gpu_sync_ptr();
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
@@ -1118,6 +1122,9 @@ papi_monitor_exit(void *reg_info, void *args_in)
                eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
 
         blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
+
+
+        gpu_metrics_attribute_papi(metric_id, cct_node, my_event_values[eid]);
       }
 
     }
@@ -1126,4 +1133,23 @@ papi_monitor_exit(void *reg_info, void *args_in)
 
 finish:
   tool_exit();
+}
+
+
+static void
+gpu_metrics_attribute_papi  // adds 'value' to metric 'metric_id' on 'cct_node'
+(
+ int metric_id,
+ cct_node_t *cct_node,
+ long long value
+)
+{
+  metric_data_list_t* metrics = hpcrun_reify_metric_set(cct_node, metric_id);  // presumably creates the set if missing - confirm
+  
+  hpcrun_metric_std_inc(metric_id,
+                        metrics,
+                        (cct_metric_data_t) {.i = value});  // value is passed through the integer member (.i)
+
+
+//  gpu_context_trace(context_id, &entry_trace);  // disabled: neither argument is defined in this function
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample_event.c b/src/tool/hpcrun/sample_event.c
index 3206eeb3ac..98c9ed70f3 100644
--- a/src/tool/hpcrun/sample_event.c
+++ b/src/tool/hpcrun/sample_event.c
@@ -232,7 +232,7 @@ hpcrun_sample_callpath(void* context, int metricId,
       }
     }
   }
-  else {
+  else {  // Partial unwind case
     cct_bundle_t* cct = &(td->core_profile_trace_data.epoch->csdata);
     node = record_partial_unwind(cct, td->btbuf_beg, td->btbuf_cur - 1,
         metricId, metricIncr, skipInner, NULL);

From ec6b1b455755382ccb6f789ca986a82df16164c7 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Fri, 28 Aug 2020 16:10:43 +0000
Subject: [PATCH 016/177] support for extracting lineinfo from Intel GPU
 binaries inside hpcstruct

---
 src/lib/banal/Makefile.am                  |   1 +
 src/lib/banal/Makefile.in                  |  29 ++-
 src/lib/banal/Struct.cpp                   |  41 +++-
 src/lib/banal/intel/.IntelGPUbanal.cpp.swp | Bin 0 -> 16384 bytes
 src/lib/banal/intel/IntelGPUbanal.cpp      | 122 ++++++++++
 src/lib/banal/intel/IntelGPUbanal.hpp      |  89 +++++++
 src/lib/binutils/InputFile.cpp             |   9 +-
 src/lib/binutils/IntelGPUbinutils.cpp      | 263 +++++++++++++++++++++
 src/lib/binutils/IntelGPUbinutils.hpp      |  76 ++++++
 src/lib/binutils/Makefile.am               |   1 +
 src/lib/binutils/Makefile.in               |  14 +-
 11 files changed, 632 insertions(+), 13 deletions(-)
 create mode 100644 src/lib/banal/intel/.IntelGPUbanal.cpp.swp
 create mode 100644 src/lib/banal/intel/IntelGPUbanal.cpp
 create mode 100644 src/lib/banal/intel/IntelGPUbanal.hpp
 create mode 100644 src/lib/binutils/IntelGPUbinutils.cpp
 create mode 100644 src/lib/binutils/IntelGPUbinutils.hpp

diff --git a/src/lib/banal/Makefile.am b/src/lib/banal/Makefile.am
index ea5d7d2c15..fe45f78749 100644
--- a/src/lib/banal/Makefile.am
+++ b/src/lib/banal/Makefile.am
@@ -70,6 +70,7 @@ MYSOURCES = \
 	cuda/CudaBlock.cpp  \
 	cuda/CudaCodeSource.cpp  \
 	cuda/ReadCubinCFG.cpp \
+	intel/IntelGPUbanal.cpp \
 	Struct.cpp  \
 	Struct-Inline.cpp  \
 	Struct-Output.cpp
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index babee912a7..70bb0d77cb 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -142,7 +142,8 @@ am__objects_1 = cuda/libHPCbanal_la-CFGParser.lo \
 	cuda/libHPCbanal_la-GraphReader.lo \
 	cuda/libHPCbanal_la-CudaBlock.lo \
 	cuda/libHPCbanal_la-CudaCodeSource.lo \
-	cuda/libHPCbanal_la-ReadCubinCFG.lo libHPCbanal_la-Struct.lo \
+	cuda/libHPCbanal_la-ReadCubinCFG.lo \
+	intel/libHPCbanal_la-IntelGPUbanal.lo libHPCbanal_la-Struct.lo \
 	libHPCbanal_la-Struct-Inline.lo \
 	libHPCbanal_la-Struct-Output.lo
 am_libHPCbanal_la_OBJECTS = $(am__objects_1)
@@ -521,6 +522,7 @@ MYSOURCES = \
 	cuda/CudaBlock.cpp  \
 	cuda/CudaCodeSource.cpp  \
 	cuda/ReadCubinCFG.cpp \
+	intel/IntelGPUbanal.cpp \
 	Struct.cpp  \
 	Struct-Inline.cpp  \
 	Struct-Output.cpp
@@ -620,6 +622,14 @@ cuda/libHPCbanal_la-CudaCodeSource.lo: cuda/$(am__dirstamp) \
 	cuda/$(DEPDIR)/$(am__dirstamp)
 cuda/libHPCbanal_la-ReadCubinCFG.lo: cuda/$(am__dirstamp) \
 	cuda/$(DEPDIR)/$(am__dirstamp)
+intel/$(am__dirstamp):
+	@$(MKDIR_P) intel
+	@: > intel/$(am__dirstamp)
+intel/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) intel/$(DEPDIR)
+	@: > intel/$(DEPDIR)/$(am__dirstamp)
+intel/libHPCbanal_la-IntelGPUbanal.lo: intel/$(am__dirstamp) \
+	intel/$(DEPDIR)/$(am__dirstamp)
 
 libHPCbanal.la: $(libHPCbanal_la_OBJECTS) $(libHPCbanal_la_DEPENDENCIES) $(EXTRA_libHPCbanal_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libHPCbanal_la_LINK)  $(libHPCbanal_la_OBJECTS) $(libHPCbanal_la_LIBADD) $(LIBS)
@@ -631,6 +641,8 @@ mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
 	-rm -f cuda/*.$(OBJEXT)
 	-rm -f cuda/*.lo
+	-rm -f intel/*.$(OBJEXT)
+	-rm -f intel/*.lo
 
 distclean-compile:
 	-rm -f *.tab.c
@@ -646,6 +658,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CudaFunction.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-GraphReader.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-ReadCubinCFG.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Plo@am__quote@
 
 .cpp.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
@@ -720,6 +733,13 @@ cuda/libHPCbanal_la-ReadCubinCFG.lo: cuda/ReadCubinCFG.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-ReadCubinCFG.lo `test -f 'cuda/ReadCubinCFG.cpp' || echo '$(srcdir)/'`cuda/ReadCubinCFG.cpp
 
+intel/libHPCbanal_la-IntelGPUbanal.lo: intel/IntelGPUbanal.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelGPUbanal.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Tpo -c -o intel/libHPCbanal_la-IntelGPUbanal.lo `test -f 'intel/IntelGPUbanal.cpp' || echo '$(srcdir)/'`intel/IntelGPUbanal.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelGPUbanal.cpp' object='intel/libHPCbanal_la-IntelGPUbanal.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelGPUbanal.lo `test -f 'intel/IntelGPUbanal.cpp' || echo '$(srcdir)/'`intel/IntelGPUbanal.cpp
+
 libHPCbanal_la-Struct.lo: Struct.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbanal_la-Struct.lo -MD -MP -MF $(DEPDIR)/libHPCbanal_la-Struct.Tpo -c -o libHPCbanal_la-Struct.lo `test -f 'Struct.cpp' || echo '$(srcdir)/'`Struct.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbanal_la-Struct.Tpo $(DEPDIR)/libHPCbanal_la-Struct.Plo
@@ -754,6 +774,7 @@ mostlyclean-libtool:
 clean-libtool:
 	-rm -rf .libs _libs
 	-rm -rf cuda/.libs cuda/_libs
+	-rm -rf intel/.libs intel/_libs
 
 ID: $(am__tagged_files)
 	$(am__define_uniq_tagged_files); mkid -fID $$unique
@@ -870,6 +891,8 @@ distclean-generic:
 	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
 	-rm -f cuda/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cuda/$(am__dirstamp)
+	-rm -f intel/$(DEPDIR)/$(am__dirstamp)
+	-rm -f intel/$(am__dirstamp)
 
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
@@ -880,7 +903,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
 	mostlyclean-am
 
 distclean: distclean-am
-	-rm -rf ./$(DEPDIR) cuda/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) cuda/$(DEPDIR) intel/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
@@ -926,7 +949,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-am
-	-rm -rf ./$(DEPDIR) cuda/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) cuda/$(DEPDIR) intel/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index ecf0084f24..c9dbf509cf 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -117,6 +117,8 @@
 
 #include "cuda/ReadCubinCFG.hpp"
 
+#include "intel/IntelGPUbanal.hpp"
+
 #ifdef ENABLE_OPENMP
 #include <omp.h>
 #endif
@@ -141,7 +143,7 @@ using namespace std;
 #endif
 
 #define DEBUG_CFG_SOURCE  0
-#define DEBUG_MAKE_SKEL   0
+#define DEBUG_MAKE_SKEL   0
 #define DEBUG_SHOW_GAPS   0
 #define DEBUG_SKEL_SUMMARY  0
 
@@ -550,6 +552,21 @@ printTime(const char *label, struct timeval *tv_prev, struct rusage *ru_prev,
   cout << endl;
 }
 
+static string
+getFileNameFromAbsolutePath(string str)
+{
+	// Returns the last '/'-separated token of str ("c" for "/a/b/c"); an
+	// empty str yields "" instead of indexing an empty vector (was UB).
+	stringstream str_stream(str);
+	string intermediate;
+	string last;
+	while (getline(str_stream, intermediate, '/')) {
+		last = intermediate;
+	}
+	return last;
+}
+
+
 //
 // makeStructure -- the main entry point for hpcstruct realmain().
 //
@@ -581,7 +598,7 @@ makeStructure(string filename,
 
   // failure throws an error up the call chain
   inputFile.openFile(filename, InputFileError_Error);
-
+	
   ElfFileVector * elfFileVector = inputFile.fileVector();
   string & sfilename = inputFile.fileName();
   const char * cfilename = inputFile.CfileName();
@@ -591,9 +608,10 @@ makeStructure(string filename,
   }
 
   Output::printStructFileBegin(outFile, gapsFile, sfilename);
-
+	
   for (uint i = 0; i < elfFileVector->size(); i++) {
     bool parsable = true;
+    parsable = false; // aaron
     ElfFile *elfFile = (*elfFileVector)[i];
 
     if (opts.show_time) {
@@ -614,8 +632,8 @@ makeStructure(string filename,
 
     Symtab * symtab = Inline::openSymtab(elfFile);
     if (symtab == NULL) {
-      continue;
-    }
+			continue;
+		}
     the_symtab = symtab;
     bool cuda_file = SYMTAB_ARCH_CUDA(symtab);
 
@@ -643,8 +661,17 @@ makeStructure(string filename,
     omp_set_num_threads(opts.jobs_parse);
 #endif
 
-    // don't run parseapi on cuda binary
-    if (! cuda_file) {
+		bool isIntelArch = true;
+		bool cfgNotPresent = true;
+		if (isIntelArch && cfgNotPresent) {
+			//std::cerr << "executing intel-gen9 specific code." << std::endl;
+			add_custom_function_object(symtab, getFileNameFromAbsolutePath(elfFile->getFileName())); //adds a dummy function object
+			code_src = new SymtabCodeSource(symtab);
+		  code_obj = new CodeObject(code_src, NULL, NULL, false, true); //last param is bool ignoreParse
+      //code_obj->parse();
+			parsable = false;
+		}
+    else if (! cuda_file) { // don't run parseapi on cuda binary
       code_src = new SymtabCodeSource(symtab);
       code_obj = new CodeObject(code_src);
       code_obj->parse();
diff --git a/src/lib/banal/intel/.IntelGPUbanal.cpp.swp b/src/lib/banal/intel/.IntelGPUbanal.cpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..86f67f99a30201fd0d098868ba71f9612dcf246b
GIT binary patch
literal 16384
zcmeHNTW=gm6|Q7An;qB-+VFq`5`~l9%$V_v<Ap`o>j=kolF8am<k(q|kVZA#H8YiT
zcQswro(wB465=8)67T{7tq?*yAp|VsFCZ=tNFWemk$^wI+XB(Pz;~+Ko*7>dT4IDy
zjr6(Qr>jn#^PO|5yQ{i0d#kn1KhSs|!|!d3y*ONLf9I>eJn`ZpV?ty)v4KuSG)I?@
z`rVF<jC4;W|FRWhpBz`%G`edCB9qO28d$BPJIc=Vf?(!@jb|IPO_K%9NOhVWk%*|N
z5-X!-i~e4@vF&7yAWe@l$jdzgo`FLcc$_^ucVU(;oSk|PKl9Y~A#J?KGvFEU40r}S
z1D*lTfM>un;2HRTWx%E<*{9&#$<o1Yc|5W2_)vNN_VRdp-}Um;A3Ot|0ndPEz%$?(
z@C<kcJOiEq&wyvZGvFEcA7ns;jC}!iJx>b&*#F;s|NqgGjQs%kHt-GLYd{N_0^SY$
z?g_@e0xSa`0-gfic$~4n1Fr%-pbdN!cn9#?#~AxH@G@`-xc^qh{tEmM=mJ&XY2Z(9
z!Fa$H@FeiF3S*xJ7J=6vW$b%E2e<$<fIpu`AK;I`mw_(<7lHQzuRX%p?}48HKL%a_
zJ_i_}4ZHwc1D*rk54;OF0Wjbnrx^P_@Cxt(a1D4GcoaAdJOZ2o?mf)dcY%)p9|o=g
zE#MUJFz^s?5_lC0C|?Jz0#|?~;2Gc?a0a*!{{IHN4*UZ6IdBj73~(D*1Sm#N0B>M%
z<}n<9v%5&MFRv<jVpK1YA&+#@<C*NSgQE7oD>Lfjk}r~$nTtJX*JXSYl4@;%9j4*v
z))irl%xu+IVROS57rD)H2{n}tzW(65Vs3`96`h%jU6q8JvZr)X#n^Q|)v)reouUqf
zm`_6v%Qm*x*2tJZC&ofgI6rS}rjp(>JP|S3-p!Mstvk2dbiTk|Vl`G7Yd3kFRc<;9
zrb|oetja_fwu9VQ9k)rBg;AtW2A1t;6>`R!&FKgFgF;SUy<}YxNUl?v2^!Tr_<@i-
zXG00LC2wtPt**84g-v^9_0sm0_RZDht=9F8_QulsDxY$n-&8ki{CX;r<u$${cNA<}
zk)6CZHBz#2=?cGaJcPs|$n-H$ppqcULs`1H_mxpUO{dt|-HXrDCrKtxQ=Qpns6;Q(
z#;U+H`WKJK<46Z0LLxZZj>Yrbpi*3+n3PhjNdBc8tp&FK(#kMVC^6L9h>)VW^M51+
zOET(`a3sqUg_4_;b%Vr4lOi)TO_~Z0O6@R}j_8rjqRhufQHVz-MSM)*fOS^J#~F_Y
zI**{VF^#;*d5N!L{zZW=>vWi@Uf=R*Xmj&Ymy2*m8QfrWm)pMNsWwJ+RHW>XL?#dV
zJQT6$$p&@M8P{l*4bZ2?MW$TNsf3GU$Oi&_a=WW4H<<27t1^7|6Tv!{J2D|ngT5q1
zhycfQZ!;0tI;Q+g1(ruj6eiSZK#LOP)uM7-hRa^0dNK(l>FH2zOJj;THp^3*Tx-0q
z2N)^ifq)qy%!7x8l^9iL&biSrq{!qQ%m6mGlJ~R@4GoRk9L<bo0ry`F0dJtK4nnMO
zrlpzgDvQfHBNcStk{`=uVijt=pv{b#UWc`oCz?Erm4#ha*Ey5~c_w3Y!w95@42vZ1
z!UEzoR2lA1b$B@XFHj-ciwK;hGQe<P_O6W6NMu7Yjnt%hhHy%RNkWGSBB5d#Hh3$6
z4K5J4$@WDQ@y*uqD*5Xop<txW4DUF86?7_CiVr+90mr1HA(mOt$7vZS$1j+0fpBK?
zRAiV>bJ#<s$h&DqLBbdZ6_RH(Eh4ywGlY<llCAMf;>ap6w*wwR-We7yP?#rtm=Z_~
zGv(|9mC!<p_+(P@fwFx{Tt{VB1yHPD=^#_q%7o*V9V?WDPFxM}$suB7eMeGaP&m0v
zLXd~d&&a%3Bb{&z^S~M&%1rH`Sr-?`j%mIjF*MQ?e~L#Oq!^Vd-%ijnGgO9EicSq7
z8#Qw=K;rHB6C#Buf!w17dD}#5SE)P}c=kYojL}8@x*?Gjs5r!X<QuXR<JSv@B3!4e
z2dANOPc4o~vbL+ENDrtU_fpFha~Z2?zA4AGQ+@J|N|2tE*D;Ur5Qaz5f*a|q5golF
zF;+QC?K$_KnB*dM&=~k+be@6eK;u!NG{y=TNzNT^Ni6L=g)P>hLY6qL>#n+7h9FnM
zQA*mruuj*AsBsoEm(i0PIc{mzgcYTAl+a_WnYznWO0F_zOOlpQ#BORg;FM-9)H#_t
zn)0jFpjP8^v$JzEbF*`^_5Ep9Dw=s<KN|k(jpeQ4gK#QW5ORL>iFiY2z4OjxxO;wc
zb!laNwXqc3N%SC+VUM!eoOe#N63Un|wL7j*UF=*Hl+SQ(vLx39CMKsf#QY!WbTqA!
zK{qffQ9_RyR-H7dM$H}=2{R?F=UU-8Qhe^J6yf&f8lBMjG{1xwqh#OR`OVzt@n*x=
z7x8|6AE5XD-TnPP;2r;0z_);xfXl#hz&nA{z+dpb{~GX9;2!Wz;AP<Rz-NI?;0!=(
z0;HSP1^mx5;2H1?cm_NJo&nE*XTUSy8So5v2LAgDe1aXlA9r9Y4%=T?=kMsKT5f_F
zcUD{A&1SW@!lx5CfEYV#%_iE@5)ZA36dh<Q4c)Osfwe9Y;~cAiyxc>punKMKRVrO9
zI7ujVJA#2B(aeQ91nB6Nd6Q7yos?pEvk<wy)Yf$td+PQ?A&IbM*rq*7lJ0Gjx`A+o
zgqv>55qpc&nzj~6xQ?YbEUivs4Tp|&32PzE=JjsZNL#0K&;XCHZ!$lRf8i{Sx3G_P
zd1-rXi`o{_TUh%pR;kNQMa#AJ+V$lnH&)SZO(l1*e14SXt@h>Z4YxT*s%g^(dbY4b
ZLLF$AvkoH*DW9GQ8n#0bsj^3v{S!bZI!XWl

literal 0
HcmV?d00001

diff --git a/src/lib/banal/intel/IntelGPUbanal.cpp b/src/lib/banal/intel/IntelGPUbanal.cpp
new file mode 100644
index 0000000000..152ba3ce9d
--- /dev/null
+++ b/src/lib/banal/intel/IntelGPUbanal.cpp
@@ -0,0 +1,122 @@
+// * BeginRiceCopyright *****************************************************
+//
+// $HeadURL$
+// $Id$
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//***************************************************************************
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <iostream>
+#include <string>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <libelf.h>
+
+#include <Symtab.h>
+
+using namespace Dyninst;
+using namespace SymtabAPI;
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/support/diagnostics.h>
+#include "IntelGPUbanal.hpp"
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define DBG 1
+
+#define INTEL_GPU_DEBUG_SECTION_NAME "Intel(R) OpenCL Device Debug"
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+add_custom_function_object
+(
+	Symtab* symtab,
+	std::string func_obj_name
+)
+{
+	// Adds to symtab a local function symbol named func_obj_name, with
+	// offset 0 and size equal to the memory size of the ".text" region.
+	Region *reg = NULL;
+	bool status = symtab->findRegion(reg, ".text");
+	assert(status == true);
+	if (!status || reg == NULL) return; // assert() is compiled out under NDEBUG
+	unsigned long reg_size = reg->getMemSize();
+
+	Symbol *custom_symbol = new Symbol(
+			func_obj_name,
+			SymtabAPI::Symbol::ST_FUNCTION, // SymbolType
+			Symbol::SL_LOCAL, //SymbolLinkage
+			SymtabAPI::Symbol::SV_DEFAULT, //SymbolVisibility
+			0, //Offset,
+			NULL, //Module *module
+			reg, //Region *r
+			reg_size, //unsigned s
+			false, //bool d
+			false, //bool a
+			-1, //int index
+			-1, //int strindex
+			false //bool cs
+	);
+	//adding the custom symbol into the symtab object
+	status = symtab->addSymbol(custom_symbol); //(Symbol *newsym)
+	assert(status == true);
+}
diff --git a/src/lib/banal/intel/IntelGPUbanal.hpp b/src/lib/banal/intel/IntelGPUbanal.hpp
new file mode 100644
index 0000000000..f2d160aaa2
--- /dev/null
+++ b/src/lib/banal/intel/IntelGPUbanal.hpp
@@ -0,0 +1,89 @@
+// * BeginRiceCopyright *****************************************************
+//
+// $HeadURL$
+// $Id$
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <iostream>
+#include <string>
+#include <Symtab.h>
+
+using namespace Dyninst;
+using namespace SymtabAPI;
+
+
+
+//******************************************************************************
+// type definitions
+//*****************************************************************************
+
+enum SHT_OPENCL : uint32_t {                         // OpenCL-specific ELF section types
+    SHT_OPENCL_SOURCE = 0xff000000,                  // CL source to link into LLVM binary
+    SHT_OPENCL_HEADER = 0xff000001,                  // CL header to link into LLVM binary
+    SHT_OPENCL_LLVM_TEXT = 0xff000002,               // LLVM text
+    SHT_OPENCL_LLVM_BINARY = 0xff000003,             // LLVM byte code
+    SHT_OPENCL_LLVM_ARCHIVE = 0xff000004,            // LLVM archive(s)
+    SHT_OPENCL_DEV_BINARY = 0xff000005,              // Device binary (coherent by default)
+    SHT_OPENCL_OPTIONS = 0xff000006,                 // CL options
+    SHT_OPENCL_PCH = 0xff000007,                     // PCH (pre-compiled headers)
+    SHT_OPENCL_DEV_DEBUG = 0xff000008,               // Device debug
+    SHT_OPENCL_SPIRV = 0xff000009,                   // SPIRV
+    SHT_OPENCL_NON_COHERENT_DEV_BINARY = 0xff00000a, // Non-coherent device binary
+    SHT_OPENCL_SPIRV_SC_IDS = 0xff00000b,            // Specialization constant IDs
+    SHT_OPENCL_SPIRV_SC_VALUES = 0xff00000c          // Specialization constant values
+};
+
+
+
+//******************************************************************************
+// interface functions
+//******************************************************************************
+// Adds a local function symbol named func_obj_name, sized to the ".text" region, to symtab.
+void 
+add_custom_function_object
+(
+	Symtab* symtab,
+	std::string func_obj_name
+);
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index dd156e590d..a645fe6ce3 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -75,6 +75,7 @@
 
 #include "ElfHelper.hpp"
 #include "Fatbin.hpp"
+#include "IntelGPUbinutils.hpp"
 #include "InputFile.hpp"
 
 
@@ -182,10 +183,16 @@ InputFile::openFile
 
   ElfFile *elfFile = new ElfFile;
   bool result = elfFile->open(file_buffer, f_size, filename);
+	bool isIntelGPUFile = true;
 
   if (result) {
     filevector = new ElfFileVector;
-    filevector->push_back(elfFile);
+		if (isIntelGPUFile) {
+			findIntelGPUbins(elfFile, filevector);
+		}
+		else {
+			filevector->push_back(elfFile);
+		}
     //findCubins(elfFile, filevector);
   } else {
     DIAG_MsgIf_GENERIC(tag, 1, "Not an ELF binary " << filename);
diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/IntelGPUbinutils.cpp
new file mode 100644
index 0000000000..06791cb046
--- /dev/null
+++ b/src/lib/binutils/IntelGPUbinutils.cpp
@@ -0,0 +1,263 @@
+// * BeginRiceCopyright *****************************************************
+//
+// $HeadURL$
+// $Id$
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//***************************************************************************
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <iostream>
+#include <string>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <libelf.h>
+
+#include "igc_binary_decoder.h"
+#include "gen_symbols_decoder.h"
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/binutils/ElfHelper.hpp>
+#include <lib/support/diagnostics.h>
+#include <lib/support/RealPathMgr.cpp>
+#include "IntelGPUbinutils.hpp"
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define DBG 1
+
+#define INTEL_GPU_DEBUG_SECTION_NAME "Intel(R) OpenCL Device Debug"
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static size_t
+file_size(int fd)
+{
+  // Size of the regular file open on fd; 0 if fstat() fails or fd is not one.
+  struct stat info;
+  if (fstat(fd, &info) != 0 || !S_ISREG(info.st_mode)) {
+    return 0;
+  }
+  return info.st_size;
+}
+
+
+// Automatically restart short reads.
+// This protects against EINTR.
+//
+static size_t
+read_all(int fd, void *buf, size_t count)
+{
+  // Keeps calling read() until `count` bytes are stored in buf, stopping
+  // early at end of file or on any error other than EINTR. Returns the
+  // number of bytes actually read.
+  char *dst = (char *) buf;
+  size_t done = 0;
+
+  while (done < count) {
+    ssize_t got = read(fd, dst + done, count - done);
+    if (got > 0) {
+      done += got;
+    } else if (got == 0 || errno != EINTR) {
+      break;
+    }
+  }
+  return done;
+}
+
+
+static const char*
+openclElfSectionType
+(
+	Elf64_Word sh_type
+)
+{
+	// Returns the enumerator name of an OpenCL-specific ELF section type
+	// (one of the SHT_OPENCL_* values), or "unknown type" for any other
+	// value. The result is always a string literal.
+	static const struct {
+		Elf64_Word type;
+		const char *name;
+	} known_types[] = {
+		{ SHT_OPENCL_SOURCE, "SHT_OPENCL_SOURCE" },
+		{ SHT_OPENCL_HEADER, "SHT_OPENCL_HEADER" },
+		{ SHT_OPENCL_LLVM_TEXT, "SHT_OPENCL_LLVM_TEXT" },
+		{ SHT_OPENCL_LLVM_BINARY, "SHT_OPENCL_LLVM_BINARY" },
+		{ SHT_OPENCL_LLVM_ARCHIVE, "SHT_OPENCL_LLVM_ARCHIVE" },
+		{ SHT_OPENCL_DEV_BINARY, "SHT_OPENCL_DEV_BINARY" },
+		{ SHT_OPENCL_OPTIONS, "SHT_OPENCL_OPTIONS" },
+		{ SHT_OPENCL_PCH, "SHT_OPENCL_PCH" },
+		{ SHT_OPENCL_DEV_DEBUG, "SHT_OPENCL_DEV_DEBUG" },
+		{ SHT_OPENCL_SPIRV, "SHT_OPENCL_SPIRV" },
+		{ SHT_OPENCL_NON_COHERENT_DEV_BINARY, "SHT_OPENCL_NON_COHERENT_DEV_BINARY" },
+		{ SHT_OPENCL_SPIRV_SC_IDS, "SHT_OPENCL_SPIRV_SC_IDS" },
+		{ SHT_OPENCL_SPIRV_SC_VALUES, "SHT_OPENCL_SPIRV_SC_VALUES" },
+	};
+	const size_t count = sizeof(known_types) / sizeof(known_types[0]);
+
+	for (size_t k = 0; k < count; k++) {
+		if (known_types[k].type == sh_type) {
+			return known_types[k].name;
+		}
+	}
+
+	return "unknown type";
+}
+
+
+static bool
+extract_kernelelfs
+(
+	std::vector<uint8_t> symbols,
+	ElfFileVector *filevector
+)
+{
+	// Walks the IGC program debug data in `symbols`: a program header, then
+	// per kernel a kernel header, a 4-byte-aligned name, and the VISA and
+	// Gen ISA debug blobs. Each non-empty VISA blob is written to a file
+	// named after its kernel, read back, and appended to filevector as an
+	// ElfFile. Returns true only if there is at least one kernel and every
+	// kernel was extracted; truncated input ends the walk with false.
+	const uint8_t* ptr = symbols.data();
+	const uint8_t* end = ptr + symbols.size();
+	if (symbols.size() < sizeof(SProgramDebugDataHeaderIGC)) return false;
+	const auto* header = reinterpret_cast<const SProgramDebugDataHeaderIGC*>(ptr);
+	ptr += sizeof(SProgramDebugDataHeaderIGC);
+	bool extractSuccess = (header->NumberOfKernels != 0);
+	for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
+		if ((size_t) (end - ptr) < sizeof(SKernelDebugDataHeaderIGC)) return false;
+		const auto* kernel_header = reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
+		ptr += sizeof(SKernelDebugDataHeaderIGC);
+		// name size rounded up to a multiple of 4; 0 stays 0 instead of wrapping
+		size_t name_size = 4 * (((size_t) kernel_header->KernelNameSize + 3) / 4);
+		size_t visa_size = kernel_header->SizeVisaDbgInBytes;
+		size_t isa_size = kernel_header->SizeGenIsaDbgInBytes;
+		size_t left = (size_t) (end - ptr);
+		if (name_size > left || visa_size > left - name_size ||
+		    isa_size > left - name_size - visa_size) return false;
+		const char* kernel_name = reinterpret_cast<const char*>(ptr);
+		const uint8_t* blob = ptr + name_size;
+		// skip both blobs so the next kernel header is read at the right place
+		ptr = blob + visa_size + isa_size;
+		// use the name as a path only if it is NUL-terminated inside its field
+		bool named = memchr(kernel_name, '\0', name_size) != NULL;
+		FILE *f_ptr = (named && visa_size > 0) ? fopen(kernel_name, "wb") : NULL;
+		bool written = f_ptr != NULL && fwrite(blob, visa_size, 1, f_ptr) == 1;
+		if (f_ptr != NULL && fclose(f_ptr) != 0) written = false;
+		int file_fd = written ? open(kernel_name, O_RDONLY) : -1;
+		size_t f_size = (file_fd >= 0) ? file_size(file_fd) : 0;
+		char *file_buffer = (f_size > 0) ? (char *) malloc(f_size) : NULL;
+		bool loaded = file_buffer != NULL && read_all(file_fd, file_buffer, f_size) == f_size;
+		if (file_fd >= 0) close(file_fd);
+		if (!loaded) {
+			free(file_buffer);
+			extractSuccess = false;
+			continue;
+		}
+		ElfFile *elfFile = new ElfFile;
+		if (!elfFile->open(file_buffer, f_size, kernel_name)) extractSuccess = false;
+		filevector->push_back(elfFile);
+	}
+	return extractSuccess;
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+bool
+findIntelGPUbins
+(
+	ElfFile *elfFile,
+	ElfFileVector *filevector
+)
+{
+  // Looks in elfFile for the Intel OpenCL device-debug section (type
+  // SHT_OPENCL_DEV_DEBUG, named INTEL_GPU_DEBUG_SECTION_NAME) and passes its
+  // bytes to extract_kernelelfs, which appends ElfFiles to filevector.
+  // Returns true only if the section was found and extraction succeeded.
+  bool fileHasDebugSection = false;
+  bool extractSuccess = false;
+  Elf *elf = elfFile->getElf();
+  char *file_buffer = elfFile->getMemory();
+  ElfSectionVector *sections = elfGetSectionVector(elf);
+  GElf_Ehdr ehdr_v;
+  GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
+
+  if (ehdr && sections) {
+    for (auto si = sections->begin(); si != sections->end(); si++) {
+      GElf_Shdr shdr_v;
+      GElf_Shdr *shdr = gelf_getshdr(*si, &shdr_v);
+      if (!shdr || shdr->sh_type != SHT_OPENCL_DEV_DEBUG) continue;
+      // elf_strptr returns NULL on failure; never hand that to strcmp
+      const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+      if (!section_name || strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) != 0) continue;
+      fileHasDebugSection = true;
+      uint8_t *data = reinterpret_cast<uint8_t*>(elfSectionGetData(file_buffer, shdr));
+      if (data != NULL) {
+        std::vector<uint8_t> debug_info(data, data + shdr->sh_size);
+        extractSuccess = extract_kernelelfs(debug_info, filevector);
+      }
+      break;
+    }
+  }
+  return fileHasDebugSection && extractSuccess;
+}
diff --git a/src/lib/binutils/IntelGPUbinutils.hpp b/src/lib/binutils/IntelGPUbinutils.hpp
new file mode 100644
index 0000000000..56d2d04d52
--- /dev/null
+++ b/src/lib/binutils/IntelGPUbinutils.hpp
@@ -0,0 +1,76 @@
+// * BeginRiceCopyright *****************************************************
+//
+// $HeadURL$
+// $Id$
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// type definitions
+//*****************************************************************************
+
+enum SHT_OPENCL : uint32_t {                         // OpenCL-specific ELF section types (compared with sh_type)
+    SHT_OPENCL_SOURCE = 0xff000000,                  // CL source to link into LLVM binary
+    SHT_OPENCL_HEADER = 0xff000001,                  // CL header to link into LLVM binary
+    SHT_OPENCL_LLVM_TEXT = 0xff000002,               // LLVM text
+    SHT_OPENCL_LLVM_BINARY = 0xff000003,             // LLVM byte code
+    SHT_OPENCL_LLVM_ARCHIVE = 0xff000004,            // LLVM archive(s)
+    SHT_OPENCL_DEV_BINARY = 0xff000005,              // Device binary (coherent by default)
+    SHT_OPENCL_OPTIONS = 0xff000006,                 // CL options
+    SHT_OPENCL_PCH = 0xff000007,                     // PCH (pre-compiled headers)
+    SHT_OPENCL_DEV_DEBUG = 0xff000008,               // Device debug
+    SHT_OPENCL_SPIRV = 0xff000009,                   // SPIRV
+    SHT_OPENCL_NON_COHERENT_DEV_BINARY = 0xff00000a, // Non-coherent device binary
+    SHT_OPENCL_SPIRV_SC_IDS = 0xff00000b,            // Specialization constant IDs
+    SHT_OPENCL_SPIRV_SC_VALUES = 0xff00000c          // Specialization constant values
+};
+
+
+
+//******************************************************************************
+// interface functions
+//******************************************************************************
+// Appends the kernel ElfFiles extracted from elfFile's Intel GPU debug section to filevector; true on success.
+bool
+findIntelGPUbins
+(
+	ElfFile *elfFile,
+	ElfFileVector *filevector
+);
diff --git a/src/lib/binutils/Makefile.am b/src/lib/binutils/Makefile.am
index 94c9aaa461..8a35ff4e79 100644
--- a/src/lib/binutils/Makefile.am
+++ b/src/lib/binutils/Makefile.am
@@ -95,6 +95,7 @@ MYSOURCES = \
 	BinUtils.hpp BinUtils.cpp \
 	VMAInterval.hpp VMAInterval.cpp \
 	Fatbin.cpp \
+	IntelGPUbinutils.cpp \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp 
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index 852f94f55e..8ba332da50 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -145,8 +145,9 @@ am__objects_1 = libHPCbinutils_la-LM.lo libHPCbinutils_la-Seg.lo \
 	libHPCbinutils_la-SimpleSymbolsFactories.lo \
 	libHPCbinutils_la-Dbg-LM.lo libHPCbinutils_la-Dbg-Proc.lo \
 	libHPCbinutils_la-BinUtils.lo libHPCbinutils_la-VMAInterval.lo \
-	libHPCbinutils_la-Fatbin.lo libHPCbinutils_la-ElfHelper.lo \
-	libHPCbinutils_la-InputFile.lo \
+	libHPCbinutils_la-Fatbin.lo \
+	libHPCbinutils_la-IntelGPUbinutils.lo \
+	libHPCbinutils_la-ElfHelper.lo libHPCbinutils_la-InputFile.lo \
 	libHPCbinutils_la-RelocateCubin.lo
 am_libHPCbinutils_la_OBJECTS = $(am__objects_1)
 libHPCbinutils_la_OBJECTS = $(am_libHPCbinutils_la_OBJECTS)
@@ -546,6 +547,7 @@ MYSOURCES = \
 	BinUtils.hpp BinUtils.cpp \
 	VMAInterval.hpp VMAInterval.cpp \
 	Fatbin.cpp \
+	IntelGPUbinutils.cpp \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp 
@@ -644,6 +646,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Fatbin.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-InputFile.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Insn.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-LM.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-LinuxKernelSymbols.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Proc.Plo@am__quote@
@@ -758,6 +761,13 @@ libHPCbinutils_la-Fatbin.lo: Fatbin.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-Fatbin.lo `test -f 'Fatbin.cpp' || echo '$(srcdir)/'`Fatbin.cpp
 
+libHPCbinutils_la-IntelGPUbinutils.lo: IntelGPUbinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-IntelGPUbinutils.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'IntelGPUbinutils.cpp' || echo '$(srcdir)/'`IntelGPUbinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='IntelGPUbinutils.cpp' object='libHPCbinutils_la-IntelGPUbinutils.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'IntelGPUbinutils.cpp' || echo '$(srcdir)/'`IntelGPUbinutils.cpp
+
 libHPCbinutils_la-ElfHelper.lo: ElfHelper.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-ElfHelper.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-ElfHelper.Tpo -c -o libHPCbinutils_la-ElfHelper.lo `test -f 'ElfHelper.cpp' || echo '$(srcdir)/'`ElfHelper.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbinutils_la-ElfHelper.Tpo $(DEPDIR)/libHPCbinutils_la-ElfHelper.Plo

From 7eeca680e7f6161ec981ddcde457155e765d47da Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris11.ftm.alcf.anl.gov>
Date: Mon, 7 Sep 2020 02:13:45 +0000
Subject: [PATCH 017/177] updated code for creating loadmaps for opencl kernels

---
 src/lib/binutils/IntelGPUbinutils.cpp         |  20 ++-
 .../instrumentation/opencl-instrumentation.c  | 143 ++++++++++--------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 117 ++++++++++++++
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |  18 +++
 4 files changed, 230 insertions(+), 68 deletions(-)

diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/IntelGPUbinutils.cpp
index 06791cb046..fd328c954a 100644
--- a/src/lib/binutils/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/IntelGPUbinutils.cpp
@@ -218,6 +218,16 @@ extract_kernelelfs
 }
 
 
+static bool // true only when section_name is exactly ".SHT_OPENCL_DEV_DEBUG"
+isCustomOpenCLBinary
+(
+	const char *section_name // NUL-terminated section name; handed straight to strcmp, so it must not be NULL
+)
+{
+  return (strcmp(section_name, ".SHT_OPENCL_DEV_DEBUG") == 0); // exact, case-sensitive comparison
+}
+
+
 
 //******************************************************************************
 // interface operations
@@ -247,14 +257,20 @@ findIntelGPUbins
 			if (!shdr) continue;
 			char *sectionData = elfSectionGetData(file_buffer, shdr);
 			const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-			//std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
+			std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
 
 			// extract debug section
-			if (shdr->sh_type == SHT_OPENCL_DEV_DEBUG && strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) == 0) {
+			if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) == 0)
+					|| isCustomOpenCLBinary(section_name)) {
 				fileHasDebugSection = true;
 				std::vector<uint8_t> debug_info(reinterpret_cast<uint8_t*>(sectionData), reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
 				extractSuccess = extract_kernelelfs(debug_info, filevector);
 				break;
+			} else if (strcmp(section_name, ".text") == 0) {
+				FILE *bin_ptr;
+				bin_ptr = fopen("switch.text", "wb");
+				fwrite(sectionData, shdr->sh_size, 1, bin_ptr);
+				fclose(bin_ptr);
 			}
     }
   }
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 62a2344833..9c8fe6ea37 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -4,8 +4,11 @@
 //******************************************************************************
 
 #include <assert.h>
-#include <gtpin.h>
 #include <stdlib.h>
+#include <gen_symbols_decoder.h>
+#include <igc_binary_decoder.h>
+#include <utils.h>
+#include <gtpin.h>
 
 
 
@@ -93,6 +96,77 @@ createKernelNode
 }
 
 
+uint32_t
+findKernelAndInsertToLoadMap
+(
+	uint8_t *debuginfo,
+	char *input_kernel_name
+)
+{
+	const uint8_t* ptr = debuginfo;
+	const SProgramDebugDataHeaderIGC* header = (const SProgramDebugDataHeaderIGC*)(ptr);
+	ptr += sizeof(SProgramDebugDataHeaderIGC);
+
+	printf("header->NumberOfKernels: %d\n", header->NumberOfKernels);
+	for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
+		const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
+		ptr += sizeof(SKernelDebugDataHeaderIGC);
+
+		const char* kernel_name = (const char*)(ptr);
+		char *file_name = (char*) kernel_name;
+		std::cerr << file_name <<std::endl;
+
+		unsigned kernel_name_size_aligned = sizeof(uint32_t) *
+			(1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
+		ptr += kernel_name_size_aligned;
+
+		if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
+			file_name = strcat(file_name, ".gpbin");
+			FILE *fptr = fopen(file_name, "wb");
+			fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, fptr);
+
+			uint32_t hpctoolkit_module_id;
+			load_module_t *module = NULL;
+			hpcrun_loadmap_lock();
+			if ((module = hpcrun_loadmap_findByName(file_name)) == NULL) {
+				hpctoolkit_module_id = hpcrun_loadModule_add(file_name);
+			} else {
+				hpctoolkit_module_id = module->id;
+			}
+			hpcrun_loadmap_unlock();
+			printf("dumped debug file size: %zu\n", kernel_header->SizeVisaDbgInBytes);
+			fclose(fptr);
+			return hpctoolkit_module_id;
+		}
+		// Should be zero for newest drivers
+		assert(kernel_header->SizeGenIsaDbgInBytes == 0);
+
+		ptr += kernel_header->SizeVisaDbgInBytes;
+		ptr += kernel_header->SizeGenIsaDbgInBytes;
+	}
+	return -1;
+}
+
+
+static uint32_t
+add_opencl_binary_to_loadmap 
+(
+	char *kernel_name
+)
+{
+	// we need to remove this hardcoding
+	FILE *fptr = fopen("opencl_main.debug_info", "rb");
+	fseek(fptr, 0L, SEEK_END);
+	size_t debug_info_size = ftell(fptr);
+	printf("debug_info_size: %zu\n", debug_info_size);
+	rewind(fptr);
+	uint8_t *debug_info = (uint8_t*)malloc(debug_info_size);
+	fread(debug_info, debug_info_size, 1, fptr);
+	findKernelAndInsertToLoadMap(debug_info, kernel_name);
+}
+
+
+/*
 static uint32_t
 add_opencl_binary_to_loadmap
 (
@@ -111,44 +185,7 @@ add_opencl_binary_to_loadmap
 	hpcrun_loadmap_unlock();
 	return hpctoolkit_module_id;
 }
-
-
-static uint32_t
-save_opencl_binary
-(
-	GTPinKernel kernel,
-	char *bin_name
-)
-{
-	// dump the binary to files for using it at inside hpcprof 
-	uint32_t kernel_binary_size = 0;
-  GTPINTOOL_STATUS status = GTPin_GetKernelBinary(kernel, 0, NULL, &kernel_binary_size);
-  assert(status == GTPINTOOL_STATUS_SUCCESS);
-
-	uint8_t *binary = (uint8_t*) malloc(sizeof(uint8_t) * kernel_binary_size);
-
-	/*!
-	 * Copy original kernel's binary into specified buffer
-	 * @ingroup KERNEL
-	 * @param[in]        kernel         the target kernel.
-	 * @param[in]        buffer_size    size of the buffer in bytes.Ignored,
-	 *                                  if buffer is not provided('buf' is NULL)
-	 * @param[out, opt]  buf            buffer that receives the requested binary code. NULL pointer can be used to
-	 *                                  check actual size of the string without copying it into a client's buffer.
-	 * @param[out, opt]  binary_size    If specified(not NULL), receives the actual size of the requested binary in
-	 *                                  bytes, including terminating NULL.
-	 *
-	 * @par Availability:
-	 * - OnKernelComplete
-	 */
-  status = GTPin_GetKernelBinary(kernel, kernel_binary_size, (char *)(binary), NULL);
-  assert(status == GTPINTOOL_STATUS_SUCCESS);
-
-	strcat(bin_name, "_kernel.bin");
-	FILE *bin_ptr = fopen(bin_name, "wb");
-	fwrite(binary, kernel_binary_size, 1, bin_ptr);
-	return add_opencl_binary_to_loadmap(bin_name);
-}
+*/
 
 
 static void
@@ -222,38 +259,12 @@ onKernelBuild
     GTPinINS head = GTPin_InsHead(block);
     assert(GTPin_InsValid(head));
     
-		/*!
-		 * @return the offset of the instruction relative to the beginning of the original kernel's binary
-		 * -1 is returned in case of an error
-		 * @ingroup INS
-		 * @param[in]   ins the instruction handle.
-		 *
-		 * @par Availability:
-		 * - OnKernelBuild
-		 */
 		int32_t offset =  GTPin_InsOffset(head);
 
     GTPinMem mem = NULL;
     status = GTPin_MemClaim(kernel, sizeof(uint32_t), &mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-		/*!
-		* Insert instrumentaion (Opcodeprof) that counts the number of dynamic executions of basic block
-		* *countSlot++
-		*
-		* @ingroup INSTRUMENTATION
-		*
-		* @param[in]       ins         instruction to be instrumented. The instrumentation code will be inserted
-		*                              BEFORE this instruction
-		*
-		* @param[in]       countSlot   memory slot to store the resulting counter in. The slot should be allocate
-		*                              by the GTPin_MemClaim() function, prior to this function call
-		*
-		* @return  Success/failure status
-		*
-		* @par Availability:
-		* - OnKernelBuild
-		*/
     status = GTPin_OpcodeprofInstrument(head, mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
@@ -285,7 +296,7 @@ onKernelBuild
 	// add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
   data.name = kernel_name;
   data.call_count = 0;
-	data.loadmap_module_id = save_opencl_binary(kernel, kernel_name);
+	data.loadmap_module_id = add_opencl_binary_to_loadmap(kernel_name);
 	
 	kernel_data_map_insert1((uint64_t)kernel, data);
   ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 838426b888..b07e988c05 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -71,11 +71,17 @@
 //******************************************************************************
 
 #ifndef HPCRUN_STATIC_LINK
+static gotcha_wrappee_handle_t clBuildProgram_handle;
 static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
 static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
 static gotcha_wrappee_handle_t clEnqueueReadBuffer_handle;
 static gotcha_wrappee_handle_t clEnqueueWriteBuffer_handle;
 static atomic_long correlation_id;
+static char *debugInfoFullFileName;
+
+
+#define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
+#define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
 
 
@@ -103,6 +109,20 @@ getCorrelationId
 }
 
 
+static void // records the debug-info file path in the file-level debugInfoFullFileName variable
+setDebugInfoFullFileName
+(
+	char *fileName // path to remember, or NULL to select the built-in default name
+)
+{
+	if (fileName != NULL) {
+		debugInfoFullFileName = fileName;	// the pointer itself is stored; the string is not copied
+	} else {
+		debugInfoFullFileName = "opencl_main.debuginfo";	// relative default; the same name dumpIntelGPUBinary writes
+	}
+}
+
+
 static void
 initializeKernelCallBackInfo
 (
@@ -298,6 +318,88 @@ clEnqueueWriteBuffer_wrapper
   return return_status;
 }
 
+
+// We dump the debug info to a file temporarily because the binary does not have a debug section.
+// FIXME: poorly written code
+static char*
+dumpIntelGPUBinary(cl_program program) {
+	int device_count = 1;
+	cl_int status = CL_SUCCESS;
+	size_t *binary_size = (size_t*)malloc(sizeof(size_t) * device_count);
+
+	status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t), binary_size, NULL);
+	assert(status == CL_SUCCESS);
+	uint8_t **binary = (uint8_t**)malloc(device_count * sizeof(uint8_t*));
+	for (size_t i = 0; i < device_count; ++i) {
+		binary[i] = (uint8_t*)malloc(binary_size[i] * sizeof(uint8_t));
+	}
+
+	status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(uint8_t*), binary, NULL);
+	assert(status == CL_SUCCESS);
+
+	FILE *bin_ptr;
+	bin_ptr = fopen("opencl_main.gpubin", "wb");
+	fwrite(binary[0], binary_size[0], 1, bin_ptr);
+
+  // SECOND
+	size_t *debug_info_size = (size_t*)malloc(sizeof(size_t) * device_count);
+
+	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL,	sizeof(size_t), debug_info_size, NULL);
+	assert(status == CL_SUCCESS);
+	uint8_t **debug_info = (uint8_t**)malloc(device_count * sizeof(uint8_t*));
+	for (size_t i = 0; i < device_count; ++i) {
+		debug_info[i] = (uint8_t*)malloc(debug_info_size[i] * sizeof(uint8_t));
+	}
+
+	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(uint8_t*), debug_info, NULL);
+	assert(status == CL_SUCCESS);
+
+	bin_ptr = fopen("opencl_main.debuginfo", "wb");
+	fwrite(debug_info[0], debug_info_size[0], 1, bin_ptr);
+	fclose(bin_ptr);
+  ETMSG(OPENCL, "Intel GPU files dumped successfully");
+	return realpath("opencl_main.debuginfo", NULL);
+}
+
+
+static void // handed to the wrapped clBuildProgram as its pfn_notify argument by clBuildProgram_wrapper
+clBuildProgramCallback
+(
+	cl_program program, // program whose binary and debug info get written to disk
+	void* user_data // not referenced in this body
+)
+{
+	char* debugInfoFullPath = dumpIntelGPUBinary(program); // result of realpath() on the dumped debug-info file
+	setDebugInfoFullFileName(debugInfoFullPath); // a NULL path makes the setter fall back to its default name
+}
+
+
+// One downside of this approach is that we may override the callback provided by the user.
+static cl_int
+clBuildProgram_wrapper
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id* device_list,
+ const char* options,
+ void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+ void* user_data
+)
+{
+  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
+  clbuildprogram_t clBuildProgram_wrappee = 
+    GOTCHA_GET_TYPED_WRAPPEE(clBuildProgram_handle, clbuildprogram_t);
+
+	char optionsWithDebugFlag[] = " -gline-tables-only ";
+	printf("%s\n", optionsWithDebugFlag);
+	if (options != NULL) {
+		strcat(optionsWithDebugFlag, options);
+	}
+	printf("%s\n", optionsWithDebugFlag);
+
+  return clBuildProgram_wrappee(program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data);
+}
+
 #endif
 
 
@@ -308,6 +410,11 @@ clEnqueueWriteBuffer_wrapper
 
 #ifndef HPCRUN_STATIC_LINK
 static gotcha_binding_t opencl_bindings[] = {
+  {
+    "clBuildProgram",
+    (void*) clBuildProgram_wrapper,
+    &clBuildProgram_handle
+  },
   {
     "clCreateCommandQueue",
     (void*) clCreateCommandQueue_wrapper,
@@ -337,6 +444,16 @@ static gotcha_binding_t opencl_bindings[] = {
 // interface operations
 //******************************************************************************
 
+char* // path most recently stored by setDebugInfoFullFileName; NULL if it has never run
+getDebugInfoFullFileName
+(
+	void
+)
+{
+	return debugInfoFullFileName; // NOTE(review): the variable is declared after `#ifndef HPCRUN_STATIC_LINK` - confirm static builds still compile
+}
+
+
 void
 opencl_intercept_setup
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 89e4cd3584..8ea1a2f3ce 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -113,6 +113,17 @@ typedef cl_int (*clwritebuffer_t)(
 );
 
 
+typedef cl_int (*clbuildprogram_t) // pointer type used with GOTCHA_GET_TYPED_WRAPPEE to call the wrapped clBuildProgram
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id* device_list,
+ const char* options, // build option string; may be NULL (clBuildProgram_wrapper checks for that)
+ void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), // build callback
+ void* user_data // forwarded unchanged by clBuildProgram_wrapper
+);
+
+
 typedef enum {
   memcpy_H2D                      = 0,
   memcpy_D2H                      = 1,
@@ -146,6 +157,13 @@ typedef struct cl_memory_callback_t {
 // interface operations
 //******************************************************************************
 
+char* // path of the dumped OpenCL debug-info file, as stored in opencl-intercept.c
+getDebugInfoFullFileName
+(
+	void
+);
+
+
 void
 opencl_intercept_setup
 (

From 0894406ccf241c751039e57452b0c276f1b1fb3a Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris19.ftm.alcf.anl.gov>
Date: Mon, 7 Sep 2020 17:14:41 +0000
Subject: [PATCH 018/177] updated loadmap names as absolute filepaths for
 opencl

---
 src/lib/banal/Struct.cpp                      |  4 +-
 src/lib/binutils/IntelGPUbinutils.cpp         |  4 +-
 .../instrumentation/opencl-instrumentation.c  | 55 ++++++-------------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 16 +++---
 4 files changed, 29 insertions(+), 50 deletions(-)

diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index c983a7a036..fb0ef67692 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -710,7 +710,9 @@ makeStructure(string filename,
 
     makeWorkList(fileMap, wlPrint, wlLaunch);
 
-    Output::printLoadModuleBegin(outFile, elfFile->getFileName());
+		char *elfFileRealPath = realpath(elfFile->getFileName().c_str(), NULL);
+		std::cerr << elfFileRealPath << std::endl;
+    Output::printLoadModuleBegin(outFile, elfFileRealPath);
 
 #pragma omp parallel  default(none)				\
     shared(wlPrint, wlLaunch, num_done, output_mtx)		\
diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/IntelGPUbinutils.cpp
index fd328c954a..a7e7d4451a 100644
--- a/src/lib/binutils/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/IntelGPUbinutils.cpp
@@ -266,12 +266,12 @@ findIntelGPUbins
 				std::vector<uint8_t> debug_info(reinterpret_cast<uint8_t*>(sectionData), reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
 				extractSuccess = extract_kernelelfs(debug_info, filevector);
 				break;
-			} else if (strcmp(section_name, ".text") == 0) {
+			} /*else if (strcmp(section_name, ".text") == 0) {
 				FILE *bin_ptr;
 				bin_ptr = fopen("switch.text", "wb");
 				fwrite(sectionData, shdr->sh_size, 1, bin_ptr);
 				fclose(bin_ptr);
-			}
+			}*/
     }
   }
   bool success = fileHasDebugSection && extractSuccess;
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 9c8fe6ea37..79b6db0e21 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -5,9 +5,9 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include <gen_symbols_decoder.h>
-#include <igc_binary_decoder.h>
-#include <utils.h>
+//#include <gen_symbols_decoder.h>
+//#include <igc_binary_decoder.h>
+//#include <utils.h>
 #include <gtpin.h>
 
 
@@ -27,8 +27,10 @@
 #include <hpcrun/gpu/gpu-op-placeholders.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
+#include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/utilities/hpcrun-nanotime.h>
-
+#include <hpcrun/gpu/opencl/opencl-intercept.h>
+#include "gen_symbols_decoder_wrapper.h"
 #include "opencl-instrumentation.h"
 
 
@@ -107,34 +109,34 @@ findKernelAndInsertToLoadMap
 	const SProgramDebugDataHeaderIGC* header = (const SProgramDebugDataHeaderIGC*)(ptr);
 	ptr += sizeof(SProgramDebugDataHeaderIGC);
 
-	printf("header->NumberOfKernels: %d\n", header->NumberOfKernels);
+	ETMSG(OPENCL, "Number of kernels: %d", header->NumberOfKernels);
 	for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
 		const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
 		ptr += sizeof(SKernelDebugDataHeaderIGC);
 
 		const char* kernel_name = (const char*)(ptr);
-		char *file_name = (char*) kernel_name;
-		std::cerr << file_name <<std::endl;
+		char *file_name = (char*)hpcrun_malloc(strlen(kernel_name) + sizeof(".gpubin")); // name + suffix + NUL; sizeof(kernel_name) was only the pointer size
+		strcpy(file_name, kernel_name);
+		strcat(file_name, ".gpubin");
 
 		unsigned kernel_name_size_aligned = sizeof(uint32_t) *
 			(1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
 		ptr += kernel_name_size_aligned;
 
 		if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
-			file_name = strcat(file_name, ".gpbin");
 			FILE *fptr = fopen(file_name, "wb");
 			fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, fptr);
 
 			uint32_t hpctoolkit_module_id;
 			load_module_t *module = NULL;
 			hpcrun_loadmap_lock();
-			if ((module = hpcrun_loadmap_findByName(file_name)) == NULL) {
-				hpctoolkit_module_id = hpcrun_loadModule_add(file_name);
+			char *absoluteKernelName = realpath(file_name, NULL); 
+			if ((module = hpcrun_loadmap_findByName(absoluteKernelName)) == NULL) {
+				hpctoolkit_module_id = hpcrun_loadModule_add(absoluteKernelName);
 			} else {
 				hpctoolkit_module_id = module->id;
 			}
 			hpcrun_loadmap_unlock();
-			printf("dumped debug file size: %zu\n", kernel_header->SizeVisaDbgInBytes);
 			fclose(fptr);
 			return hpctoolkit_module_id;
 		}
@@ -154,40 +156,17 @@ add_opencl_binary_to_loadmap
 	char *kernel_name
 )
 {
-	// we need to remove this hardcoding
-	FILE *fptr = fopen("opencl_main.debug_info", "rb");
+	char *debuginfoFileName = getDebugInfoFullFileName();
+	FILE *fptr = fopen(debuginfoFileName, "rb");
 	fseek(fptr, 0L, SEEK_END);
 	size_t debug_info_size = ftell(fptr);
-	printf("debug_info_size: %zu\n", debug_info_size);
 	rewind(fptr);
-	uint8_t *debug_info = (uint8_t*)malloc(debug_info_size);
+	uint8_t *debug_info = (uint8_t*)hpcrun_malloc(debug_info_size);
 	fread(debug_info, debug_info_size, 1, fptr);
 	findKernelAndInsertToLoadMap(debug_info, kernel_name);
 }
 
 
-/*
-static uint32_t
-add_opencl_binary_to_loadmap
-(
-	char *bin_filename
-)
-{
-	uint32_t hpctoolkit_module_id;
-	load_module_t *module = NULL;
-
-	hpcrun_loadmap_lock();
-	if ((module = hpcrun_loadmap_findByName(bin_filename)) == NULL) {
-		hpctoolkit_module_id = hpcrun_loadModule_add(bin_filename);
-	} else {
-		hpctoolkit_module_id = module->id;
-	}
-	hpcrun_loadmap_unlock();
-	return hpctoolkit_module_id;
-}
-*/
-
-
 static void
 opencl_activity_notify
 (
@@ -268,7 +247,7 @@ onKernelBuild
     status = GTPin_OpcodeprofInstrument(head, mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-		mem_pair_node *m = malloc(sizeof(mem_pair_node));
+		mem_pair_node *m = hpcrun_malloc(sizeof(mem_pair_node));
 		m->offset = offset;
 		m->mem = mem;
 		m->next = NULL;
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index b07e988c05..44fee63c3c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -55,6 +55,7 @@
 
 #include <hpcrun/gpu/instrumentation/opencl-instrumentation.h>
 #include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/messages/messages.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
@@ -325,13 +326,13 @@ static char*
 dumpIntelGPUBinary(cl_program program) {
 	int device_count = 1;
 	cl_int status = CL_SUCCESS;
-	size_t *binary_size = (size_t*)malloc(sizeof(size_t) * device_count);
+	size_t *binary_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
 
 	status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t), binary_size, NULL);
 	assert(status == CL_SUCCESS);
-	uint8_t **binary = (uint8_t**)malloc(device_count * sizeof(uint8_t*));
+	uint8_t **binary = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
 	for (size_t i = 0; i < device_count; ++i) {
-		binary[i] = (uint8_t*)malloc(binary_size[i] * sizeof(uint8_t));
+		binary[i] = (uint8_t*)hpcrun_malloc(binary_size[i] * sizeof(uint8_t));
 	}
 
 	status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(uint8_t*), binary, NULL);
@@ -342,13 +343,13 @@ dumpIntelGPUBinary(cl_program program) {
 	fwrite(binary[0], binary_size[0], 1, bin_ptr);
 
   // SECOND
-	size_t *debug_info_size = (size_t*)malloc(sizeof(size_t) * device_count);
+	size_t *debug_info_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
 
 	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL,	sizeof(size_t), debug_info_size, NULL);
 	assert(status == CL_SUCCESS);
-	uint8_t **debug_info = (uint8_t**)malloc(device_count * sizeof(uint8_t*));
+	uint8_t **debug_info = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
 	for (size_t i = 0; i < device_count; ++i) {
-		debug_info[i] = (uint8_t*)malloc(debug_info_size[i] * sizeof(uint8_t));
+		debug_info[i] = (uint8_t*)hpcrun_malloc(debug_info_size[i] * sizeof(uint8_t));
 	}
 
 	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(uint8_t*), debug_info, NULL);
@@ -391,12 +392,9 @@ clBuildProgram_wrapper
     GOTCHA_GET_TYPED_WRAPPEE(clBuildProgram_handle, clbuildprogram_t);
 
 	char optionsWithDebugFlag[] = " -gline-tables-only ";
-	printf("%s\n", optionsWithDebugFlag);
 	if (options != NULL) {
 		strcat(optionsWithDebugFlag, options);
 	}
-	printf("%s\n", optionsWithDebugFlag);
-
   return clBuildProgram_wrappee(program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data);
 }
 

From b39e75982142c7322144e3526403339f02746b63 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris19.ftm.alcf.anl.gov>
Date: Mon, 7 Sep 2020 22:24:51 +0000
Subject: [PATCH 019/177] added bifurcation in hpcstruct for opencl code

---
 src/lib/banal/Struct.cpp                      |  3 ++-
 src/lib/binutils/InputFile.cpp                | 22 +++++++++++++++++--
 src/lib/binutils/IntelGPUbinutils.cpp         | 13 ++++++++++-
 .../instrumentation/opencl-instrumentation.c  |  5 ++---
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index fb0ef67692..5e2a97dd19 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -709,7 +709,8 @@ makeStructure(string filename,
     mutex output_mtx;
 
     makeWorkList(fileMap, wlPrint, wlLaunch);
-
+		
+		std::cerr << elfFile->getArch() << std::endl;
 		char *elfFileRealPath = realpath(elfFile->getFileName().c_str(), NULL);
 		std::cerr << elfFileRealPath << std::endl;
     Output::printLoadModuleBegin(outFile, elfFileRealPath);
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index a645fe6ce3..e15a4bf75f 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -119,6 +119,25 @@ read_all(int fd, void *buf, size_t count)
   return len;
 }
 
+// Report whether the ELF header's e_type equals 0xff04, the value this file treats as an Intel GPU binary.
+static bool
+isIntelGPUFile
+(
+	ElfFile *elfFile
+)
+{
+  Elf *elf = elfFile->getElf();
+  GElf_Ehdr ehdr_v;
+  GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
+  // gelf_getehdr() yields NULL on failure; leave before the diagnostics below dereference it
+  if (ehdr == NULL) return false;
+
+	int intelGPUType = 0xff04;
+	std::cerr << "ehdr->e_type: " << ehdr->e_type << std::endl;
+	std::cerr << "ehdr->e_type == intelGPUType: " << (ehdr->e_type == intelGPUType) << std::endl;
+  return ehdr->e_type == intelGPUType;
+}
+
 
 //******************************************************************************
 // interface oeprations
@@ -183,11 +202,10 @@ InputFile::openFile
 
   ElfFile *elfFile = new ElfFile;
   bool result = elfFile->open(file_buffer, f_size, filename);
-	bool isIntelGPUFile = true;
 
   if (result) {
     filevector = new ElfFileVector;
-		if (isIntelGPUFile) {
+		if (isIntelGPUFile(elfFile)) {
 			findIntelGPUbins(elfFile, filevector);
 		}
 		else {
diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/IntelGPUbinutils.cpp
index a7e7d4451a..ea13f7c794 100644
--- a/src/lib/binutils/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/IntelGPUbinutils.cpp
@@ -257,7 +257,7 @@ findIntelGPUbins
 			if (!shdr) continue;
 			char *sectionData = elfSectionGetData(file_buffer, shdr);
 			const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-			std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
+			//std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
 
 			// extract debug section
 			if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) == 0)
@@ -274,6 +274,17 @@ findIntelGPUbins
 			}*/
     }
   }
+	FILE *fptr;
+	if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
+		fileHasDebugSection = true;
+		fseek(fptr, 0L, SEEK_END);
+		size_t debug_info_size = ftell(fptr);
+		printf("debug_info_size: %zu\n", debug_info_size);
+		rewind(fptr);
+		std::vector<uint8_t> debug_info(debug_info_size);
+		fread(debug_info.data(), debug_info_size, 1, fptr);
+		extractSuccess = extract_kernelelfs(debug_info, filevector);
+	}
   bool success = fileHasDebugSection && extractSuccess;
   return success; 
 }
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 79b6db0e21..3e59e28813 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -98,7 +98,7 @@ createKernelNode
 }
 
 
-uint32_t
+static void
 findKernelAndInsertToLoadMap
 (
 	uint8_t *debuginfo,
@@ -138,7 +138,6 @@ findKernelAndInsertToLoadMap
 			}
 			hpcrun_loadmap_unlock();
 			fclose(fptr);
-			return hpctoolkit_module_id;
 		}
 		// Should be zero for newest drivers
 		assert(kernel_header->SizeGenIsaDbgInBytes == 0);
@@ -146,7 +145,6 @@ findKernelAndInsertToLoadMap
 		ptr += kernel_header->SizeVisaDbgInBytes;
 		ptr += kernel_header->SizeGenIsaDbgInBytes;
 	}
-	return -1;
 }
 
 
@@ -258,6 +256,7 @@ onKernelBuild
 			isHeadNull = false;
 		} else {
 			current->next = m;
+			current = current->next;
 		}
   }
 	if (h != NULL) {

From ba49c3c5e1edade2eeab4bb2b19aae4378e0c406 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Fri, 11 Sep 2020 17:28:12 +0000
Subject: [PATCH 020/177] added intercept to get source code for opencl kernels

---
 src/lib/banal/Struct.cpp                      |   2 +-
 src/lib/binutils/IntelGPUbinutils.cpp         |  11 +-
 .../instrumentation/opencl-instrumentation.c  |  58 ++++-
 .../instrumentation/opencl-instrumentation.h  |  67 ++++++
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 222 +++++++++++-------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |  10 +
 6 files changed, 275 insertions(+), 95 deletions(-)

diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 5e2a97dd19..bfce07ecd4 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -143,7 +143,7 @@ using namespace std;
 #endif
 
 #define DEBUG_CFG_SOURCE  0
-#define DEBUG_MAKE_SKEL   1 //0
+#define DEBUG_MAKE_SKEL   0 //1
 #define DEBUG_SHOW_GAPS   0
 #define DEBUG_SKEL_SUMMARY  0
 
diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/IntelGPUbinutils.cpp
index ea13f7c794..51a3e94869 100644
--- a/src/lib/binutils/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/IntelGPUbinutils.cpp
@@ -184,19 +184,18 @@ extract_kernelelfs
 		ptr += sizeof(SKernelDebugDataHeaderIGC);
 
 		const char* kernel_name = reinterpret_cast<const char*>(ptr);
-		char *file_name = (char*) kernel_name;
+		char *file_name = (char*)malloc(strlen(kernel_name) + sizeof(".gpubin")); // name + suffix + NUL; sizeof(kernel_name) was only the pointer size
+		strcpy(file_name, kernel_name);
+		strcat(file_name, ".gpubin");
 
 		unsigned kernel_name_size_aligned = sizeof(uint32_t) *
 			(1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
 		ptr += kernel_name_size_aligned;
 
 		if (kernel_header->SizeVisaDbgInBytes > 0) {
-			// Parse the binary block [ptr, ptr + kernel_header->SizeVisaDbgInBytes)
-			// as a blob in standard ELF/DWARF format
-	
-			FILE *f_ptr = fopen(kernel_name, "wb");
+			/*FILE *f_ptr = fopen(kernel_name, "wb");
 			fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, f_ptr);
-			fclose(f_ptr);
+			fclose(f_ptr);*/
 			std::ifstream in(kernel_name);
 			std::string file_contents((std::istreambuf_iterator<char>(in)), 
 			    std::istreambuf_iterator<char>());
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 3e59e28813..64d8176029 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -1,3 +1,45 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
 
 //******************************************************************************
 // system includes
@@ -5,9 +47,6 @@
 
 #include <assert.h>
 #include <stdlib.h>
-//#include <gen_symbols_decoder.h>
-//#include <igc_binary_decoder.h>
-//#include <utils.h>
 #include <gtpin.h>
 
 
@@ -30,7 +69,6 @@
 #include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 #include <hpcrun/gpu/opencl/opencl-intercept.h>
-#include "gen_symbols_decoder_wrapper.h"
 #include "opencl-instrumentation.h"
 
 
@@ -98,7 +136,7 @@ createKernelNode
 }
 
 
-static void
+static uint32_t
 findKernelAndInsertToLoadMap
 (
 	uint8_t *debuginfo,
@@ -138,6 +176,7 @@ findKernelAndInsertToLoadMap
 			}
 			hpcrun_loadmap_unlock();
 			fclose(fptr);
+			return hpctoolkit_module_id;
 		}
 		// Should be zero for newest drivers
 		assert(kernel_header->SizeGenIsaDbgInBytes == 0);
@@ -145,6 +184,7 @@ findKernelAndInsertToLoadMap
 		ptr += kernel_header->SizeVisaDbgInBytes;
 		ptr += kernel_header->SizeGenIsaDbgInBytes;
 	}
+	return -1;
 }
 
 
@@ -155,13 +195,17 @@ add_opencl_binary_to_loadmap
 )
 {
 	char *debuginfoFileName = getDebugInfoFullFileName();
+	if (debuginfoFileName == NULL) {
+		ETMSG(OPENCL, "debug file not found");
+		return -1;	
+	}
 	FILE *fptr = fopen(debuginfoFileName, "rb");
 	fseek(fptr, 0L, SEEK_END);
 	size_t debug_info_size = ftell(fptr);
 	rewind(fptr);
 	uint8_t *debug_info = (uint8_t*)hpcrun_malloc(debug_info_size);
 	fread(debug_info, debug_info_size, 1, fptr);
-	findKernelAndInsertToLoadMap(debug_info, kernel_name);
+	return findKernelAndInsertToLoadMap(debug_info, kernel_name);
 }
 
 
@@ -192,6 +236,8 @@ opencl_kernel_block_activity_translate
 	ga->details.kernel_block.offset = offset;
 	ga->details.kernel_block.execution_count = execution_count;
 	ga->kind = GPU_ACTIVITY_KERNEL_BLOCK;
+
+  cstack_ptr_set(&(ga->next), 0);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
index 1204ac9f61..c15d6fde0f 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
@@ -1,3 +1,45 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
 
 //******************************************************************************
 // local includes
@@ -8,6 +50,31 @@
 
 
 
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef struct _SProgramDebugDataHeaderIGC  // NOTE(review): presumably the header at the start of the Intel debug-info blob - confirm against the IGC definition
+{
+    uint32_t         Magic;
+    uint32_t         Version;
+    uint32_t         Size;
+    uint32_t         Device;
+    uint32_t         SteppingId;
+    uint32_t         GPUPointerSizeInBytes;
+    uint32_t         NumberOfKernels;  // presumably the count of SKernelDebugDataHeaderIGC records that follow - TODO confirm
+} SProgramDebugDataHeaderIGC;
+
+
+typedef struct _SKernelDebugDataHeaderIGC  // header of one per-kernel record; the kernel name starts right after it
+{
+    uint32_t         KernelNameSize;        // size of the kernel name; readers skip it rounded up to a multiple of sizeof(uint32_t)
+    uint32_t         SizeVisaDbgInBytes;    // size of the block that follows the name; readers only process it when > 0
+    uint32_t         SizeGenIsaDbgInBytes;  // size of the block after that; readers assert it is 0 ("Should be zero for newest drivers")
+} SKernelDebugDataHeaderIGC;
+
+
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 44fee63c3c..3a33d0748c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -73,6 +73,7 @@
 
 #ifndef HPCRUN_STATIC_LINK
 static gotcha_wrappee_handle_t clBuildProgram_handle;
+static gotcha_wrappee_handle_t clCreateProgramWithSource_handle;
 static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
 static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
 static gotcha_wrappee_handle_t clEnqueueReadBuffer_handle;
@@ -116,10 +117,10 @@ setDebugInfoFullFileName
 	char *fileName
 )
 {
-	if (fileName != NULL) {
+	if (debugInfoFullFileName == NULL) {
+		//size_t fileNameLength = strlen(fileName);
+		//debugInfoFullFileName = (char*) malloc(sizeof(fileNameLength));
 		debugInfoFullFileName = fileName;	
-	} else {
-		debugInfoFullFileName = "opencl_main.debuginfo";	
 	}
 }
 
@@ -152,6 +153,137 @@ initializeMemoryCallBackInfo
   mem_transfer_cb->fromDeviceToHost = !fromHostToDevice;
 }
 
+// Returns the space-delimited token that follows the first "void" token in
+// kernelSourceCode, or NULL if there is none. "void" is matched rather than "kernel"
+// because "kernel" can be glued to preprocessor text ("supported\n#endif\nkernel").
+static char*
+getKernelNameFromSourceCode
+(
+	const char *kernelSourceCode
+)
+{
+	size_t codeSize = strlen(kernelSourceCode) + 1; // sizeof(kernelSourceCode) was only the pointer width
+	char *kernelCode_copy = (char*)hpcrun_malloc(codeSize); // strtok_r writes into its input, so tokenize a copy
+	if (kernelCode_copy == NULL) return NULL;
+	memcpy(kernelCode_copy, kernelSourceCode, codeSize);
+	char *tokenizerState = NULL; // strtok_r keeps its position here instead of in hidden static state
+	char *token = strtok_r(kernelCode_copy, " ", &tokenizerState);
+	while (token != NULL && strcmp(token, "void") != 0) token = strtok_r(NULL, " ", &tokenizerState);
+	if (token != NULL) token = strtok_r(NULL, " ", &tokenizerState); // step from "void" to the name (NULL if "void" was last)
+	return token; // points into kernelCode_copy
+}
+
+
+// gotcha wrapper bound to "clCreateProgramWithSource" in opencl_bindings.
+// It currently adds no behavior of its own: it fetches the wrapped entry point
+// from clCreateProgramWithSource_handle, forwards all arguments unchanged and
+// returns the wrappee's result.
+static cl_program
+clCreateProgramWithSource_wrapper
+(
+ cl_context context,
+ cl_uint count,
+ const char** strings,
+ const size_t* lengths,
+ cl_int* errcode_ret
+)
+{
+	clcreateprogramwithsource_t clCreateProgramWithSource_wrappee =
+		GOTCHA_GET_TYPED_WRAPPEE(clCreateProgramWithSource_handle, clcreateprogramwithsource_t);
+	return clCreateProgramWithSource_wrappee(context, count, strings, lengths, errcode_ret);
+	// TODO: a disabled draft used to sit here that wrote every strings[i]
+	// (lengths[i] bytes) to the hard-coded file "add.src". Before reviving it:
+	//  - derive the file name from the source text (see getKernelNameFromSourceCode)
+	//    and decide what to do when one string holds several kernels;
+	//  - fclose each file inside the loop; the draft closed only the last one;
+	//  - handle lengths == NULL, which the OpenCL API permits
+	//    (the strings are then null-terminated);
+	//  - check the fopen result before writing;
+	//  - emit ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper") first.
+}
+
+
+// we are dumping the debuginfo temporarily since the binary does not have a debug section
+// FIXME: assumes a single device per program and fixed file names in the working directory
+//
+// Fetches the program binary and the Intel debug info with clGetProgramInfo and writes them
+// to opencl_main.gpubin and opencl_main.debuginfo respectively.
+//   program      - program handed to the build callback
+//   fileNameSize - out: strlen of the (relative) debuginfo file name
+// Returns the realpath() of the debuginfo file (allocated by realpath), or NULL if a file
+// could not be written or the path could not be resolved.
+static char*
+dumpIntelGPUBinary(cl_program program, size_t *fileNameSize) {
+	// index 0: device binary, index 1: debug info
+	const cl_program_info sizeQuery[] = {CL_PROGRAM_BINARY_SIZES, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL};
+	const cl_program_info dataQuery[] = {CL_PROGRAM_BINARIES, CL_PROGRAM_DEBUG_INFO_INTEL};
+	const char *fileName[] = {"opencl_main.gpubin", "opencl_main.debuginfo"};
+	cl_int status = CL_SUCCESS;
+
+	*fileNameSize = strlen(fileName[1]);
+	for (int i = 0; i < 2; ++i) {
+		// one size and one buffer pointer: the single-device case
+		size_t size = 0;
+		status = clGetProgramInfo(program, sizeQuery[i], sizeof(size_t), &size, NULL);
+		assert(status == CL_SUCCESS);
+		uint8_t *data = (uint8_t*)hpcrun_malloc(size * sizeof(uint8_t));
+		status = clGetProgramInfo(program, dataQuery[i], sizeof(uint8_t*), &data, NULL);
+		assert(status == CL_SUCCESS);
+
+		FILE *out = fopen(fileName[i], "wb");
+		if (out == NULL) {
+			ETMSG(OPENCL, "unable to open %s for writing", fileName[i]);
+			return NULL;
+		}
+		size_t written = fwrite(data, size, 1, out);
+		// every stream is closed here; the .gpubin stream used to be left open
+		if (fclose(out) != 0 || (size > 0 && written != 1)) {
+			ETMSG(OPENCL, "unable to write %s", fileName[i]);
+			return NULL;
+		}
+	}
+
+	ETMSG(OPENCL, "Intel GPU files dumped successfully");
+	return realpath(fileName[1], NULL);
+}
+
+
+// Build-completion callback installed by clBuildProgram_wrapper: dumps the GPU files and records the debuginfo path.
+static void
+clBuildProgramCallback
+(
+	cl_program program,
+	void* user_data
+)
+{
+	size_t debugInfoNameLength; // out-parameter required by dumpIntelGPUBinary; not used afterwards
+	setDebugInfoFullFileName(dumpIntelGPUBinary(program, &debugInfoNameLength));
+}
+
+
+// Wraps clBuildProgram to add -gline-tables-only to the options; note that clBuildProgramCallback replaces any callback provided by the user
+static cl_int
+clBuildProgram_wrapper
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id* device_list,
+ const char* options,
+ void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+ void* user_data
+)
+{
+  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
+  clbuildprogram_t clBuildProgram_wrappee = GOTCHA_GET_TYPED_WRAPPEE(clBuildProgram_handle, clbuildprogram_t);
+  size_t optionsSize = sizeof(" -gline-tables-only ") + (options != NULL ? strlen(options) : 0); // flag + options + NUL
+  char *optionsWithDebugFlag = (char*)malloc(optionsSize); // strcat into the flag-sized array used to overflow it
+  if (optionsWithDebugFlag == NULL) return CL_OUT_OF_HOST_MEMORY;
+  snprintf(optionsWithDebugFlag, optionsSize, " -gline-tables-only %s", options != NULL ? options : "");
+  cl_int status = clBuildProgram_wrappee(program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data);
+  free(optionsWithDebugFlag); // same lifetime as the former stack buffer: valid only for the duration of the call
+  return status;
+}
+
 
 static cl_command_queue
 clCreateCommandQueue_wrapper
@@ -319,85 +451,6 @@ clEnqueueWriteBuffer_wrapper
   return return_status;
 }
 
-
-// we are dunping the debuginfo temporarily since the binary does not have debugsection
-// poorly written code: FIXME
-static char*
-dumpIntelGPUBinary(cl_program program) {
-	int device_count = 1;
-	cl_int status = CL_SUCCESS;
-	size_t *binary_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
-
-	status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t), binary_size, NULL);
-	assert(status == CL_SUCCESS);
-	uint8_t **binary = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
-	for (size_t i = 0; i < device_count; ++i) {
-		binary[i] = (uint8_t*)hpcrun_malloc(binary_size[i] * sizeof(uint8_t));
-	}
-
-	status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(uint8_t*), binary, NULL);
-	assert(status == CL_SUCCESS);
-
-	FILE *bin_ptr;
-	bin_ptr = fopen("opencl_main.gpubin", "wb");
-	fwrite(binary[0], binary_size[0], 1, bin_ptr);
-
-  // SECOND
-	size_t *debug_info_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
-
-	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL,	sizeof(size_t), debug_info_size, NULL);
-	assert(status == CL_SUCCESS);
-	uint8_t **debug_info = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
-	for (size_t i = 0; i < device_count; ++i) {
-		debug_info[i] = (uint8_t*)hpcrun_malloc(debug_info_size[i] * sizeof(uint8_t));
-	}
-
-	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(uint8_t*), debug_info, NULL);
-	assert(status == CL_SUCCESS);
-
-	bin_ptr = fopen("opencl_main.debuginfo", "wb");
-	fwrite(debug_info[0], debug_info_size[0], 1, bin_ptr);
-	fclose(bin_ptr);
-  ETMSG(OPENCL, "Intel GPU files dumped successfully");
-	return realpath("opencl_main.debuginfo", NULL);
-}
-
-
-static void
-clBuildProgramCallback
-(
-	cl_program program,
-	void* user_data
-)
-{
-	char* debugInfoFullPath = dumpIntelGPUBinary(program);
-	setDebugInfoFullFileName(debugInfoFullPath);
-}
-
-
-// one downside of this appproach is that we may override the callback provided by user
-static cl_int
-clBuildProgram_wrapper
-(
- cl_program program,
- cl_uint num_devices,
- const cl_device_id* device_list,
- const char* options,
- void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
- void* user_data
-)
-{
-  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
-  clbuildprogram_t clBuildProgram_wrappee = 
-    GOTCHA_GET_TYPED_WRAPPEE(clBuildProgram_handle, clbuildprogram_t);
-
-	char optionsWithDebugFlag[] = " -gline-tables-only ";
-	if (options != NULL) {
-		strcat(optionsWithDebugFlag, options);
-	}
-  return clBuildProgram_wrappee(program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data);
-}
-
 #endif
 
 
@@ -413,6 +466,11 @@ static gotcha_binding_t opencl_bindings[] = {
     (void*) clBuildProgram_wrapper,
     &clBuildProgram_handle
   },
+  {
+    "clCreateProgramWithSource",
+    (void*) clCreateProgramWithSource_wrapper,
+    &clCreateProgramWithSource_handle
+  },
   {
     "clCreateCommandQueue",
     (void*) clCreateCommandQueue_wrapper,
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 8ea1a2f3ce..30c6ec6d19 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -124,6 +124,16 @@ typedef cl_int (*clbuildprogram_t)
 );
 
 
+typedef cl_program (*clcreateprogramwithsource_t)  // pointer type through which the wrapped clCreateProgramWithSource is called
+(
+ cl_context context,
+ cl_uint count,
+ const char** strings,
+ const size_t* lengths,
+ cl_int* errcode_ret
+);
+
+
 typedef enum {
   memcpy_H2D                      = 0,
   memcpy_D2H                      = 1,

From 25b5a8706c196e3b004d25fbb851007c7b280ff7 Mon Sep 17 00:00:00 2001
From: Keren Zhou <jokeren@jlselogin2.ftm.alcf.anl.gov>
Date: Sun, 13 Sep 2020 16:50:37 +0000
Subject: [PATCH 021/177] Rewrite configure files and remove all the manual
 dependencies, at least it compiles

---
 Makefile.in                                   |   9 +
 configure                                     | 241 ++++-
 configure.ac                                  | 158 +++-
 doc/Makefile.in                               |   9 +
 doc/man/Makefile.in                           |   9 +
 doc/manual/Makefile.in                        |   9 +
 doc/www/Makefile.in                           |   9 +
 lib/Makefile.in                               |   9 +
 src/Makefile.in                               |   9 +
 src/lib/Makefile.in                           |   9 +
 src/lib/analysis/Makefile.in                  |   9 +
 src/lib/banal/Makefile.in                     |   9 +
 src/lib/binutils/InputFile.cpp                |   2 +-
 src/lib/binutils/Makefile.am                  |  18 +-
 src/lib/binutils/Makefile.in                  |  78 +-
 .../binutils/{ => intel}/IntelGPUbinutils.cpp |   0
 .../binutils/{ => intel}/IntelGPUbinutils.hpp |   0
 src/lib/binutils/intel/dwarf.h                |  41 +
 src/lib/binutils/intel/dwarf_parser.h         | 144 +++
 src/lib/binutils/intel/dwarf_state_machine.h  | 210 +++++
 src/lib/binutils/intel/elf.h                  |  45 +
 src/lib/binutils/intel/elf_parser.h           | 129 +++
 src/lib/binutils/intel/gen_binary_decoder.h   |  60 ++
 src/lib/binutils/intel/gen_symbols_decoder.h  | 108 +++
 src/lib/binutils/intel/igc_binary_decoder.h   |  98 +++
 src/lib/binutils/intel/leb128.h               |  74 ++
 src/lib/binutils/intel/metric_device.h        |  69 ++
 src/lib/binutils/intel/metric_utils.h         |  95 ++
 src/lib/binutils/intel/shared_library.h       |  72 ++
 src/lib/binutils/intel/utils.h                | 117 +++
 src/lib/binutils/intel/ze_tracer.h            | 104 +++
 src/lib/binutils/intel/ze_utils.h             | 143 +++
 src/lib/isa/Makefile.in                       |   9 +
 src/lib/prof-lean/Makefile.in                 |   9 +
 src/lib/prof/Makefile.in                      |   9 +
 src/lib/profxml/Makefile.in                   |   9 +
 src/lib/stubs-gcc_s/Makefile.in               |   9 +
 src/lib/support-lean/Makefile.in              |   9 +
 src/lib/support/Makefile.in                   |   9 +
 src/lib/xml/Makefile.in                       |   9 +
 src/tool/Makefile.in                          |   9 +
 src/tool/hpcfnbounds/Makefile.in              |   9 +
 src/tool/hpcfnbounds2/Makefile.in             |   9 +
 src/tool/hpclump/Makefile.in                  |   9 +
 src/tool/hpcprof-flat/Makefile.in             |   9 +
 src/tool/hpcprof-mpi/Makefile.in              |   9 +
 src/tool/hpcprof/Makefile.in                  |   9 +
 src/tool/hpcproftt/Makefile.in                |   9 +
 src/tool/hpcrun-flat/Makefile.in              |   9 +
 src/tool/hpcrun/Makefile.am                   |  47 +-
 src/tool/hpcrun/Makefile.in                   | 824 ++++++++----------
 src/tool/hpcrun/utilities/bgq-cnk/Makefile.in |   9 +
 src/tool/hpcserver/Makefile.in                |   9 +
 src/tool/hpcserver/mpi/Makefile.in            |   9 +
 src/tool/hpcstruct/Makefile.in                |   9 +
 src/tool/hpctracedump/Makefile.in             |   9 +
 src/tool/misc/Makefile.in                     |   9 +
 src/tool/xprof/Makefile.in                    |   9 +
 58 files changed, 2642 insertions(+), 541 deletions(-)
 rename src/lib/binutils/{ => intel}/IntelGPUbinutils.cpp (100%)
 rename src/lib/binutils/{ => intel}/IntelGPUbinutils.hpp (100%)
 create mode 100644 src/lib/binutils/intel/dwarf.h
 create mode 100644 src/lib/binutils/intel/dwarf_parser.h
 create mode 100644 src/lib/binutils/intel/dwarf_state_machine.h
 create mode 100644 src/lib/binutils/intel/elf.h
 create mode 100644 src/lib/binutils/intel/elf_parser.h
 create mode 100644 src/lib/binutils/intel/gen_binary_decoder.h
 create mode 100644 src/lib/binutils/intel/gen_symbols_decoder.h
 create mode 100644 src/lib/binutils/intel/igc_binary_decoder.h
 create mode 100644 src/lib/binutils/intel/leb128.h
 create mode 100644 src/lib/binutils/intel/metric_device.h
 create mode 100644 src/lib/binutils/intel/metric_utils.h
 create mode 100644 src/lib/binutils/intel/shared_library.h
 create mode 100644 src/lib/binutils/intel/utils.h
 create mode 100644 src/lib/binutils/intel/ze_tracer.h
 create mode 100644 src/lib/binutils/intel/ze_utils.h

diff --git a/Makefile.in b/Makefile.in
index 30c8dee7ae..f9c4ceb6c5 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -341,9 +341,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/configure b/configure
index 329823775b..b157f0b67e 100755
--- a/configure
+++ b/configure
@@ -657,9 +657,24 @@ OPT_ROCM_IFLAGS
 OPT_ROCM
 OPT_ENABLE_ROCM_FALSE
 OPT_ENABLE_ROCM_TRUE
+OPT_GTPIN_LDFLAGS
+OPT_GTPIN_IFLAGS
+OPT_GTPIN
+OPT_ENABLE_GTPIN_FALSE
+OPT_ENABLE_GTPIN_TRUE
+OPT_METRICS_DISCOVERY_LDFLAGS
+OPT_METRICS_DISCOVERY_IFLAGS
+OPT_METRICS_DISCOVERY
+OPT_ENABLE_METRICS_DISCOVERY_FALSE
+OPT_ENABLE_METRICS_DISCOVERY_TRUE
+OPT_IGC_LDFLAGS
+OPT_IGC_IFLAGS
+OPT_IGC
+OPT_ENABLE_IGC_FALSE
+OPT_ENABLE_IGC_TRUE
 OPT_OPENCL_IFLAGS
-ENABLE_OPENCL_FALSE
-ENABLE_OPENCL_TRUE
+OPT_ENABLE_OPENCL_FALSE
+OPT_ENABLE_OPENCL_TRUE
 OPT_CUPTI_LDFLAGS
 OPT_CUPTI_IFLAGS
 OPT_CUPTI
@@ -1059,6 +1074,9 @@ enable_xop
 with_cuda
 with_cupti
 with_opencl
+with_igc
+with_metrics_discovery
+with_gtpin
 with_rocm
 with_level0
 enable_data_centric_tracing
@@ -1812,6 +1830,10 @@ Optional Packages:
   --with-cupti=PATH       path to cupti install directory, default is from
                           cuda
   --with-opencl=PATH      path to opencl headers
+  --with-igc=PATH         path to igc install directory
+  --with-metrics-discovery=PATH
+                          path to metrics-discovery install directory
+  --with-gtpin=PATH       path to gtpin install directory
   --with-rocm=PATH        use given ROCM installation (absolute path) with
                           hpcrun (default is NO)
   --with-level0=PATH      use given Level Zero installation (absolute path)
@@ -23594,9 +23616,12 @@ if test "${with_opencl+set}" = set; then :
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for opencl" >&5
+$as_echo_n "checking for opencl... " >&6; }
+
 case "$OPENCL" in
   /* )
-    if test ! -f "${OPENCL}/CL/cl.h" ; then
+    if test ! -f "${OPENCL}/include/CL/cl.h" ; then
       as_fn_error $? "unable to find CL/cl.h in: $OPENCL" "$LINENO" 5
     else
       OPT_OPENCL_IFLAGS="-I${OPENCL}"
@@ -23620,14 +23645,194 @@ $as_echo "$as_me: The <CL/cl.h> header file is available." >&6;}
 fi
 
  if test "$OPT_HAVE_OPENCL" = yes; then
-  ENABLE_OPENCL_TRUE=
-  ENABLE_OPENCL_FALSE='#'
+  OPT_ENABLE_OPENCL_TRUE=
+  OPT_ENABLE_OPENCL_FALSE='#'
+else
+  OPT_ENABLE_OPENCL_TRUE='#'
+  OPT_ENABLE_OPENCL_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $OPENCL" >&5
+$as_echo "$OPENCL" >&6; }
+
+
+
+#-------------------------------------------------
+# Option: --with-igc=PATH
+#-------------------------------------------------
+
+OPT_HAVE_IGC=no
+OPT_IGC_IFLAGS=
+OPT_IGC_LDFLAGS=
+
+
+# Check whether --with-igc was given.
+if test "${with_igc+set}" = set; then :
+  withval=$with_igc; IGC="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for igc" >&5
+$as_echo_n "checking for igc... " >&6; }
+
+case "$IGC" in
+  /* )
+    if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then
+      as_fn_error $? "unable to find igc.opencl.h in: $IGC" "$LINENO" 5
+    fi
+    OPT_IGC_IFLAGS="-I${IGC}/include"
+
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libigc.so" ; then
+        OPT_IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc"
+        break
+      fi
+    done
+    if test "x$OPT_IGC_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libigc.so in: $IGC" "$LINENO" 5
+    fi
+    OPT_HAVE_IGC=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "igc directory must be absolute path: $IGC" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_IGC" = yes; then
+  OPT_ENABLE_IGC_TRUE=
+  OPT_ENABLE_IGC_FALSE='#'
 else
-  ENABLE_OPENCL_TRUE='#'
-  ENABLE_OPENCL_FALSE=
+  OPT_ENABLE_IGC_TRUE='#'
+  OPT_ENABLE_IGC_FALSE=
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $IGC" >&5
+$as_echo "$IGC" >&6; }
+
+
+
+
+
+#-------------------------------------------------
+# Option: --with-metrics-discovery=PATH
+#-------------------------------------------------
+
+OPT_HAVE_METRICS_DISCOVERY=no
+OPT_METRICS_DISCOVERY_IFLAGS=
+OPT_METRICS_DISCOVERY_LDFLAGS=
+
+
+# Check whether --with-metrics-discovery was given.
+if test "${with_metrics_discovery+set}" = set; then :
+  withval=$with_metrics_discovery; METRICS_DISCOVERY="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for metrics-discovery" >&5
+$as_echo_n "checking for metrics-discovery... " >&6; }
+
+case "$METRICS_DISCOVERY" in
+  /* )
+    if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then
+      as_fn_error $? "unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY" "$LINENO" 5
+    fi
+    OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include"
+
+    for lib in $multilib_path ; do
+      if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then
+        OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd"
+        break
+      fi
+    done
+    if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libmd.so in: $METRICS_DISCOVERY" "$LINENO" 5
+    fi
+    OPT_HAVE_METRICS_DISCOVERY=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "metrics-discovery directory must be absolute path: $METRICS_DISCOVERY" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_METRICS_DISCOVERY" = yes; then
+  OPT_ENABLE_METRICS_DISCOVERY_TRUE=
+  OPT_ENABLE_METRICS_DISCOVERY_FALSE='#'
+else
+  OPT_ENABLE_METRICS_DISCOVERY_TRUE='#'
+  OPT_ENABLE_METRICS_DISCOVERY_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $METRICS_DISCOVERY" >&5
+$as_echo "$METRICS_DISCOVERY" >&6; }
+
+
+
+
+
+#-------------------------------------------------
+# Option: --with-gtpin=PATH
+#-------------------------------------------------
+
+OPT_HAVE_GTPIN=no
+OPT_GTPIN_IFLAGS=
+OPT_GTPIN_LDFLAGS=
+
+
+# Check whether --with-gtpin was given.
+if test "${with_gtpin+set}" = set; then :
+  withval=$with_gtpin; GTPIN="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for gtpin" >&5
+$as_echo_n "checking for gtpin... " >&6; }
+
+case "$GTPIN" in
+  /* )
+    if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then
+      as_fn_error $? "unable to find gtpin.h in: $GTPIN" "$LINENO" 5
+    fi
+    OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/"
+
+    if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then
+      OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin"
+    fi
+
+    if test "x$OPT_GTPIN_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libgtpin.so in: $GTPIN" "$LINENO" 5
+    fi
+    OPT_HAVE_GTPIN=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "gtpin directory must be absolute path: $GTPIN" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_GTPIN" = yes; then
+  OPT_ENABLE_GTPIN_TRUE=
+  OPT_ENABLE_GTPIN_FALSE='#'
+else
+  OPT_ENABLE_GTPIN_TRUE='#'
+  OPT_ENABLE_GTPIN_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $GTPIN" >&5
+$as_echo "$GTPIN" >&6; }
+
+
+
+
+
 
 #-------------------------------------------------
 # Option: --with-rocm=PATH
@@ -24323,8 +24528,20 @@ if test -z "${OPT_ENABLE_CUPTI_TRUE}" && test -z "${OPT_ENABLE_CUPTI_FALSE}"; th
   as_fn_error $? "conditional \"OPT_ENABLE_CUPTI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${ENABLE_OPENCL_TRUE}" && test -z "${ENABLE_OPENCL_FALSE}"; then
-  as_fn_error $? "conditional \"ENABLE_OPENCL\" was never defined.
+if test -z "${OPT_ENABLE_OPENCL_TRUE}" && test -z "${OPT_ENABLE_OPENCL_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_OPENCL\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_IGC_TRUE}" && test -z "${OPT_ENABLE_IGC_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_IGC\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_METRICS_DISCOVERY_TRUE}" && test -z "${OPT_ENABLE_METRICS_DISCOVERY_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_METRICS_DISCOVERY\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_GTPIN_TRUE}" && test -z "${OPT_ENABLE_GTPIN_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_GTPIN\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${OPT_ENABLE_ROCM_TRUE}" && test -z "${OPT_ENABLE_ROCM_FALSE}"; then
@@ -26989,6 +27206,12 @@ $as_echo "$as_me:   zlib:         ${ZLIB}" >&6;}
 $as_echo "$as_me:   cuda:         ${CUDA}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   cupti:        ${CUPTI}" >&5
 $as_echo "$as_me:   cupti:        ${CUPTI}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   igc:          ${IGC}" >&5
+$as_echo "$as_me:   igc:          ${IGC}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   gtpin:        ${GTPIN}" >&5
+$as_echo "$as_me:   gtpin:        ${GTPIN}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   metrics-discovery: ${METRICS_DISCOVERY}" >&5
+$as_echo "$as_me:   metrics-discovery: ${METRICS_DISCOVERY}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   papi-c-cupti: ${use_papi_c_cupti}" >&5
 $as_echo "$as_me:   papi-c-cupti: ${use_papi_c_cupti}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   valgrind:     ${VALGRIND}" >&5
diff --git a/configure.ac b/configure.ac
index ff2117ddf7..03204b2190 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4721,9 +4721,11 @@ AC_ARG_WITH([opencl],
   [OPENCL="$withval"],
   [])
 
+AC_MSG_CHECKING([for opencl])
+
 case "$OPENCL" in
   /* )
-    if test ! -f "${OPENCL}/CL/cl.h" ; then
+    if test ! -f "${OPENCL}/include/CL/cl.h" ; then
       AC_MSG_ERROR([unable to find CL/cl.h in: $OPENCL])
     else
       OPT_OPENCL_IFLAGS="-I${OPENCL}"
@@ -4745,9 +4747,158 @@ if test "$OPT_HAVE_OPENCL" = yes ; then
   AC_MSG_NOTICE([The <CL/cl.h> header file is available.])
 fi
 
-AM_CONDITIONAL([ENABLE_OPENCL], [test "$OPT_HAVE_OPENCL" = yes])
+AM_CONDITIONAL([OPT_ENABLE_OPENCL], [test "$OPT_HAVE_OPENCL" = yes])
+
+AC_MSG_RESULT([$OPENCL])
+
 AC_SUBST([OPT_OPENCL_IFLAGS])
 
+#-------------------------------------------------
+# Option: --with-igc=PATH
+#-------------------------------------------------
+
+OPT_HAVE_IGC=no
+OPT_IGC_IFLAGS=
+OPT_IGC_LDFLAGS=
+
+AC_ARG_WITH([igc],
+  [AS_HELP_STRING([--with-igc=PATH],
+      [path to igc install directory])],
+  [IGC="$withval"],
+  [])
+
+AC_MSG_CHECKING([for igc])
+
+case "$IGC" in
+  /* )
+    if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then
+      AC_MSG_ERROR([unable to find igc.opencl.h in: $IGC])
+    fi
+    OPT_IGC_IFLAGS="-I${IGC}/include"
+
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libigc.so" ; then
+        OPT_IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc"
+        break
+      fi
+    done
+    if test "x$OPT_IGC_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libigc.so in: $IGC])
+    fi
+    OPT_HAVE_IGC=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([igc directory must be absolute path: $IGC])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_IGC], [test "$OPT_HAVE_IGC" = yes])
+
+AC_MSG_RESULT([$IGC])
+
+AC_SUBST([OPT_IGC])
+AC_SUBST([OPT_IGC_IFLAGS])
+AC_SUBST([OPT_IGC_LDFLAGS])
+
+#-------------------------------------------------
+# Option: --with-metrics-discovery=PATH
+#-------------------------------------------------
+
+OPT_HAVE_METRICS_DISCOVERY=no
+OPT_METRICS_DISCOVERY_IFLAGS=
+OPT_METRICS_DISCOVERY_LDFLAGS=
+
+AC_ARG_WITH([metrics-discovery],
+  [AS_HELP_STRING([--with-metrics-discovery=PATH],
+      [path to metrics-discovery install directory])],
+  [METRICS_DISCOVERY="$withval"],
+  [])
+
+AC_MSG_CHECKING([for metrics-discovery])
+
+case "$METRICS_DISCOVERY" in
+  /* )
+    if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then
+      AC_MSG_ERROR([unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY])
+    fi
+    OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include"
+
+    for lib in $multilib_path ; do
+      if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then
+        OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd"
+        break
+      fi
+    done
+    if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libmd.so in: $METRICS_DISCOVERY])
+    fi
+    OPT_HAVE_METRICS_DISCOVERY=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([metrics-discovery directory must be absolute path: $METRICS_DISCOVERY])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_METRICS_DISCOVERY], [test "$OPT_HAVE_METRICS_DISCOVERY" = yes])
+
+AC_MSG_RESULT([$METRICS_DISCOVERY])
+
+AC_SUBST([OPT_METRICS_DISCOVERY])
+AC_SUBST([OPT_METRICS_DISCOVERY_IFLAGS])
+AC_SUBST([OPT_METRICS_DISCOVERY_LDFLAGS])
+
+#-------------------------------------------------
+# Option: --with-gtpin=PATH
+#-------------------------------------------------
+
+OPT_HAVE_GTPIN=no
+OPT_GTPIN_IFLAGS=
+OPT_GTPIN_LDFLAGS=
+
+AC_ARG_WITH([gtpin],
+  [AS_HELP_STRING([--with-gtpin=PATH],
+      [path to gtpin install directory])],
+  [GTPIN="$withval"],
+  [])
+
+AC_MSG_CHECKING([for gtpin])
+
+case "$GTPIN" in
+  /* )
+    if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then
+      AC_MSG_ERROR([unable to find gtpin.h in: $GTPIN])
+    fi
+    OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/"
+
+    if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then
+      OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin"
+    fi
+
+    if test "x$OPT_GTPIN_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libgtpin.so in: $GTPIN])
+    fi
+    OPT_HAVE_GTPIN=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([gtpin directory must be absolute path: $GTPIN])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_GTPIN], [test "$OPT_HAVE_GTPIN" = yes])
+
+AC_MSG_RESULT([$GTPIN])
+
+AC_SUBST([OPT_GTPIN])
+AC_SUBST([OPT_GTPIN_IFLAGS])
+AC_SUBST([OPT_GTPIN_LDFLAGS])
+
+
 #-------------------------------------------------
 # Option: --with-rocm=PATH
 #-------------------------------------------------
@@ -5183,6 +5334,9 @@ AC_MSG_NOTICE([  xerces:       ${XERCES}])
 AC_MSG_NOTICE([  zlib:         ${ZLIB}])
 AC_MSG_NOTICE([  cuda:         ${CUDA}])
 AC_MSG_NOTICE([  cupti:        ${CUPTI}])
+AC_MSG_NOTICE([  igc:          ${IGC}])
+AC_MSG_NOTICE([  gtpin:        ${GTPIN}])
+AC_MSG_NOTICE([  metrics-discovery: ${METRICS_DISCOVERY}])
 AC_MSG_NOTICE([  papi-c-cupti: ${use_papi_c_cupti}])
 AC_MSG_NOTICE([  valgrind:     ${VALGRIND}])
 AC_MSG_NOTICE([  valgrind:     annotated: ${OPT_ENABLE_VG_ANNOTATIONS}])
diff --git a/doc/Makefile.in b/doc/Makefile.in
index dc8e3b5cc2..a20ab2984d 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -322,9 +322,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in
index a5f28dbe57..3b28c735e1 100644
--- a/doc/man/Makefile.in
+++ b/doc/man/Makefile.in
@@ -296,9 +296,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/doc/manual/Makefile.in b/doc/manual/Makefile.in
index 01986c8ee7..caaa631d57 100644
--- a/doc/manual/Makefile.in
+++ b/doc/manual/Makefile.in
@@ -293,9 +293,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/doc/www/Makefile.in b/doc/www/Makefile.in
index 3db6ccc8ed..864f7cd0e8 100644
--- a/doc/www/Makefile.in
+++ b/doc/www/Makefile.in
@@ -293,9 +293,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 5139597fea..1bf14dd93a 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -292,9 +292,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/Makefile.in b/src/Makefile.in
index 488f2d163c..d0cc63b65c 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -322,9 +322,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/Makefile.in b/src/lib/Makefile.in
index 3d501a0900..3194f6268d 100644
--- a/src/lib/Makefile.in
+++ b/src/lib/Makefile.in
@@ -334,9 +334,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/analysis/Makefile.in b/src/lib/analysis/Makefile.in
index bd19270aa4..90a520bec7 100644
--- a/src/lib/analysis/Makefile.in
+++ b/src/lib/analysis/Makefile.in
@@ -370,9 +370,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index b0d3dbd541..d069cd74d9 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -366,9 +366,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index e15a4bf75f..9a25e84662 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -75,7 +75,7 @@
 
 #include "ElfHelper.hpp"
 #include "Fatbin.hpp"
-#include "IntelGPUbinutils.hpp"
+#include "intel/IntelGPUbinutils.hpp"
 #include "InputFile.hpp"
 
 
diff --git a/src/lib/binutils/Makefile.am b/src/lib/binutils/Makefile.am
index 8a35ff4e79..1223a53e40 100644
--- a/src/lib/binutils/Makefile.am
+++ b/src/lib/binutils/Makefile.am
@@ -58,7 +58,7 @@
 #############################################################################
 
 # We do not want the standard GNU files (NEWS README AUTHORS ChangeLog...)
-AUTOMAKE_OPTIONS = foreign
+AUTOMAKE_OPTIONS = foreign subdir-objects
 
 #############################################################################
 # Common settings
@@ -75,6 +75,14 @@ if OPT_DYNINST_LIBDWARF
 DWARF_IFLAGS = -I$(LIBDWARF_INC)
 endif
 
+if OPT_ENABLE_IGC
+IGC_IFLAGS = @OPT_IGC_IFLAGS@
+endif
+
+if OPT_ENABLE_METRICS_DISCOVERY
+MD_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+endif
+
 #############################################################################
 # Local settings
 #############################################################################
@@ -95,10 +103,10 @@ MYSOURCES = \
 	BinUtils.hpp BinUtils.cpp \
 	VMAInterval.hpp VMAInterval.cpp \
 	Fatbin.cpp \
-	IntelGPUbinutils.cpp \
 	ElfHelper.cpp \
 	InputFile.cpp \
-	RelocateCubin.cpp 
+	RelocateCubin.cpp \
+	intel/IntelGPUbinutils.cpp
 
 
 #############################################################################
@@ -107,7 +115,9 @@ MYCXXFLAGS = @HOST_CXXFLAGS@ $(HPC_IFLAGS) @BINUTILS_IFLAGS@ \
 	$(BOOST_IFLAGS)  \
 	$(DWARF_IFLAGS)  \
 	-I$(LIBELF_INC)  \
-	$(DYNINST_IFLAGS)
+	$(DYNINST_IFLAGS) \
+	$(IGC_IFLAGS) \
+	$(MD_IFLAGS)
 
 if IS_HOST_AR
   MYAR = @HOST_AR@
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index bb29f47868..ff79979420 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -138,6 +138,7 @@ CONFIG_CLEAN_VPATH_FILES =
 LTLIBRARIES = $(noinst_LTLIBRARIES)
 am__DEPENDENCIES_1 =
 libHPCbinutils_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
+am__dirstamp = $(am__leading_dot)dirstamp
 am__objects_1 = libHPCbinutils_la-LM.lo libHPCbinutils_la-Seg.lo \
 	libHPCbinutils_la-Proc.lo libHPCbinutils_la-Insn.lo \
 	libHPCbinutils_la-LinuxKernelSymbols.lo \
@@ -145,10 +146,10 @@ am__objects_1 = libHPCbinutils_la-LM.lo libHPCbinutils_la-Seg.lo \
 	libHPCbinutils_la-SimpleSymbolsFactories.lo \
 	libHPCbinutils_la-Dbg-LM.lo libHPCbinutils_la-Dbg-Proc.lo \
 	libHPCbinutils_la-BinUtils.lo libHPCbinutils_la-VMAInterval.lo \
-	libHPCbinutils_la-Fatbin.lo \
-	libHPCbinutils_la-IntelGPUbinutils.lo \
-	libHPCbinutils_la-ElfHelper.lo libHPCbinutils_la-InputFile.lo \
-	libHPCbinutils_la-RelocateCubin.lo
+	libHPCbinutils_la-Fatbin.lo libHPCbinutils_la-ElfHelper.lo \
+	libHPCbinutils_la-InputFile.lo \
+	libHPCbinutils_la-RelocateCubin.lo \
+	intel/libHPCbinutils_la-IntelGPUbinutils.lo
 am_libHPCbinutils_la_OBJECTS = $(am__objects_1)
 libHPCbinutils_la_OBJECTS = $(am_libHPCbinutils_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
@@ -377,9 +378,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
@@ -500,7 +510,7 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 
 # We do not want the standard GNU files (NEWS README AUTHORS ChangeLog...)
-AUTOMAKE_OPTIONS = foreign
+AUTOMAKE_OPTIONS = foreign subdir-objects
 HPC_IFLAGS = -I@abs_top_srcdir@/src -I@abs_top_builddir@/src
 
 ############################################################
@@ -530,6 +540,8 @@ HPCLIB_XML = $(top_builddir)/src/lib/xml/libHPCxml.la
 HPCLIB_Support = $(top_builddir)/src/lib/support/libHPCsupport.la
 HPCLIB_SupportLean = $(top_builddir)/src/lib/support-lean/libHPCsupport-lean.la
 @OPT_DYNINST_LIBDWARF_TRUE@DWARF_IFLAGS = -I$(LIBDWARF_INC)
+@OPT_ENABLE_IGC_TRUE@IGC_IFLAGS = @OPT_IGC_IFLAGS@
+@OPT_ENABLE_METRICS_DISCOVERY_TRUE@MD_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
 
 #############################################################################
 # Local settings
@@ -550,10 +562,10 @@ MYSOURCES = \
 	BinUtils.hpp BinUtils.cpp \
 	VMAInterval.hpp VMAInterval.cpp \
 	Fatbin.cpp \
-	IntelGPUbinutils.cpp \
 	ElfHelper.cpp \
 	InputFile.cpp \
-	RelocateCubin.cpp 
+	RelocateCubin.cpp \
+	intel/IntelGPUbinutils.cpp
 
 
 #############################################################################
@@ -562,7 +574,9 @@ MYCXXFLAGS = @HOST_CXXFLAGS@ $(HPC_IFLAGS) @BINUTILS_IFLAGS@ \
 	$(BOOST_IFLAGS)  \
 	$(DWARF_IFLAGS)  \
 	-I$(LIBELF_INC)  \
-	$(DYNINST_IFLAGS)
+	$(DYNINST_IFLAGS) \
+	$(IGC_IFLAGS) \
+	$(MD_IFLAGS)
 
 @IS_HOST_AR_FALSE@MYAR = $(AR) cru
 @IS_HOST_AR_TRUE@MYAR = @HOST_AR@
@@ -632,12 +646,22 @@ clean-noinstLTLIBRARIES:
 	  echo rm -f $${locs}; \
 	  rm -f $${locs}; \
 	}
+intel/$(am__dirstamp):
+	@$(MKDIR_P) intel
+	@: > intel/$(am__dirstamp)
+intel/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) intel/$(DEPDIR)
+	@: > intel/$(DEPDIR)/$(am__dirstamp)
+intel/libHPCbinutils_la-IntelGPUbinutils.lo: intel/$(am__dirstamp) \
+	intel/$(DEPDIR)/$(am__dirstamp)
 
 libHPCbinutils.la: $(libHPCbinutils_la_OBJECTS) $(libHPCbinutils_la_DEPENDENCIES) $(EXTRA_libHPCbinutils_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libHPCbinutils_la_LINK)  $(libHPCbinutils_la_OBJECTS) $(libHPCbinutils_la_LIBADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
+	-rm -f intel/*.$(OBJEXT)
+	-rm -f intel/*.lo
 
 distclean-compile:
 	-rm -f *.tab.c
@@ -649,7 +673,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Fatbin.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-InputFile.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Insn.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-LM.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-LinuxKernelSymbols.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Proc.Plo@am__quote@
@@ -658,24 +681,28 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-SimpleSymbols.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-SimpleSymbolsFactories.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-VMAInterval.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo@am__quote@
 
 .cpp.o:
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
+@am__fastdepCXX_TRUE@	$(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCXX_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
 
 .cpp.obj:
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\
+@am__fastdepCXX_TRUE@	$(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\
+@am__fastdepCXX_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
 
 .cpp.lo:
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\
+@am__fastdepCXX_TRUE@	$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCXX_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Plo
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
@@ -764,13 +791,6 @@ libHPCbinutils_la-Fatbin.lo: Fatbin.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-Fatbin.lo `test -f 'Fatbin.cpp' || echo '$(srcdir)/'`Fatbin.cpp
 
-libHPCbinutils_la-IntelGPUbinutils.lo: IntelGPUbinutils.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-IntelGPUbinutils.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'IntelGPUbinutils.cpp' || echo '$(srcdir)/'`IntelGPUbinutils.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='IntelGPUbinutils.cpp' object='libHPCbinutils_la-IntelGPUbinutils.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'IntelGPUbinutils.cpp' || echo '$(srcdir)/'`IntelGPUbinutils.cpp
-
 libHPCbinutils_la-ElfHelper.lo: ElfHelper.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-ElfHelper.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-ElfHelper.Tpo -c -o libHPCbinutils_la-ElfHelper.lo `test -f 'ElfHelper.cpp' || echo '$(srcdir)/'`ElfHelper.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbinutils_la-ElfHelper.Tpo $(DEPDIR)/libHPCbinutils_la-ElfHelper.Plo
@@ -792,11 +812,19 @@ libHPCbinutils_la-RelocateCubin.lo: RelocateCubin.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-RelocateCubin.lo `test -f 'RelocateCubin.cpp' || echo '$(srcdir)/'`RelocateCubin.cpp
 
+intel/libHPCbinutils_la-IntelGPUbinutils.lo: intel/IntelGPUbinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbinutils_la-IntelGPUbinutils.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo -c -o intel/libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelGPUbinutils.cpp' object='intel/libHPCbinutils_la-IntelGPUbinutils.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
+
 mostlyclean-libtool:
 	-rm -f *.lo
 
 clean-libtool:
 	-rm -rf .libs _libs
+	-rm -rf intel/.libs intel/_libs
 
 ID: $(am__tagged_files)
 	$(am__define_uniq_tagged_files); mkid -fID $$unique
@@ -911,6 +939,8 @@ clean-generic:
 distclean-generic:
 	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
 	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+	-rm -f intel/$(DEPDIR)/$(am__dirstamp)
+	-rm -f intel/$(am__dirstamp)
 
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
@@ -921,7 +951,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
 	mostlyclean-am
 
 distclean: distclean-am
-	-rm -rf ./$(DEPDIR)
+	-rm -rf ./$(DEPDIR) intel/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
@@ -967,7 +997,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-am
-	-rm -rf ./$(DEPDIR)
+	-rm -rf ./$(DEPDIR) intel/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/intel/IntelGPUbinutils.cpp
similarity index 100%
rename from src/lib/binutils/IntelGPUbinutils.cpp
rename to src/lib/binutils/intel/IntelGPUbinutils.cpp
diff --git a/src/lib/binutils/IntelGPUbinutils.hpp b/src/lib/binutils/intel/IntelGPUbinutils.hpp
similarity index 100%
rename from src/lib/binutils/IntelGPUbinutils.hpp
rename to src/lib/binutils/intel/IntelGPUbinutils.hpp
diff --git a/src/lib/binutils/intel/dwarf.h b/src/lib/binutils/intel/dwarf.h
new file mode 100644
index 0000000000..e033c284de
--- /dev/null
+++ b/src/lib/binutils/intel/dwarf.h
@@ -0,0 +1,41 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_DWARF_H_
+#define PTI_SAMPLES_UTILS_DWARF_H_
+
+#include <stdint.h>
+
+#define DWARF_VERSION 4
+
+#define DW_LNS_COPY             0x01
+#define DW_LNS_ADVANCE_PC       0x02
+#define DW_LNS_ADVANCE_LINE     0x03
+#define DW_LNS_SET_FILE         0x04
+#define DW_LNS_SET_COLUMN       0x05
+#define DW_LNS_NEGATE_STMT      0x06
+#define DW_LNS_SET_BASIC_BLOCK  0x07
+#define DW_LNS_CONST_ADD_PC     0x08
+#define DW_LNS_FIXED_ADVANCE_PC 0x09
+
+#define DW_LNS_END_SEQUENCE     0x01  // NOTE(review): same value as DW_LNS_COPY; presumably the extended opcode (DW_LNE_end_sequence) - confirm
+#define DW_LNE_SET_ADDRESS      0x02
+
+#pragma pack(push, 1)
+struct Dwarf32Header {  // packed to 1 byte so it can be overlaid on raw bytes with reinterpret_cast
+  uint32_t unit_length;
+  uint16_t version;  // DwarfParser::IsValid() requires this to equal DWARF_VERSION
+  uint32_t header_length;
+  uint8_t  minimum_instruction_length;
+  uint8_t  maximum_operations_per_instruction;
+  uint8_t  default_is_stmt;
+  int8_t   line_base;
+  uint8_t  line_range;  // divisor applied to the adjusted special opcode in RunSpecial()
+  uint8_t  opcode_base;  // opcode bytes >= this value are dispatched as special opcodes
+};
+#pragma pack(pop)
+
+#endif // PTI_SAMPLES_UTILS_DWARF_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/dwarf_parser.h b/src/lib/binutils/intel/dwarf_parser.h
new file mode 100644
index 0000000000..08e37fbf86
--- /dev/null
+++ b/src/lib/binutils/intel/dwarf_parser.h
@@ -0,0 +1,180 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_DWARF_PARSER_H_
+#define PTI_SAMPLES_UTILS_DWARF_PARSER_H_
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "dwarf_state_machine.h"
+
+// (address, line) pairs, in the order GetLineInfo() emits them.
+using LineInfo = std::vector< std::pair<uint32_t, uint32_t> >;
+
+// Reads the line-number information of one DWARF version-4 (32-bit) unit.
+//
+// The parser keeps only a pointer to the caller's buffer: it neither copies
+// nor frees the bytes, so they must outlive the parser. Beyond the checks in
+// IsValid(), malformed input is caught only by assert(), i.e. not at all when
+// NDEBUG is defined.
+class DwarfParser {
+ public:
+  // data/size: the raw bytes of the unit, beginning with its Dwarf32Header.
+  DwarfParser(const uint8_t* data, uint32_t size) : data_(data), size_(size) {}
+
+  // Returns true when the buffer is non-null, holds at least a full
+  // Dwarf32Header, and that header's version equals DWARF_VERSION.
+  bool IsValid() const {
+    if (data_ == nullptr || size_ < sizeof(Dwarf32Header)) {
+      return false;
+    }
+
+    const Dwarf32Header* header =
+      reinterpret_cast<const Dwarf32Header*>(data_);
+    if (header->version != DWARF_VERSION) {
+      return false;
+    }
+
+    return true;
+  }
+
+  // Returns the names stored in the header's file table, in table order.
+  // Returns an empty vector when IsValid() fails.
+  std::vector<std::string> GetFileNames() const {
+    if (!IsValid()) {
+      return std::vector<std::string>();
+    }
+    std::vector<std::string> file_names;
+    ProcessHeader(&file_names);
+    return file_names;
+  }
+
+  // Runs the line-number program and returns the (address, line) rows
+  // recorded for file_id, which must be > 0 (asserted). Returns an empty
+  // LineInfo when IsValid() fails or the program produces no rows.
+  LineInfo GetLineInfo(uint32_t file_id) const {
+    assert(file_id > 0);
+
+    if (!IsValid()) {
+      return LineInfo();
+    }
+
+    // The program is whatever follows the header tables, up to the end of
+    // the buffer.
+    const uint8_t* ptr = ProcessHeader(nullptr);
+    assert(ptr < data_ + size_);
+    const uint8_t* line_number_program = ptr;
+    uint32_t line_number_program_size =
+      static_cast<uint32_t>(data_ + size_ - line_number_program);
+    DwarfLineInfo line_info =
+      DwarfStateMachine(line_number_program, line_number_program_size,
+                        reinterpret_cast<const Dwarf32Header*>(data_)).Run();
+    if (line_info.empty()) {
+      return LineInfo();
+    }
+
+    return ProcessLineInfo(line_info, file_id);
+  }
+
+ private:
+  // Steps over the variable-length tables that follow Dwarf32Header
+  // (standard_opcode_lengths, include_directories, file_names) and returns a
+  // pointer to the byte after the file table's terminating 0. When file_names
+  // is non-null, every file name is appended to it.
+  //
+  // The tables are trusted: nothing here is checked against size_.
+  // NOTE(review): the header_length field is not consulted; confirm that
+  // walking the tables always lands on the first opcode of the program.
+  const uint8_t* ProcessHeader(std::vector<std::string>* file_names) const {
+    const uint8_t* ptr = data_;
+    const Dwarf32Header* header = reinterpret_cast<const Dwarf32Header*>(ptr);
+
+    ptr += sizeof(Dwarf32Header);
+
+    // standard_opcode_lengths: one entry per standard opcode (1 up to
+    // opcode_base - 1). Each entry is read with the LEB128 decoder and its
+    // value is discarded; only ptr advances.
+    for (uint8_t i = 1; i < header->opcode_base; ++i) {
+      uint32_t value = 0;
+      bool done = false;
+      ptr = utils::leb128::Decode32(ptr, value, done);
+      assert(done);
+    }
+
+    // include_directories: NUL-terminated strings, ended by an empty string
+    while (*ptr != 0) {
+      const char* include_directory = reinterpret_cast<const char*>(ptr);
+      ptr += strlen(include_directory) + 1;
+    }
+    ++ptr;
+
+    // file_names: at least one entry is required (asserted); each entry is a
+    // NUL-terminated name followed by three LEB128 fields whose values are
+    // discarded.
+    assert(*ptr != 0);
+    while (*ptr != 0) {
+      std::string file_name(reinterpret_cast<const char*>(ptr));
+      ptr += file_name.size() + 1;
+
+      bool done = false;
+      uint32_t directory_index = 0;
+      ptr = utils::leb128::Decode32(ptr, directory_index, done);
+      assert(done);
+
+      uint32_t time = 0;
+      ptr = utils::leb128::Decode32(ptr, time, done);
+      assert(done);
+
+      uint32_t size = 0;
+      ptr = utils::leb128::Decode32(ptr, size, done);
+      assert(done);
+
+      if (file_names != nullptr) {
+        file_names->push_back(file_name);
+      }
+    }
+
+    ++ptr;
+    return ptr;
+  }
+
+  // Keeps the rows of line_info that belong to `file`, dropping any row whose
+  // line number equals that of the previously kept row (initially 0). Each
+  // kept row is returned as (address truncated to 32 bits, line).
+  LineInfo ProcessLineInfo(const DwarfLineInfo& line_info,
+                           uint32_t file) const {
+    LineInfo result;
+
+    uint32_t address = 0;
+    uint32_t line = 0;
+    for (const auto& item : line_info) {
+      // No row may precede the address of the last kept row.
+      assert(address <= item.first);
+      if (item.second.first != file) {
+        continue;
+      }
+      if (line == item.second.second) {
+        continue;
+      }
+      address = static_cast<uint32_t>(item.first);
+      line = item.second.second;
+      result.push_back(std::make_pair(address, line));
+    }
+
+    return result;
+  }
+
+  const uint8_t* data_;  // caller-owned bytes of the unit; never freed here
+  uint32_t size_;        // number of bytes available at data_
+};
+
+#endif // PTI_SAMPLES_UTILS_DWARF_PARSER_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/dwarf_state_machine.h b/src/lib/binutils/intel/dwarf_state_machine.h
new file mode 100644
index 0000000000..644ee6bd10
--- /dev/null
+++ b/src/lib/binutils/intel/dwarf_state_machine.h
@@ -0,0 +1,210 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
+#define PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
+
+#include "dwarf.h"
+#include "leb128.h"
+
+using DwarfLineInfo =  // rows of (address, (file, line))
+  std::vector< std::pair<uint64_t, std::pair<uint32_t, uint32_t> > >;
+
+struct DwarfState {  // registers tracked while a line program runs
+  uint64_t address;    // current address
+  uint32_t operation;  // operation index within the current instruction
+  uint32_t line;       // current line number
+  uint32_t file;       // current file index
+};
+
+// Runs a DWARF line-number program over [data, data + size) and collects the
+// (address, (file, line)) rows it emits. Pointers are stored, not copied.
+class DwarfStateMachine {
+ public:
+  DwarfStateMachine(const uint8_t* data, uint32_t size,
+                    const Dwarf32Header* header)
+      : data_(data), size_(size), header_(header) {
+    assert(data_ != nullptr);
+    assert(size_ > 0);
+    assert(header_ != nullptr);
+    assert(header_->line_range != 0);  // used as a divisor
+    assert(header_->maximum_operations_per_instruction != 0);  // likewise
+  }
+
+  // Executes every opcode and returns a copy of the rows gathered so far.
+  DwarfLineInfo Run() {
+    const uint8_t* ptr = data_;
+    const uint8_t* end = ptr + size_;
+    while (ptr < end) {
+      if (*ptr == 0) {
+        ptr = RunExtended(ptr);
+      } else if (*ptr < header_->opcode_base) {
+        ptr = RunStandard(ptr);
+      } else {
+        ptr = RunSpecial(ptr);
+      }
+    }
+    return line_info_;
+  }
+
+ private:
+  // Reads a T operand byte by byte: operands may sit at any alignment, so
+  // dereferencing a casted pointer is not safe. Host byte order is kept.
+  template <typename T>
+  static T ReadUnaligned(const uint8_t* ptr) {
+    T value = 0;
+    for (uint32_t i = 0; i < sizeof(T); ++i) {
+      reinterpret_cast<uint8_t*>(&value)[i] = ptr[i];
+    }
+    return value;
+  }
+
+  // Special opcode: advances address and line, then emits a row.
+  const uint8_t* RunSpecial(const uint8_t* ptr) {
+    assert(*ptr >= header_->opcode_base);
+    uint8_t adjusted_opcode = (*ptr) - header_->opcode_base;
+    uint8_t operation_advance = adjusted_opcode / header_->line_range;
+    UpdateAddress(operation_advance);
+    UpdateOperation(operation_advance);
+    UpdateLine(adjusted_opcode);
+    UpdateLineInfo();
+    ++ptr;
+    assert(ptr < data_ + size_);
+    return ptr;
+  }
+
+  // Standard opcode: decodes the operand, if any, and applies it.
+  const uint8_t* RunStandard(const uint8_t* ptr) {
+    uint8_t opcode = *ptr;
+    ++ptr;
+    assert(opcode < header_->opcode_base);
+    assert(ptr < data_ + size_);
+
+    switch (opcode) {
+      case DW_LNS_COPY:
+        UpdateLineInfo();
+        break;
+      case DW_LNS_ADVANCE_PC: {
+        uint32_t operation_advance = 0;
+        bool done = false;
+        ptr = utils::leb128::Decode32(ptr, operation_advance, done);
+        assert(done && ptr < data_ + size_);
+        UpdateAddress(operation_advance);
+        UpdateOperation(operation_advance);
+        break;
+      }
+      case DW_LNS_ADVANCE_LINE: {
+        int32_t line = 0;
+        bool done = false;
+        ptr = utils::leb128::Decode32(ptr, line, done);
+        assert(done && ptr < data_ + size_);
+        state_.line += line;
+        break;
+      }
+      case DW_LNS_SET_FILE: {
+        uint32_t file = 0;
+        bool done = false;
+        ptr = utils::leb128::Decode32(ptr, file, done);
+        assert(done && ptr < data_ + size_);
+        state_.file = file;
+        break;
+      }
+      case DW_LNS_SET_COLUMN: {
+        uint32_t column = 0;  // decoded only to step over it
+        bool done = false;
+        ptr = utils::leb128::Decode32(ptr, column, done);
+        assert(done && ptr < data_ + size_);
+        break;
+      }
+      case DW_LNS_NEGATE_STMT:
+      case DW_LNS_SET_BASIC_BLOCK:
+        break;
+      case DW_LNS_CONST_ADD_PC: {
+        // Same address advance as special opcode 255, without emitting a row.
+        uint8_t adjusted_opcode = 255 - header_->opcode_base;
+        uint8_t operation_advance = adjusted_opcode / header_->line_range;
+        UpdateAddress(operation_advance);
+        UpdateOperation(operation_advance);
+        break;
+      }
+      case DW_LNS_FIXED_ADVANCE_PC: {
+        // Bounds are checked before the operand is read.
+        assert(ptr + sizeof(uint16_t) < data_ + size_);
+        state_.address += ReadUnaligned<uint16_t>(ptr);
+        state_.operation = 0;
+        ptr += sizeof(uint16_t);
+        break;
+      }
+      default:
+        assert(0); // Not supported
+        break;
+    }
+    return ptr;
+  }
+
+  // Extended opcode: a 0 byte, a one-byte size, the sub-opcode, its operand.
+  const uint8_t* RunExtended(const uint8_t* ptr) {
+    assert(*ptr == 0);
+    ++ptr;
+    assert(ptr < data_ + size_);
+    uint8_t size = *ptr;
+    assert(size > 0);
+    ++ptr;
+    assert(ptr < data_ + size_);
+    uint8_t opcode = *ptr;
+    ++ptr;
+    assert(ptr <= data_ + size_);
+
+    switch (opcode) {
+      case DW_LNS_END_SEQUENCE:
+        assert(ptr == data_ + size_);
+        UpdateLineInfo();
+        break;
+      case DW_LNE_SET_ADDRESS: {
+        // Validate the operand before reading it.
+        assert(size - 1 == sizeof(uint64_t));
+        assert(ptr + sizeof(uint64_t) < data_ + size_);
+        state_.address = ReadUnaligned<uint64_t>(ptr);
+        ptr += sizeof(uint64_t);
+        break;
+      }
+      default:
+        assert(0); // Not supported
+        break;
+    }
+    return ptr;
+  }
+
+  void UpdateAddress(uint32_t operation_advance) {
+    state_.address += header_->minimum_instruction_length *
+                      ((state_.operation + operation_advance) /
+                      header_->maximum_operations_per_instruction);
+  }
+
+  void UpdateOperation(uint32_t operation_advance) {
+    state_.operation = (state_.operation + operation_advance) %
+                       header_->maximum_operations_per_instruction;
+  }
+
+  void UpdateLine(uint32_t adjusted_opcode) {
+    state_.line += header_->line_base +
+                   (adjusted_opcode % header_->line_range);
+  }
+
+  void UpdateLineInfo() {
+    line_info_.push_back(std::make_pair(
+      state_.address, std::make_pair(state_.file, state_.line)));
+  }
+
+  const uint8_t* data_ = nullptr;
+  uint32_t size_ = 0;
+  const Dwarf32Header* header_ = nullptr;
+
+  DwarfState state_ = { 0, 0, 1, 1 };
+  DwarfLineInfo line_info_;
+};
+
+#endif // PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/elf.h b/src/lib/binutils/intel/elf.h
new file mode 100644
index 0000000000..7e2ce6c4f6
--- /dev/null
+++ b/src/lib/binutils/intel/elf.h
@@ -0,0 +1,45 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_ELF_H_
+#define PTI_SAMPLES_UTILS_ELF_H_
+
+#include <stdint.h>
+
+#define ELF_MAGIC_NUMBER 0x7F
+#define ELF_NIDENT       16
+
+struct Elf64Header {  // ELF64 file header; ElfParser reads it at offset 0
+  unsigned char ident[ELF_NIDENT];  // [0..3]: 0x7F 'E' 'L' 'F'; [4]: 2 = 64-bit
+  uint16_t type;
+  uint16_t machine;
+  uint32_t version;
+  uint64_t entry;
+  uint64_t phoff;
+  uint64_t shoff;      // byte offset of the section header table
+  uint32_t flags;
+  uint16_t ehsize;
+  uint16_t phentsize;
+  uint16_t phnum;
+  uint16_t shentsize;
+  uint16_t shnum;      // number of section headers
+  uint16_t shstrndx;   // index of the section that stores section names
+};
+
+struct Elf64SectionHeader {  // one entry of the section header table
+  uint32_t name;       // offset of this section's name in the name section
+  uint32_t type;
+  uint64_t flags;
+  uint64_t addr;
+  uint64_t offset;     // byte offset of the section contents in the image
+  uint64_t size;       // size of the section contents in bytes
+  uint32_t link;
+  uint32_t info;
+  uint64_t addralign;
+  uint64_t entsize;
+};
+
+#endif // PTI_SAMPLES_UTILS_ELF_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/elf_parser.h b/src/lib/binutils/intel/elf_parser.h
new file mode 100644
index 0000000000..28326941b4
--- /dev/null
+++ b/src/lib/binutils/intel/elf_parser.h
@@ -0,0 +1,129 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_ELF_PARSER_H_
+#define PTI_SAMPLES_UTILS_ELF_PARSER_H_
+
+#include <string.h>
+
+#include <vector>
+
+#include "elf.h"
+#include "dwarf_parser.h"
+
+// Read-only view of an in-memory 64-bit ELF image (borrowed, not copied).
+// Queries return an empty result when the image or the section is unusable.
+class ElfParser {
+ public:
+  ElfParser(const uint8_t* data, uint32_t size) : data_(data), size_(size) {}
+
+  // Checks the buffer size, the "\x7F" "ELF" magic and the 64-bit class byte.
+  bool IsValid() const {
+    if (data_ == nullptr || size_ < sizeof(Elf64Header)) {
+      return false;
+    }
+    const Elf64Header* header = reinterpret_cast<const Elf64Header*>(data_);
+    if (header->ident[0] != ELF_MAGIC_NUMBER ||
+        header->ident[1] != 'E' ||
+        header->ident[2] != 'L' ||
+        header->ident[3] != 'F') {
+      return false;
+    }
+    return header->ident[4] == 2;  // 64-bit format
+  }
+
+  // Source file names recorded in the .debug_line section.
+  std::vector<std::string> GetFileNames() const {
+    const uint8_t* section = nullptr;
+    uint64_t section_size = 0;
+    GetSection(".debug_line", &section, &section_size);
+    if (section == nullptr || section_size == 0) {
+      return std::vector<std::string>();
+    }
+    DwarfParser parser(section, static_cast<uint32_t>(section_size));
+    if (!parser.IsValid()) {
+      return std::vector<std::string>();
+    }
+    return parser.GetFileNames();
+  }
+
+  // Line table of the file with index file_id, from the .debug_line section.
+  LineInfo GetLineInfo(uint32_t file_id) const {
+    const uint8_t* section = nullptr;
+    uint64_t section_size = 0;
+    GetSection(".debug_line", &section, &section_size);
+    if (section == nullptr || section_size == 0) {
+      return LineInfo();
+    }
+    DwarfParser parser(section, static_cast<uint32_t>(section_size));
+    if (!parser.IsValid()) {
+      return LineInfo();
+    }
+    return parser.GetLineInfo(file_id);
+  }
+
+  // Copy of the "Intel(R) OpenCL Device Binary" section contents.
+  std::vector<uint8_t> GetGenBinary() const {
+    const uint8_t* section = nullptr;
+    uint64_t section_size = 0;
+    GetSection("Intel(R) OpenCL Device Binary", &section, &section_size);
+    if (section == nullptr || section_size == 0) {
+      return std::vector<uint8_t>();
+    }
+    return std::vector<uint8_t>(section, section + section_size);
+  }
+
+ private:
+  // True when [offset, offset + size) lies inside the buffer.
+  bool InBounds(uint64_t offset, uint64_t size) const {
+    return offset <= size_ && size <= size_ - offset;
+  }
+
+  // Finds the section called name. The outputs are left null/zero when the
+  // image is invalid, the section is absent, or anything read is out of range.
+  void GetSection(const char* name,
+                  const uint8_t** section,
+                  uint64_t* section_size) const {
+    assert(section != nullptr && section_size != nullptr);
+    *section = nullptr;
+    *section_size = 0;
+    if (!IsValid()) {
+      return;
+    }
+    const Elf64Header* header = reinterpret_cast<const Elf64Header*>(data_);
+    if (header->shstrndx >= header->shnum ||
+        !InBounds(header->shoff, header->shnum * sizeof(Elf64SectionHeader))) {
+      return;
+    }
+    const Elf64SectionHeader* section_header =
+      reinterpret_cast<const Elf64SectionHeader*>(data_ + header->shoff);
+    // The name table must end with a NUL so strcmp() cannot run past it.
+    const Elf64SectionHeader& names = section_header[header->shstrndx];
+    if (names.size == 0 || !InBounds(names.offset, names.size) ||
+        data_[names.offset + names.size - 1] != 0) {
+      return;
+    }
+    const char* name_section =
+      reinterpret_cast<const char*>(data_ + names.offset);
+    for (uint32_t i = 1; i < header->shnum; ++i) {
+      const Elf64SectionHeader& current = section_header[i];
+      if (current.name >= names.size ||
+          strcmp(name_section + current.name, name) != 0) {
+        continue;
+      }
+      if (InBounds(current.offset, current.size)) {
+        *section = data_ + current.offset;
+        *section_size = current.size;
+      }
+      return;
+    }
+  }
+
+  const uint8_t* data_ = nullptr;
+  uint32_t size_ = 0;
+};
+
+#endif // PTI_SAMPLES_UTILS_ELF_PARSER_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/gen_binary_decoder.h b/src/lib/binutils/intel/gen_binary_decoder.h
new file mode 100644
index 0000000000..cf9527bb67
--- /dev/null
+++ b/src/lib/binutils/intel/gen_binary_decoder.h
@@ -0,0 +1,60 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_
+#define PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_
+
+#include <assert.h>
+
+#include <vector>
+#include <string>
+
+#include <iga/kv.hpp>
+
+#include "utils.h"
+
+using InstructionList = std::vector< std::pair<uint32_t, std::string> >;  // (offset, text)
+
+// Disassembles a raw GEN kernel binary through IGA's KernelView, which is
+// constructed over the caller's bytes for the requested architecture.
+class GenBinaryDecoder {
+ public:
+  GenBinaryDecoder(const std::vector<uint8_t>& binary, iga_gen_t arch)
+      : kernel_view_(arch, binary.data(), binary.size(),
+                     iga::SWSB_ENCODE_MODE::SingleDistPipe) {}
+
+  // Reports whether KernelView decoded the binary successfully.
+  bool IsValid() const {
+    return kernel_view_.decodeSucceeded();
+  }
+
+  // Returns (offset, syntax text) for each instruction, walking from offset 0
+  // until KernelView reports an instruction size of 0. Empty if !IsValid().
+  InstructionList Disassemble() {
+    InstructionList listing;
+    if (!IsValid()) {
+      return listing;
+    }
+
+    char syntax[MAX_STR_SIZE] = { 0 };
+    int32_t offset = 0;
+    int32_t step = 0;
+    while ((step = kernel_view_.getInstSize(offset)) != 0) {
+      // The text buffer is reused for every instruction.
+      size_t length = kernel_view_.getInstSyntax(offset, syntax, MAX_STR_SIZE);
+      assert(length > 0);
+      listing.push_back(std::make_pair(offset, syntax));
+      offset += step;
+    }
+
+    return listing;
+  }
+
+ private:
+  KernelView kernel_view_;
+};
+
+#endif // PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_
diff --git a/src/lib/binutils/intel/gen_symbols_decoder.h b/src/lib/binutils/intel/gen_symbols_decoder.h
new file mode 100644
index 0000000000..c270a8236d
--- /dev/null
+++ b/src/lib/binutils/intel/gen_symbols_decoder.h
@@ -0,0 +1,108 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_GEN_SYMBOLS_DECODER_H_
+#define PTI_SAMPLES_UTILS_GEN_SYMBOLS_DECODER_H_
+
+#include <vector>
+
+#include <igc/ocl_igc_shared/executable_format/program_debug_data.h>
+
+#include "elf_parser.h"
+
+#define IS_POWER_OF_TWO(X) (!(((X) - 1) & (X)))  // note: also true for 0
+#define IGC_MAX_VALUE 1024  // upper bound applied to header fields
+
+// Decodes an IGC program debug-data blob (borrowed): a program header, then
+// per kernel a header, the padded name, vISA debug data and GEN ISA debug data.
+class GenSymbolsDecoder {
+ public:
+  GenSymbolsDecoder(const std::vector<uint8_t>& symbols)
+      : data_(symbols.data()), size_(symbols.size()) {}
+
+  bool IsValid() const {
+    return IsValidHeader();
+  }
+
+  std::vector<std::string> GetFileNames(
+      const std::string& kernel_name) const {
+    if (!IsValid()) {
+      return std::vector<std::string>();
+    }
+    ElfParser parser = GetSection(kernel_name);
+    return parser.GetFileNames();
+  }
+
+  LineInfo GetLineInfo(
+      const std::string& kernel_name,
+      uint32_t file_id) const {
+    if (!IsValid()) {
+      return LineInfo();
+    }
+    ElfParser parser = GetSection(kernel_name);
+    return parser.GetLineInfo(file_id);
+  }
+
+ private:
+  // The buffer must hold the program header and its fields must look sane.
+  bool IsValidHeader() const {
+    if (data_ == nullptr ||
+        size_ < sizeof(iOpenCL::SProgramDebugDataHeaderIGC)) {
+      return false;
+    }
+
+    const iOpenCL::SProgramDebugDataHeaderIGC* header =
+      reinterpret_cast<const iOpenCL::SProgramDebugDataHeaderIGC*>(data_);
+    return IS_POWER_OF_TWO(header->GPUPointerSizeInBytes) &&
+           (header->GPUPointerSizeInBytes <= IGC_MAX_VALUE) &&
+           (header->Device <= IGC_MAX_VALUE) &&
+           (header->SteppingId <= IGC_MAX_VALUE) &&
+           (header->NumberOfKernels <= IGC_MAX_VALUE);
+  }
+
+  // Returns a parser over the vISA debug ELF of kernel_name, or an empty
+  // (invalid) parser when no kernel of that name carries a valid ELF image.
+  ElfParser GetSection(const std::string& kernel_name) const {
+    const uint8_t* ptr = data_;
+    const iOpenCL::SProgramDebugDataHeaderIGC* header =
+      reinterpret_cast<const iOpenCL::SProgramDebugDataHeaderIGC*>(ptr);
+    ptr += sizeof(iOpenCL::SProgramDebugDataHeaderIGC);
+
+    for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
+      const iOpenCL::SKernelDebugDataHeaderIGC* kernel_header =
+        reinterpret_cast<const iOpenCL::SKernelDebugDataHeaderIGC*>(ptr);
+      ptr += sizeof(iOpenCL::SKernelDebugDataHeaderIGC);
+      assert(ptr <= data_ + size_);
+
+      const char* current_kernel_name = reinterpret_cast<const char*>(ptr);
+      uint32_t aligned_kernel_name_size = sizeof(uint32_t) *
+        (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
+      ptr += aligned_kernel_name_size;
+      assert(ptr <= data_ + size_);
+
+      if (kernel_name == current_kernel_name) {
+        assert(kernel_header->SizeGenIsaDbgInBytes == 0);
+        ElfParser parser(ptr, kernel_header->SizeVisaDbgInBytes);
+        if (parser.IsValid()) {
+          return parser;
+        }
+        // Otherwise fall through so ptr still advances past this kernel.
+      }
+
+      ptr += kernel_header->SizeVisaDbgInBytes;
+      assert(ptr <= data_ + size_);
+      ptr += kernel_header->SizeGenIsaDbgInBytes;
+      assert(ptr <= data_ + size_);
+    }
+
+    return ElfParser(nullptr, 0);
+  }
+
+  const uint8_t* data_ = nullptr;
+  size_t size_ = 0;
+};
+
+#endif // PTI_SAMPLES_UTILS_GEN_SYMBOLS_DECODER_H_
diff --git a/src/lib/binutils/intel/igc_binary_decoder.h b/src/lib/binutils/intel/igc_binary_decoder.h
new file mode 100644
index 0000000000..c98caea568
--- /dev/null
+++ b/src/lib/binutils/intel/igc_binary_decoder.h
@@ -0,0 +1,98 @@
+//==============================================================
+// Copyright © 2020 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#pragma once
+
+#include <memory.h>
+
+#include <igc/ocl_igc_shared/executable_format/patch_list.h>
+
+#include <metrics_discovery_internal_api.h>
+
+#include "gen_binary_decoder.h"
+
+using namespace iOpenCL;
+
+// Disassembles one kernel out of an IGC program binary (a copy is kept).
+class IgcBinaryDecoder {
+ public:
+  IgcBinaryDecoder(const std::vector<uint8_t>& binary) : binary_(binary) {}
+
+  // Listing of kernel_name; empty if it is not found or the binary is bad.
+  InstructionList Disassemble(const std::string& kernel_name) {
+    if (!IsValidHeader()) {
+      return InstructionList();
+    }
+    const uint8_t* base = binary_.data();
+    const SProgramBinaryHeader* header =
+      reinterpret_cast<const SProgramBinaryHeader*>(base);
+    iga_gen_t arch = GetArch(header->Device);
+    if (arch == IGA_GEN_INVALID) {
+      return InstructionList();
+    }
+    const uint64_t total = binary_.size();  // sizes below are checked against it
+    uint64_t offset = sizeof(SProgramBinaryHeader) + header->PatchListSize;
+    for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
+      if (offset > total ||
+          sizeof(SKernelBinaryHeaderCommon) > total - offset) {
+        break;
+      }
+      const SKernelBinaryHeaderCommon* kernel =
+        reinterpret_cast<const SKernelBinaryHeaderCommon*>(base + offset);
+      offset += sizeof(SKernelBinaryHeaderCommon);
+      const uint64_t heaps_size = uint64_t(kernel->PatchListSize) +
+        kernel->KernelHeapSize + kernel->GeneralStateHeapSize +
+        kernel->DynamicStateHeapSize + kernel->SurfaceStateHeapSize;
+      if (kernel->KernelNameSize > total - offset ||
+          heaps_size > total - offset - kernel->KernelNameSize) {
+        break;
+      }
+      const char* name = reinterpret_cast<const char*>(base + offset);
+      offset += kernel->KernelNameSize;
+      if (kernel_name == name) {
+        std::vector<uint8_t> raw_binary(
+          base + offset, base + offset + kernel->KernelHeapSize);
+        GenBinaryDecoder decoder(raw_binary, arch);
+        return decoder.Disassemble();
+      }
+      offset += heaps_size;
+    }
+    return InstructionList();
+  }
+
+ private:
+  bool IsValidHeader() {
+    if (binary_.size() < sizeof(SProgramBinaryHeader)) {
+      return false;
+    }
+    const SProgramBinaryHeader* header =
+      reinterpret_cast<const SProgramBinaryHeader*>(binary_.data());
+    return header->Magic == MAGIC_CL;
+  }
+
+  // Maps a device id to an IGA generation; ids too large to shift are invalid.
+  static iga_gen_t GetArch(uint32_t device) {
+    if (device >= 31) {
+      return IGA_GEN_INVALID;
+    }
+    switch (1 << device) {
+      case MetricsDiscovery::PLATFORM_BDW:
+        return IGA_GEN8;
+      case MetricsDiscovery::PLATFORM_SKL:
+        return IGA_GEN9;
+      case MetricsDiscovery::PLATFORM_KBL:
+        return IGA_GEN9p5;
+      case MetricsDiscovery::PLATFORM_ICL:
+        return IGA_GEN11;
+      case 1 << 18: // TGL (?)
+        return IGA_GEN12p1;
+      default:
+        return IGA_GEN_INVALID;
+    }
+  }
+
+  std::vector<uint8_t> binary_;
+};
diff --git a/src/lib/binutils/intel/leb128.h b/src/lib/binutils/intel/leb128.h
new file mode 100644
index 0000000000..a5ddb20599
--- /dev/null
+++ b/src/lib/binutils/intel/leb128.h
@@ -0,0 +1,74 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_LEB128_H_
+#define PTI_SAMPLES_UTILS_LEB128_H_
+
+#include <stdint.h>
+
+namespace utils {
+namespace leb128 {
+
+// Decodes an unsigned LEB128 value of at most 32 bits starting at ptr.
+// value receives the result and done tells whether the terminating byte was
+// found within the 5 bytes such a value can occupy. Returns the address just
+// past the last byte consumed. The caller must ensure the bytes are readable.
+inline const uint8_t* Decode32(const uint8_t* ptr, uint32_t& value,
+                               bool& done) {
+  const uint8_t kMaxBytes = 5;  // ceil(32 / 7)
+  uint8_t count = 0;
+  uint8_t shift = 0;
+
+  value = 0;
+  done = false;
+
+  while (count < kMaxBytes && !done) {
+    uint8_t byte = *ptr;
+    // Shift as uint32_t: the last group would overflow a promoted int.
+    value |= static_cast<uint32_t>(byte & 0x7F) << shift;
+    shift += 7;
+    ++ptr;
+    ++count;
+    done = (byte & 0x80) == 0;
+  }
+
+  return ptr;
+}
+
+// Decodes a signed LEB128 value of at most 32 bits starting at ptr. value
+// receives the sign-extended result; done tells whether the terminating byte
+// was found within 5 bytes. Returns the address past the last byte consumed.
+inline const uint8_t* Decode32(const uint8_t* ptr, int32_t& value,
+                               bool& done) {
+  const uint8_t kMaxBytes = 5;  // ceil(32 / 7)
+  uint32_t bits = 0;  // accumulated unsigned so no shift can overflow
+  uint8_t byte = 0;
+  uint8_t count = 0;
+  uint8_t shift = 0;
+
+  done = false;
+  while (count < kMaxBytes && !done) {
+    byte = *ptr;
+    bits |= static_cast<uint32_t>(byte & 0x7F) << shift;
+    shift += 7;
+    ++ptr;
+    ++count;
+    done = (byte & 0x80) == 0;
+  }
+
+  // Bit 6 of the last group is the sign; extend it through the top bits.
+  if ((shift < 8 * sizeof(int32_t)) && ((byte & 0x40) > 0)) {
+    bits |= ~uint32_t(0) << shift;
+  }
+
+  value = static_cast<int32_t>(bits);
+  return ptr;
+}
+
+} // namespace leb128
+} // namespace utils
+
+#endif // PTI_SAMPLES_UTILS_LEB128_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/metric_device.h b/src/lib/binutils/intel/metric_device.h
new file mode 100644
index 0000000000..48d5b7f104
--- /dev/null
+++ b/src/lib/binutils/intel/metric_device.h
@@ -0,0 +1,69 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_METRIC_DEVICE_H_
+#define PTI_SAMPLES_UTILS_METRIC_DEVICE_H_
+
+#include "metric_utils.h"
+#include "shared_library.h"
+
+namespace md = MetricsDiscovery;
+
+// Owns a Metrics Discovery device together with the library it came from.
+class MetricDevice {
+ public:
+  // Opens a device through the first loadable library candidate. Returns
+  // nullptr when nothing loads, the entry point is missing or no device opens.
+  static MetricDevice* Create() {
+    SharedLibrary* lib = nullptr;
+    for (auto& path : utils::metrics::GetMDLibraryPossiblePaths()) {
+      lib = SharedLibrary::Create(path);
+      if (lib != nullptr) {
+        break;
+      }
+    }
+    if (lib == nullptr) {
+      return nullptr;
+    }
+    md::IMetricsDevice_1_5* device = nullptr;
+    md::OpenMetricsDevice_fn OpenMetricsDevice =
+      lib->GetSym<md::OpenMetricsDevice_fn>("OpenMetricsDevice");
+    if (OpenMetricsDevice != nullptr) {  // never call through a null symbol
+      md::TCompletionCode status = OpenMetricsDevice(&device);
+      assert(status == md::CC_OK ||
+             status == md::CC_ALREADY_INITIALIZED);
+    }
+    if (device == nullptr) {
+      delete lib;
+      return nullptr;
+    }
+    return new MetricDevice(device, lib);
+  }
+
+  ~MetricDevice() {
+    assert(device_ != nullptr && lib_ != nullptr);
+    md::CloseMetricsDevice_fn CloseMetricsDevice =
+      lib_->GetSym<md::CloseMetricsDevice_fn>("CloseMetricsDevice");
+    if (CloseMetricsDevice != nullptr) {
+      md::TCompletionCode status = CloseMetricsDevice(device_);
+      assert(status == md::CC_OK);
+    }
+    delete lib_;
+  }
+
+  md::IMetricsDevice_1_5* operator->() const {
+    return device_;
+  }
+
+ private:
+  MetricDevice(md::IMetricsDevice_1_5* device, SharedLibrary* lib)
+      : device_(device), lib_(lib) {}
+
+  md::IMetricsDevice_1_5* device_ = nullptr;
+  SharedLibrary* lib_ = nullptr;
+};
+
+#endif // PTI_SAMPLES_UTILS_METRIC_DEVICE_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/metric_utils.h b/src/lib/binutils/intel/metric_utils.h
new file mode 100644
index 0000000000..8a6a44692c
--- /dev/null
+++ b/src/lib/binutils/intel/metric_utils.h
@@ -0,0 +1,95 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_METRIC_UTILS_H_
+#define PTI_SAMPLES_UTILS_METRIC_UTILS_H_
+
+#if defined(_WIN32)
+#include <Windows.h>
+#elif defined(__gnu_linux__) || defined(__APPLE__)
+#include <cerrno>
+#include <dlfcn.h>
+#endif
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <MD/metrics_discovery_api.h>
+
+namespace utils {
+namespace metrics {
+
+inline std::vector<std::string> GetMDLibraryName() {  // per-platform file names
+#if defined(_WIN32)
+#  if defined(_WIN64)
+  return{ "igdmd64.dll" };
+#  else
+  return{ "igdmd32.dll" };
+#  endif
+#elif defined(__gnu_linux__)
+  return{ "libmd.so" };
+#elif defined(__APPLE__)
+  return{ "libmd.dylib", "libigdmd.dylib" };  // tried in this order by callers
+#else
+#error "Unsupported platform!"
+#endif
+}
+
+// Returns the directory tried first when loading the Metrics Discovery
+// library; empty when unknown or when the Windows registry lookup fails.
+inline std::string GetPreferedLibraryPath() {
+#ifdef _WIN32
+  const std::string kKeyName = "SOFTWARE\\Intel\\MDF";
+  const std::string kValueName = "DriverStorePath";
+  HKEY key_handle = nullptr;
+  LSTATUS status = RegOpenKeyExA(HKEY_LOCAL_MACHINE, kKeyName.c_str(),
+                                 0, KEY_READ, &key_handle);
+  if (status != ERROR_SUCCESS) {
+      return "";
+  }
+  std::unique_ptr<std::remove_pointer<HKEY>::type,
+                  decltype(&RegCloseKey)> key(key_handle, RegCloseKey);
+
+  // The registry may hand back a string without its terminator, so keep one
+  // extra zeroed byte past the size offered to RegQueryValueExA.
+  DWORD buffer_size = MAX_PATH * sizeof(char);
+  std::unique_ptr<BYTE[]> buffer(new BYTE[buffer_size + 1]());
+  status = RegQueryValueExA(key.get(), kValueName.c_str(), nullptr, nullptr,
+                            buffer.get(), &buffer_size);
+  if (status != ERROR_SUCCESS) {
+    return "";
+  }
+  return reinterpret_cast<char*>(buffer.get());
+#elif defined(__linux__)
+  return "/opt/intel/opencl";
+#elif defined(__APPLE__)
+  return "/System/Library/Extensions/AppleIntelBDWGraphicsMTLDriver.bundle/Contents/MacOS";
+#else
+  return "";
+#endif
+}
+
+// Lists candidate library paths: for each library name, the name under the
+// preferred directory (when one is known) followed by the bare name.
+inline std::vector<std::string> GetMDLibraryPossiblePaths() {
+  const std::string directory = GetPreferedLibraryPath();
+  const bool has_directory = !directory.empty();
+
+  std::vector<std::string> candidates;
+  for (const std::string& file_name : GetMDLibraryName()) {
+    if (has_directory) {
+      candidates.push_back(directory + "/" + file_name);
+    }
+    candidates.push_back(file_name);
+  }
+  return candidates;
+}
+
+} // namespace metrics
+} // namespace utils
+
+#endif // PTI_SAMPLES_UTILS_METRIC_UTILS_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/shared_library.h b/src/lib/binutils/intel/shared_library.h
new file mode 100644
index 0000000000..6c75cb4e92
--- /dev/null
+++ b/src/lib/binutils/intel/shared_library.h
@@ -0,0 +1,72 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_SHARED_LIBRARY_H_
+#define PTI_SAMPLES_UTILS_SHARED_LIBRARY_H_
+
+#include <assert.h>
+
+#if defined(_WIN32)
+#include <Windows.h>
+#elif defined(__gnu_linux__) || defined(__APPLE__)
+#include <cerrno>
+#include <dlfcn.h>
+#endif
+
+#include <string>
+#include <vector>
+
+// Wraps a handle to a dynamically loaded library; the handle is released
+// when the object is destroyed.
+class SharedLibrary {
+ public:
+  // Loads the library called name; returns nullptr when it cannot be loaded.
+  static SharedLibrary* Create(const std::string& name) {
+#if defined(_WIN32)
+    HMODULE handle = LoadLibraryA(name.c_str());
+#elif defined(__gnu_linux__) || defined(__APPLE__)
+    void* handle = dlopen(name.c_str(), RTLD_NOW);
+#endif
+    if (handle == nullptr) {
+      return nullptr;
+    }
+    return new SharedLibrary(handle);
+  }
+
+  // Releases the handle and asserts that the platform call succeeded.
+  ~SharedLibrary() {
+#if defined(_WIN32)
+    BOOL released = FreeLibrary(handle_);
+    assert(released == TRUE);
+#elif defined(__gnu_linux__) || defined(__APPLE__)
+    int status = dlclose(handle_);
+    assert(status == 0);
+#endif
+  }
+
+  // Resolves symbol name in the library and casts the result to T.
+  template<typename T> T GetSym(const char* name) {
+    void* address = nullptr;
+#if defined(_WIN32)
+    address = GetProcAddress(handle_, name);
+#elif defined(__gnu_linux__) || defined(__APPLE__)
+    address = dlsym(handle_, name);
+#endif
+    return reinterpret_cast<T>(address);
+  }
+
+ private:
+  // Construction goes through Create(); the handle type is per platform.
+#if defined(_WIN32)
+  SharedLibrary(HMODULE handle) : handle_(handle) {}
+  HMODULE handle_ = nullptr;
+#elif defined(__gnu_linux__) || defined(__APPLE__)
+  SharedLibrary(void* handle) : handle_(handle) {}
+  void* handle_ = nullptr;
+#endif
+};
+
+#endif // PTI_SAMPLES_UTILS_SHARED_LIBRARY_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/utils.h b/src/lib/binutils/intel/utils.h
new file mode 100644
index 0000000000..2c4be6ebc9
--- /dev/null
+++ b/src/lib/binutils/intel/utils.h
@@ -0,0 +1,117 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_UTILS_H_
+#define PTI_SAMPLES_UTILS_UTILS_H_
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__gnu_linux__)
+#include <unistd.h>
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#define MAX_STR_SIZE 1024
+
+#define NSEC_IN_USEC 1000
+#define NSEC_IN_MSEC 1000000
+#define NSEC_IN_SEC  1000000000
+
+namespace utils {
+
+// Orders pairs by descending .second, breaking ties by descending .first.
+struct Comparator {
+  template<typename T>
+  bool operator()(const T& left, const T& right) const {
+    return (left.second != right.second)
+        ? (left.second > right.second)
+        : (left.first > right.first);
+  }
+};
+
+#if defined(__gnu_linux__)
+// Shifts a CLOCK_MONOTONIC timestamp in nanoseconds onto the
+// CLOCK_MONOTONIC_RAW timeline, using the offset between the two clocks
+// sampled at the time of the call. Declared inline: it lives in a header.
+inline uint64_t ConvertClockMonotonicToRaw(uint64_t clock_monotonic) {
+  timespec monotonic_time;
+  int status = clock_gettime(CLOCK_MONOTONIC, &monotonic_time);
+  assert(status == 0);
+
+  timespec raw_time;
+  status = clock_gettime(CLOCK_MONOTONIC_RAW, &raw_time);
+  assert(status == 0);
+
+  // Widen before multiplying so a 32-bit tv_sec cannot overflow.
+  uint64_t raw = raw_time.tv_nsec +
+    static_cast<uint64_t>(NSEC_IN_SEC) * raw_time.tv_sec;
+  uint64_t monotonic = monotonic_time.tv_nsec +
+    static_cast<uint64_t>(NSEC_IN_SEC) * monotonic_time.tv_sec;
+  return (raw > monotonic) ? clock_monotonic + (raw - monotonic)
+                           : clock_monotonic - (monotonic - raw);
+}
+#endif
+
+inline std::string GetExecutablePath() {  // directory part, with separator
+  char buffer[MAX_STR_SIZE] = { 0 };  // readlink() adds no NUL; keep one spare
+#if defined(_WIN32)
+  DWORD status = GetModuleFileNameA(nullptr, buffer, MAX_STR_SIZE);
+  assert(status > 0);
+#elif defined(__gnu_linux__)
+  ssize_t status = readlink("/proc/self/exe", buffer, MAX_STR_SIZE - 1);
+  assert(status > 0);
+#else
+#error not supported
+#endif
+  std::string path(buffer);
+  return path.substr(0, path.find_last_of("/\\") + 1);
+}
+
+// Reads a whole file; empty result if it cannot be opened or fully read.
+inline std::vector<uint8_t> LoadBinaryFile(const std::string& path) {
+  std::vector<uint8_t> binary;
+  std::ifstream stream(path, std::ios::in | std::ios::binary);
+  if (!stream.good()) {
+    return binary;
+  }
+  stream.seekg(0, std::ifstream::end);
+  const std::streamoff size = stream.tellg();  // -1 when the seek failed
+  stream.seekg(0, std::ifstream::beg);
+  if (size <= 0) {
+    return binary;
+  }
+  binary.resize(static_cast<size_t>(size));
+  if (!stream.read(reinterpret_cast<char *>(binary.data()), size)) {
+    binary.clear();  // a partial read is reported as failure
+  }
+  return binary;
+}
+
+inline void SetEnv(const std::string& str) {  // str has the form NAME=value
+#if defined(_WIN32)
+  int status = _putenv(str.c_str());
+#elif defined(__gnu_linux__)
+  std::string* entry = new std::string(str);  // leaked: putenv keeps the pointer
+  int status = putenv(&(*entry)[0]);
+#else
+#error not supported
+#endif
+  assert(status == 0);
+}
+
+inline const char* GetEnv(const char* name) {  // thin wrapper over getenv()
+  return getenv(name);
+}
+
+} // namespace utils
+
+#endif // PTI_SAMPLES_UTILS_UTILS_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/ze_tracer.h b/src/lib/binutils/intel/ze_tracer.h
new file mode 100644
index 0000000000..a63941d447
--- /dev/null
+++ b/src/lib/binutils/intel/ze_tracer.h
@@ -0,0 +1,104 @@
+//==============================================================
+// Copyright © 2020 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_ZE_TRACER_H_
+#define PTI_SAMPLES_UTILS_ZE_TRACER_H_
+
+#include <assert.h>
+
+#include <set>
+
+#include <L0/tracing_api.h>
+
+#define ZE_FUNCTION_COUNT      (ze_tracing::ZE_FUNCTION_COUNT)
+#define ZE_CALLBACK_SITE_ENTER (ze_tracing::ZE_CALLBACK_SITE_ENTER)
+#define ZE_CALLBACK_SITE_EXIT  (ze_tracing::ZE_CALLBACK_SITE_EXIT)
+
+using callback_data_t = ze_tracing::callback_data_t;
+using function_id_t = ze_tracing::function_id_t;
+using tracing_callback_t = ze_tracing::tracing_callback_t;
+
+// Owns one Level Zero tracer handle and the set of API functions to trace.
+// Non-copyable: the tracer is given the address of data_ and is destroyed once.
+class ZeTracer {
+ public:
+  // Creates the tracer for `driver`; `callback` and `user_data` are stored
+  // in data_, whose address is handed to the tracer as its user data.
+  ZeTracer(ze_driver_handle_t driver,
+           tracing_callback_t callback,
+           void* user_data) {
+    assert(driver != nullptr);
+
+    data_.callback = callback;
+    data_.user_data = user_data;
+
+    zet_tracer_desc_t tracer_desc = {};
+    tracer_desc.version = ZET_TRACER_DESC_VERSION_CURRENT;
+    tracer_desc.pUserData = &data_;
+
+    ze_result_t status = zetTracerCreate(driver, &tracer_desc, &handle_);
+    assert(status == ZE_RESULT_SUCCESS);
+  }
+
+  // Copying would destroy handle_ twice and leave pUserData pointing at the
+  // source object's data_, so copy construction and assignment are disabled.
+  ZeTracer(const ZeTracer&) = delete;
+  ZeTracer& operator=(const ZeTracer&) = delete;
+
+  ~ZeTracer() {
+    if (handle_ != nullptr) {
+      ze_result_t status = zetTracerDestroy(handle_);
+      assert(status == ZE_RESULT_SUCCESS);
+    }
+  }
+
+  // Adds `function` to the set applied by Enable(); false if the tracer is
+  // invalid or the id is outside [0, ZE_FUNCTION_COUNT).
+  bool SetTracingFunction(function_id_t function) {
+    if (!IsValid()) {
+      return false;
+    }
+
+    if (function >= 0 && function < ZE_FUNCTION_COUNT) {
+      functions_.insert(function);
+      return true;
+    }
+
+    return false;
+  }
+
+  // Registers the selected functions and switches tracing on.
+  bool Enable() {
+    if (!IsValid()) {
+      return false;
+    }
+
+    ze_tracing::SetTracingFunctions(handle_, functions_);
+
+    return zetTracerSetEnabled(handle_, true) == ZE_RESULT_SUCCESS;
+  }
+
+  // Switches tracing off; the selected function set is left untouched.
+  bool Disable() {
+    if (!IsValid()) {
+      return false;
+    }
+
+    return zetTracerSetEnabled(handle_, false) == ZE_RESULT_SUCCESS;
+  }
+
+  // True once the constructor has obtained a tracer handle.
+  bool IsValid() const {
+    return (handle_ != nullptr);
+  }
+
+ private:
+  zet_tracer_handle_t handle_ = nullptr;
+  std::set<function_id_t> functions_;
+  ze_tracing::global_data_t data_;  // its address is passed as pUserData
+};
+
+#endif // PTI_SAMPLES_UTILS_ZE_TRACER_H_
\ No newline at end of file
diff --git a/src/lib/binutils/intel/ze_utils.h b/src/lib/binutils/intel/ze_utils.h
new file mode 100644
index 0000000000..67e5c288eb
--- /dev/null
+++ b/src/lib/binutils/intel/ze_utils.h
@@ -0,0 +1,143 @@
+//==============================================================
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SAMPLES_UTILS_ZE_UTILS_H_
+#define PTI_SAMPLES_UTILS_ZE_UTILS_H_
+
+#include <assert.h>
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+#include <level_zero/ze_api.h>
+#include <level_zero/zet_api.h>
+
+namespace utils {
+namespace ze {
+
+// Finds the first device of `type` whose name contains "Intel" and stores it
+// and its driver in the out-parameters; both are left untouched otherwise.
+inline void GetIntelDeviceAndDriver(ze_device_type_t type,
+                                    ze_device_handle_t& device,
+                                    ze_driver_handle_t& driver) {
+  ze_result_t status = ZE_RESULT_SUCCESS;
+
+  uint32_t driver_count = 0;
+  status = zeDriverGet(&driver_count, nullptr);
+  if (status != ZE_RESULT_SUCCESS || driver_count == 0) {
+    return;
+  }
+
+  std::vector<ze_driver_handle_t> driver_list(driver_count, nullptr);
+  status = zeDriverGet(&driver_count, driver_list.data());
+  assert(status == ZE_RESULT_SUCCESS);
+
+  for (uint32_t i = 0; i < driver_count; ++i) {
+    uint32_t device_count = 0;
+    status = zeDeviceGet(driver_list[i], &device_count, nullptr);
+    if (status != ZE_RESULT_SUCCESS || device_count == 0) {
+      continue;
+    }
+
+    std::vector<ze_device_handle_t> device_list(device_count, nullptr);
+    status = zeDeviceGet(driver_list[i], &device_count, device_list.data());
+    assert(status == ZE_RESULT_SUCCESS);
+
+    for (uint32_t j = 0; j < device_count; ++j) {
+      ze_device_properties_t props = {};
+      props.version = ZE_DEVICE_PROPERTIES_VERSION_CURRENT;
+      status = zeDeviceGetProperties(device_list[j], &props);
+      assert(status == ZE_RESULT_SUCCESS);
+
+      if (props.type == type && strstr(props.name, "Intel") != nullptr) {
+        device = device_list[j];
+        driver = driver_list[i];
+        return;  // first match wins; do not scan further drivers
+      }
+    }
+  }
+}
+
+// Returns the device name reported by zeDeviceGetProperties.
+inline std::string GetDeviceName(ze_device_handle_t device) {
+  assert(device != nullptr);
+  ze_device_properties_t props = {};  // zeroed so name is "" if the query fails
+  props.version = ZE_DEVICE_PROPERTIES_VERSION_CURRENT;
+  ze_result_t status = zeDeviceGetProperties(device, &props);
+  assert(status == ZE_RESULT_SUCCESS);
+  return props.name;
+}
+
+// Returns the index of the metric called `name` within `group`, or -1 when
+// the group has no metrics or none of them carries that name.
+inline int GetMetricId(zet_metric_group_handle_t group, std::string name) {
+  assert(group != nullptr);
+
+  uint32_t metric_count = 0;
+  ze_result_t status = zetMetricGet(group, &metric_count, nullptr);
+  assert(status == ZE_RESULT_SUCCESS);
+
+  if (metric_count == 0) {
+    return -1;
+  }
+
+  std::vector<zet_metric_handle_t> metric_list(metric_count, nullptr);
+  status = zetMetricGet(group, &metric_count, metric_list.data());
+  assert(status == ZE_RESULT_SUCCESS);
+
+  // Linear scan in the order returned by zetMetricGet; first match wins.
+  for (uint32_t i = 0; i < metric_count; ++i) {
+    zet_metric_properties_t metric_props = {};
+    metric_props.version = ZET_METRIC_PROPERTIES_VERSION_CURRENT;
+    status = zetMetricGetProperties(metric_list[i], &metric_props);
+    assert(status == ZE_RESULT_SUCCESS);
+
+    if (name == metric_props.name) {
+      return static_cast<int>(i);
+    }
+  }
+
+  return -1;
+}
+
+// Returns the handle of the metric group of `device` whose name and sampling
+// type both match, or nullptr when the device exposes no such group.
+inline zet_metric_group_handle_t FindMetricGroup(
+    ze_device_handle_t device, std::string name,
+    zet_metric_group_sampling_type_t type) {
+  assert(device != nullptr);
+
+  uint32_t group_count = 0;
+  ze_result_t status = zetMetricGroupGet(device, &group_count, nullptr);
+  assert(status == ZE_RESULT_SUCCESS);
+  if (group_count == 0) {
+    return nullptr;
+  }
+
+  std::vector<zet_metric_group_handle_t> group_list(group_count, nullptr);
+  status = zetMetricGroupGet(device, &group_count, group_list.data());
+  assert(status == ZE_RESULT_SUCCESS);
+
+  // Linear scan in the order returned by zetMetricGroupGet; first match wins.
+  for (uint32_t i = 0; i < group_count; ++i) {
+    zet_metric_group_properties_t group_props = {};
+    group_props.version = ZET_METRIC_GROUP_PROPERTIES_VERSION_CURRENT;
+    status = zetMetricGroupGetProperties(group_list[i], &group_props);
+    assert(status == ZE_RESULT_SUCCESS);
+
+    if (name == group_props.name && type == group_props.samplingType) {
+      return group_list[i];
+    }
+  }
+
+  return nullptr;
+}
+
+} // namespace ze
+} // namespace utils
+
+#endif // PTI_SAMPLES_UTILS_ZE_UTILS_H_
\ No newline at end of file
diff --git a/src/lib/isa/Makefile.in b/src/lib/isa/Makefile.in
index 8ced9a5aff..e1c22f050e 100644
--- a/src/lib/isa/Makefile.in
+++ b/src/lib/isa/Makefile.in
@@ -363,9 +363,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/prof-lean/Makefile.in b/src/lib/prof-lean/Makefile.in
index 88a725cbea..a859751bb5 100644
--- a/src/lib/prof-lean/Makefile.in
+++ b/src/lib/prof-lean/Makefile.in
@@ -366,9 +366,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/prof/Makefile.in b/src/lib/prof/Makefile.in
index f0c6d1532c..a64b4d4eff 100644
--- a/src/lib/prof/Makefile.in
+++ b/src/lib/prof/Makefile.in
@@ -373,9 +373,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/profxml/Makefile.in b/src/lib/profxml/Makefile.in
index 1e9a4d8c0c..a5ee9980e7 100644
--- a/src/lib/profxml/Makefile.in
+++ b/src/lib/profxml/Makefile.in
@@ -368,9 +368,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/stubs-gcc_s/Makefile.in b/src/lib/stubs-gcc_s/Makefile.in
index 5ee5bbf912..38b2ac32dd 100644
--- a/src/lib/stubs-gcc_s/Makefile.in
+++ b/src/lib/stubs-gcc_s/Makefile.in
@@ -346,9 +346,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/support-lean/Makefile.in b/src/lib/support-lean/Makefile.in
index 4947727127..a527163052 100644
--- a/src/lib/support-lean/Makefile.in
+++ b/src/lib/support-lean/Makefile.in
@@ -352,9 +352,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/support/Makefile.in b/src/lib/support/Makefile.in
index 0f4038a779..04ccf50066 100644
--- a/src/lib/support/Makefile.in
+++ b/src/lib/support/Makefile.in
@@ -381,9 +381,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/xml/Makefile.in b/src/lib/xml/Makefile.in
index 970502f954..c84dfd0f73 100644
--- a/src/lib/xml/Makefile.in
+++ b/src/lib/xml/Makefile.in
@@ -365,9 +365,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/Makefile.in b/src/tool/Makefile.in
index 545ef9b2a9..2fb762ac42 100644
--- a/src/tool/Makefile.in
+++ b/src/tool/Makefile.in
@@ -339,9 +339,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcfnbounds/Makefile.in b/src/tool/hpcfnbounds/Makefile.in
index dfe3a93fa0..0ae4b5aa47 100644
--- a/src/tool/hpcfnbounds/Makefile.in
+++ b/src/tool/hpcfnbounds/Makefile.in
@@ -449,9 +449,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcfnbounds2/Makefile.in b/src/tool/hpcfnbounds2/Makefile.in
index 46f9f0ce7f..8fd59fe54a 100644
--- a/src/tool/hpcfnbounds2/Makefile.in
+++ b/src/tool/hpcfnbounds2/Makefile.in
@@ -347,9 +347,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpclump/Makefile.in b/src/tool/hpclump/Makefile.in
index 37fe2a0159..34792ea7be 100644
--- a/src/tool/hpclump/Makefile.in
+++ b/src/tool/hpclump/Makefile.in
@@ -380,9 +380,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcprof-flat/Makefile.in b/src/tool/hpcprof-flat/Makefile.in
index ad27eb7012..5f6ab64282 100644
--- a/src/tool/hpcprof-flat/Makefile.in
+++ b/src/tool/hpcprof-flat/Makefile.in
@@ -415,9 +415,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcprof-mpi/Makefile.in b/src/tool/hpcprof-mpi/Makefile.in
index 698b145708..d00867fd0f 100644
--- a/src/tool/hpcprof-mpi/Makefile.in
+++ b/src/tool/hpcprof-mpi/Makefile.in
@@ -415,9 +415,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcprof/Makefile.in b/src/tool/hpcprof/Makefile.in
index dcee5fd1ba..0a538fb755 100644
--- a/src/tool/hpcprof/Makefile.in
+++ b/src/tool/hpcprof/Makefile.in
@@ -412,9 +412,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcproftt/Makefile.in b/src/tool/hpcproftt/Makefile.in
index 90e9904509..e54666297b 100644
--- a/src/tool/hpcproftt/Makefile.in
+++ b/src/tool/hpcproftt/Makefile.in
@@ -415,9 +415,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcrun-flat/Makefile.in b/src/tool/hpcrun-flat/Makefile.in
index afca940de1..a41f83aac7 100644
--- a/src/tool/hpcrun-flat/Makefile.in
+++ b/src/tool/hpcrun-flat/Makefile.in
@@ -410,9 +410,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index b0091c14f5..e1e656e925 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -505,23 +505,23 @@ MY_CUPTI_FILES = sample-sources/nvidia.c	\
 	gpu/nvidia/cupti-gpu-api.c		
 endif
 
-if ENABLE_OPENCL
-#MY_OPENCL_FILES =
-MY_BASE_FILES += \
-	sample-sources/opencl.c 												\
-	gpu/opencl/opencl-intercept.c										\
-	gpu/opencl/opencl-api.c													\
-	gpu/opencl/opencl-memory-manager.c  						\
-	gpu/opencl/opencl-activity-translate.c					\
-	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c			\
-	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c	\
-	gpu/instrumentation/opencl-instrumentation.c
+if OPT_ENABLE_OPENCL
+MY_OPENCL_FILES = sample-sources/opencl.c \
+	gpu/opencl/opencl-intercept.c \
+	gpu/opencl/opencl-api.c \
+	gpu/opencl/opencl-memory-manager.c \
+	gpu/opencl/opencl-activity-translate.c 
 endif
 
-
+if OPT_ENABLE_GTPIN
+MY_GTPIN_FILES = \
+	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+	gpu/instrumentation/opencl-instrumentation.c
+endif
 
 if OPT_ENABLE_ROCM
-MY_ROCM_FILES=\
+MY_ROCM_FILES =\
 	sample-sources/amd.c \
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c 	
@@ -529,7 +529,7 @@ endif
 
 if OPT_ENABLE_LEVEL0
 MY_LEVEL0_FILES=\
-	sample-sources/level0.c \	
+	sample-sources/level0.c \
 	gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -984,13 +984,24 @@ if OPT_ENABLE_LEVEL0
 
   MY_CPP_DEFINES  += -DHPCRUN_SS_LEVEL0
 endif
-if ENABLE_OPENCL
-	libhpcrun_la_CFLAGS += $(OPENCL_IFLAGS)
-	libhpcrun_la_LDFLAGS += "-L/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64 -lgtpin"
-	libhpcrun_la_LDFLAGS +=	"-Wl,-rpath='/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64'"
+
+if OPT_ENABLE_OPENCL
+  libhpcrun_la_SOURCES  += $(MY_OPENCL_FILES)
+  libhpcrun_la_CPPFLAGS += -DENABLE_OPENCL
+  libhpcrun_la_CFLAGS   += $(OPT_OPENCL_IFLAGS)
+
   MY_CPP_DEFINES  += -DHPCRUN_SS_OPENCL
 endif 
 
+if OPT_ENABLE_GTPIN
+  libhpcrun_la_SOURCES  += $(MY_GTPIN_FILES)
+  libhpcrun_la_CPPFLAGS += -DENABLE_GTPIN
+  libhpcrun_la_CFLAGS   += $(OPT_GTPIN_IFLAGS)
+  libhpcrun_la_LDFLAGS  += $(OPT_GTPIN_LDFLAGS)
+
+  MY_CPP_DEFINES  += -DHPCRUN_SS_GTPIN
+endif
+
 if UNW_LIBUNW
   UNW_SOURCE_FILES = $(UNW_LIBUNW_FILES)
   UNW_INCLUDE_DIRS = $(UNW_LIBUNW_INCLUDE_DIRS)
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 80190787c6..566d0a5da9 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -166,37 +166,26 @@ pkglibexec_PROGRAMS =
 @OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_15 = sample-sources/perf/kernel_blocking_stub.c
 @OPT_PAPI_CUPTI_TRUE@am__append_16 = sample-sources/papi-c-cupti.c
 
-#MY_OPENCL_FILES =
-@ENABLE_OPENCL_TRUE@am__append_17 = \
-@ENABLE_OPENCL_TRUE@	sample-sources/opencl.c 												\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c										\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c													\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c  						\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c					\
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c			\
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c	\
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/opencl-instrumentation.c
-
-
 #
 # BG/Q backend requires special treatment to avoid deadlocks
 #
-@OPT_BGQ_BACKEND_TRUE@am__append_18 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
+@OPT_BGQ_BACKEND_TRUE@am__append_17 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
+@OPT_BGQ_BACKEND_TRUE@am__append_18 = -I$(srcdir)/utilities/bgq-cnk
 @OPT_BGQ_BACKEND_TRUE@am__append_19 = -I$(srcdir)/utilities/bgq-cnk
-@OPT_BGQ_BACKEND_TRUE@am__append_20 = -I$(srcdir)/utilities/bgq-cnk
+@OPT_ENABLE_MPI_WRAP_TRUE@am__append_20 = mpi-overrides.c
 @OPT_ENABLE_MPI_WRAP_TRUE@am__append_21 = mpi-overrides.c
-@OPT_ENABLE_MPI_WRAP_TRUE@am__append_22 = mpi-overrides.c
+@OPT_BGQ_BACKEND_TRUE@am__append_22 = utilities/bgq-cnk/libhardware-thread-id.la
 @OPT_BGQ_BACKEND_TRUE@am__append_23 = utilities/bgq-cnk/libhardware-thread-id.la
-@OPT_BGQ_BACKEND_TRUE@am__append_24 = utilities/bgq-cnk/libhardware-thread-id.la
 
 #  libhpcrun_o_LDFLAGS   += $(ZLIB_HPCLINK_LIB)
 
 #-----------------------------------------------------------
 # whirled peas
 #-----------------------------------------------------------
-@HOST_OS_LINUX_TRUE@am__append_25 = $(MY_LINUX_DYNAMIC_FILES)
+@HOST_OS_LINUX_TRUE@am__append_24 = $(MY_LINUX_DYNAMIC_FILES)
+@HOST_CPU_MIPS_TRUE@am__append_25 = $(MY_MIPS_FILES)
 @HOST_CPU_MIPS_TRUE@am__append_26 = $(MY_MIPS_FILES)
-@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_FILES)
+@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_28 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_29 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_30 = $(MY_MIPS_INCLUDE_DIRS)
@@ -207,15 +196,15 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_MIPS_TRUE@am__append_35 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_36 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_37 = $(MY_MIPS_INCLUDE_DIRS)
-@HOST_CPU_MIPS_TRUE@am__append_38 = $(MY_MIPS_INCLUDE_DIRS)
 
 # Note: setting CCASFLAGS here is a no-op hack with the side effect of
 # prefixing the tramp.s file names so they will be compiled separately
 # for .o and .so targets.  CFLAGS does this for the .c files, but
 # CFLAGS doesn't apply to .s files.  See the automake docs section
 # 8.3.9.2, Objects created with both libtool and without.
+@HOST_CPU_PPC_TRUE@am__append_38 = $(MY_PPC_FILES)
 @HOST_CPU_PPC_TRUE@am__append_39 = $(MY_PPC_FILES)
-@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_FILES)
+@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_41 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_42 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_43 = $(MY_PPC_INCLUDE_DIRS)
@@ -228,13 +217,13 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_PPC_TRUE@am__append_50 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_51 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_52 = $(MY_PPC_INCLUDE_DIRS)
-@HOST_CPU_PPC_TRUE@am__append_53 = $(MY_PPC_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_53 = $(MY_X86_FILES)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_54 = $(MY_X86_FILES)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_FILES)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_56 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCRUN_LIBS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(XED2_HPCLINK_LIBS) 
+@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(XED2_HPCRUN_LIBS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCLINK_LIBS) 
+@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_60 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_61 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_62 = $(MY_X86_INCLUDE_DIRS)
@@ -246,9 +235,9 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_X86_FAMILY_TRUE@am__append_68 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_69 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_70 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_71 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_IA64_TRUE@am__append_71 = $(MY_IA64_FILES)
 @HOST_CPU_IA64_TRUE@am__append_72 = $(MY_IA64_FILES)
-@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_FILES)
+@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_74 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_75 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_76 = $(MY_IA64_INCLUDE_DIRS)
@@ -259,9 +248,9 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_IA64_TRUE@am__append_81 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_82 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_83 = $(MY_IA64_INCLUDE_DIRS)
-@HOST_CPU_IA64_TRUE@am__append_84 = $(MY_IA64_INCLUDE_DIRS)
+@HOST_CPU_AARCH64_TRUE@am__append_84 = $(MY_AARCH64_FILES)
 @HOST_CPU_AARCH64_TRUE@am__append_85 = $(MY_AARCH64_FILES)
-@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_FILES)
+@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_87 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_88 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_89 = $(MY_AARCH64_INCLUDE_DIRS)
@@ -274,43 +263,50 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_AARCH64_TRUE@am__append_96 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_97 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_98 = $(MY_AARCH64_INCLUDE_DIRS)
-@HOST_CPU_AARCH64_TRUE@am__append_99 = $(MY_AARCH64_INCLUDE_DIRS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(MY_PAPI_FILES)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_INC_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = $(PAPI_LD_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_103 = -DHPCRUN_SS_PAPI
+@OPT_PAPI_DYNAMIC_TRUE@am__append_99 = $(MY_PAPI_FILES)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(PAPI_INC_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_LD_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_CUPTI_TRUE@am__append_103 = $(MY_CUPTI_FILES)
 @OPT_ENABLE_CUPTI_TRUE@am__append_104 = $(MY_CUPTI_FILES)
-@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(MY_CUPTI_FILES)
-@OPT_ENABLE_CUPTI_TRUE@am__append_106 = $(CUPTI_INC_FLGS)
-@OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA
-@OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS)
-@OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI
-@OPT_PAPI_STATIC_TRUE@am__append_110 = $(MY_PAPI_FILES)
-@OPT_PAPI_STATIC_TRUE@am__append_111 = $(PAPI_INC_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_112 = $(PAPI_LD_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_113 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(CUPTI_INC_FLGS)
+@OPT_ENABLE_CUPTI_TRUE@am__append_106 = -DHPCRUN_SS_NVIDIA
+@OPT_PAPI_CUPTI_TRUE@am__append_107 = $(CUPTI_INC_FLGS)
+@OPT_PAPI_CUPTI_TRUE@am__append_108 = -DHPCRUN_SS_PAPI_C_CUPTI
+@OPT_PAPI_STATIC_TRUE@am__append_109 = $(MY_PAPI_FILES)
+@OPT_PAPI_STATIC_TRUE@am__append_110 = $(PAPI_INC_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_111 = $(PAPI_LD_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_112 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_UPC_TRUE@am__append_113 = $(MY_UPC_FILES)
 @OPT_ENABLE_UPC_TRUE@am__append_114 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_115 = $(MY_UPC_FILES)
+@OPT_ENABLE_UPC_TRUE@am__append_115 = $(OPT_UPC_IFLAGS)
 @OPT_ENABLE_UPC_TRUE@am__append_116 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_LDFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_LDFLAGS)
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_118 = -DLUSH_PTHREADS
 @OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_119 = -DLUSH_PTHREADS
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_120 = -DLUSH_PTHREADS
-@OPT_ENABLE_CUDA_TRUE@am__append_121 = $(MY_CUDA_FILES)
-@OPT_ENABLE_CUDA_TRUE@am__append_122 = -DENABLE_CUDA
-@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(OPT_CUDA_IFLAGS)
-@OPT_ENABLE_CUDA_TRUE@am__append_124 = $(MY_CUDA_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_125 = $(MY_ROCM_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_126 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(OPT_ROCM_IFLAGS)
-@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DHPCRUN_SS_AMD
-@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = $(MY_LEVEL0_FILES)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = -DENABLE_LEVEL0
-@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(OPT_LEVEL0_IFLAGS)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DHPCRUN_SS_LEVEL0
-@ENABLE_OPENCL_TRUE@am__append_133 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_134 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_135 = libagent-pthread.la \
+@OPT_ENABLE_CUDA_TRUE@am__append_120 = $(MY_CUDA_FILES)
+@OPT_ENABLE_CUDA_TRUE@am__append_121 = -DENABLE_CUDA
+@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(MY_CUDA_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_124 = $(MY_ROCM_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_125 = -DENABLE_ROCM
+@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(OPT_ROCM_IFLAGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DHPCRUN_SS_AMD
+@OPT_ENABLE_LEVEL0_TRUE@am__append_128 = $(MY_LEVEL0_FILES)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = -DENABLE_LEVEL0
+@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(OPT_LEVEL0_IFLAGS)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DHPCRUN_SS_LEVEL0
+@OPT_ENABLE_OPENCL_TRUE@am__append_132 = $(MY_OPENCL_FILES)
+@OPT_ENABLE_OPENCL_TRUE@am__append_133 = -DENABLE_OPENCL
+@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(OPT_OPENCL_IFLAGS)
+@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_GTPIN_TRUE@am__append_136 = $(MY_GTPIN_FILES)
+@OPT_ENABLE_GTPIN_TRUE@am__append_137 = -DENABLE_GTPIN
+@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(OPT_GTPIN_IFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(OPT_GTPIN_LDFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DHPCRUN_SS_GTPIN
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_141 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_142 = libagent-pthread.la \
 @OPT_ENABLE_LUSH_TRUE@	libagent-tbb.la
 subdir = src/tool/hpcrun
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -452,7 +448,7 @@ libagent_tbb_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
 	-o $@
 @OPT_ENABLE_LUSH_TRUE@am_libagent_tbb_la_rpath = -rpath $(pkglibdir)
 libhpcrun_la_DEPENDENCIES = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_23)
+	$(am__append_22)
 am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	disabled.c closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -517,12 +513,6 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/perf/perfmon-util-dummy.c \
 	sample-sources/perf/kernel_blocking.c \
 	sample-sources/perf/kernel_blocking_stub.c \
-	sample-sources/opencl.c gpu/opencl/opencl-intercept.c \
-	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
-	gpu/opencl/opencl-activity-translate.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-	gpu/instrumentation/opencl-instrumentation.c \
 	fnbounds/fnbounds_client.c fnbounds/fnbounds_dynamic.c \
 	monitor-exts/openmp.c hpcrun_dlfns.c custom-init-dynamic.c \
 	os/linux/dylib.c unwind/common/default_validation_summary.c \
@@ -547,11 +537,17 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-api.c gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
 	gpu/level0/level0-data-node.c gpu/level0/level0-event-map.c \
-	gpu/level0/level0-handle-map.c unwind/common/backtrace.c \
-	unwind/common/unw-throw.c unwind/common/binarytree_uwi.c \
-	unwind/common/interval_t.c unwind/common/libunw_intervals.c \
-	unwind/common/stack_troll.c unwind/common/uw_hash.c \
-	unwind/common/uw_recipe_map.c \
+	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
+	gpu/opencl/opencl-intercept.c gpu/opencl/opencl-api.c \
+	gpu/opencl/opencl-memory-manager.c \
+	gpu/opencl/opencl-activity-translate.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
+	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+	gpu/instrumentation/opencl-instrumentation.c \
+	unwind/common/backtrace.c unwind/common/unw-throw.c \
+	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
+	unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
+	unwind/common/uw_hash.c unwind/common/uw_recipe_map.c \
 	unwind/generic-libunwind/libunw-unwind.c \
 	unwind/ppc64/ppc64-unwind.c \
 	unwind/ppc64/ppc64-unwind-interval.c \
@@ -586,16 +582,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 @OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_11 = sample-sources/perf/libhpcrun_la-perfmon-util-dummy.lo
 @OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_12 = sample-sources/perf/libhpcrun_la-kernel_blocking.lo
 @OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_13 = sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo
-@ENABLE_OPENCL_TRUE@am__objects_14 =  \
-@ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo \
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo \
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo
-am__objects_15 = utilities/libhpcrun_la-first_func.lo \
+am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-main.lo libhpcrun_la-disabled.lo \
 	libhpcrun_la-closure-registry.lo \
 	libhpcrun_la-cct_insert_backtrace.lo \
@@ -694,39 +681,38 @@ am__objects_15 = utilities/libhpcrun_la-first_func.lo \
 	utilities/libhpcrun_la-tokenize.lo \
 	utilities/libhpcrun_la-unlink.lo $(am__objects_7) \
 	$(am__objects_8) $(am__objects_9) $(am__objects_10) \
-	$(am__objects_11) $(am__objects_12) $(am__objects_13) \
-	$(am__objects_14)
-am__objects_16 = fnbounds/libhpcrun_la-fnbounds_client.lo \
+	$(am__objects_11) $(am__objects_12) $(am__objects_13)
+am__objects_15 = fnbounds/libhpcrun_la-fnbounds_client.lo \
 	fnbounds/libhpcrun_la-fnbounds_dynamic.lo \
 	monitor-exts/libhpcrun_la-openmp.lo \
 	libhpcrun_la-hpcrun_dlfns.lo \
 	libhpcrun_la-custom-init-dynamic.lo
-am__objects_17 = os/linux/libhpcrun_la-dylib.lo
-@HOST_OS_LINUX_TRUE@am__objects_18 = $(am__objects_17)
-am__objects_19 =  \
+am__objects_16 = os/linux/libhpcrun_la-dylib.lo
+@HOST_OS_LINUX_TRUE@am__objects_17 = $(am__objects_16)
+am__objects_18 =  \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-@HOST_CPU_MIPS_TRUE@am__objects_20 = $(am__objects_19)
-am__objects_21 = trampoline/ppc64/libhpcrun_la-ppc64-tramp.lo \
+@HOST_CPU_MIPS_TRUE@am__objects_19 = $(am__objects_18)
+am__objects_20 = trampoline/ppc64/libhpcrun_la-ppc64-tramp.lo \
 	utilities/arch/ppc64/libhpcrun_la-ppc64-context-pc.lo
-@HOST_CPU_PPC_TRUE@am__objects_22 = $(am__objects_21)
-am__objects_23 = trampoline/x86-family/libhpcrun_la-x86-tramp.lo \
+@HOST_CPU_PPC_TRUE@am__objects_21 = $(am__objects_20)
+am__objects_22 = trampoline/x86-family/libhpcrun_la-x86-tramp.lo \
 	utilities/arch/x86-family/libhpcrun_la-x86-context-pc.lo
-@HOST_CPU_X86_FAMILY_TRUE@am__objects_24 = $(am__objects_23)
-am__objects_25 = trampoline/ia64/libhpcrun_la-ia64-tramp.lo \
+@HOST_CPU_X86_FAMILY_TRUE@am__objects_23 = $(am__objects_22)
+am__objects_24 = trampoline/ia64/libhpcrun_la-ia64-tramp.lo \
 	utilities/arch/ia64/libhpcrun_la-ia64-context-pc.lo
-@HOST_CPU_IA64_TRUE@am__objects_26 = $(am__objects_25)
-am__objects_27 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
+@HOST_CPU_IA64_TRUE@am__objects_25 = $(am__objects_24)
+am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
 	utilities/arch/libunwind/libhpcrun_la-libunwind-context-pc.lo
-@HOST_CPU_AARCH64_TRUE@am__objects_28 = $(am__objects_27)
-@OPT_PAPI_CUPTI_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c-cupti.lo
-@OPT_PAPI_COMPONENT_FALSE@am__objects_30 =  \
+@HOST_CPU_AARCH64_TRUE@am__objects_27 = $(am__objects_26)
+@OPT_PAPI_CUPTI_TRUE@am__objects_28 = sample-sources/libhpcrun_la-papi-c-cupti.lo
+@OPT_PAPI_COMPONENT_FALSE@am__objects_29 =  \
 @OPT_PAPI_COMPONENT_FALSE@	sample-sources/libhpcrun_la-papi.lo \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_29)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_30 = sample-sources/libhpcrun_la-papi-c.lo \
+@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_28)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c.lo \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/libhpcrun_la-papi-c-extended-info.lo \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_29)
-@OPT_PAPI_DYNAMIC_TRUE@am__objects_31 = $(am__objects_30)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_32 =  \
+@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_28)
+@OPT_PAPI_DYNAMIC_TRUE@am__objects_30 = $(am__objects_29)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_31 =  \
 @OPT_ENABLE_CUPTI_TRUE@	sample-sources/libhpcrun_la-nvidia.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cubin-hash-map.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cubin-id-map.lo \
@@ -737,16 +723,16 @@ am__objects_27 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-analysis.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-api.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-gpu-api.lo
-@OPT_ENABLE_CUPTI_TRUE@am__objects_33 = $(am__objects_32)
-am__objects_34 = sample-sources/libhpcrun_la-upc.lo
-@OPT_ENABLE_UPC_TRUE@am__objects_35 = $(am__objects_34)
-am__objects_36 =
-@OPT_ENABLE_ROCM_TRUE@am__objects_37 =  \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = $(am__objects_31)
+am__objects_33 = sample-sources/libhpcrun_la-upc.lo
+@OPT_ENABLE_UPC_TRUE@am__objects_34 = $(am__objects_33)
+am__objects_35 =
+@OPT_ENABLE_ROCM_TRUE@am__objects_36 =  \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo
-@OPT_ENABLE_ROCM_TRUE@am__objects_38 = $(am__objects_37)
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 =  \
+@OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_38 =  \
 @OPT_ENABLE_LEVEL0_TRUE@	sample-sources/libhpcrun_la-level0.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-api.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-command-list-map.lo \
@@ -754,24 +740,35 @@ am__objects_36 =
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-data-node.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-event-map.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-handle-map.lo
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_40 = $(am__objects_39)
-am__objects_41 = unwind/common/libhpcrun_la-backtrace.lo \
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = $(am__objects_38)
+@OPT_ENABLE_OPENCL_TRUE@am__objects_40 =  \
+@OPT_ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
+@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
+@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo
+@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
+am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
 	unwind/common/libhpcrun_la-unw-throw.lo
-am__objects_42 = $(am__objects_41) \
+am__objects_45 = $(am__objects_44) \
 	unwind/common/libhpcrun_la-binarytree_uwi.lo \
 	unwind/common/libhpcrun_la-interval_t.lo \
 	unwind/common/libhpcrun_la-libunw_intervals.lo \
 	unwind/common/libhpcrun_la-stack_troll.lo \
 	unwind/common/libhpcrun_la-uw_hash.lo \
 	unwind/common/libhpcrun_la-uw_recipe_map.lo
-am__objects_43 = $(am__objects_42) \
+am__objects_46 = $(am__objects_45) \
 	unwind/generic-libunwind/libhpcrun_la-libunw-unwind.lo \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_44 = $(am__objects_42) \
+am__objects_47 = $(am__objects_45) \
 	unwind/ppc64/libhpcrun_la-ppc64-unwind.lo \
 	unwind/ppc64/libhpcrun_la-ppc64-unwind-interval.lo \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_45 = $(am__objects_42) \
+am__objects_48 = $(am__objects_45) \
 	unwind/x86-family/libhpcrun_la-x86-all.lo \
 	unwind/x86-family/libhpcrun_la-amd-xop.lo \
 	unwind/x86-family/libhpcrun_la-x86-cold-path.lo \
@@ -791,15 +788,16 @@ am__objects_45 = $(am__objects_42) \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-32bit-icc-variant.lo \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-fail-intervals.lo \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-pgi-mp_pexit.lo
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_46 = $(am__objects_45)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_46 = $(am__objects_44)
-@UNW_LIBUNW_TRUE@am__objects_46 = $(am__objects_43)
-am_libhpcrun_la_OBJECTS = $(am__objects_15) $(am__objects_16) \
-	$(am__objects_18) $(am__objects_20) $(am__objects_22) \
-	$(am__objects_24) $(am__objects_26) $(am__objects_28) \
-	$(am__objects_31) $(am__objects_33) $(am__objects_35) \
-	$(am__objects_36) $(am__objects_38) $(am__objects_40) \
-	$(am__objects_46) utilities/libhpcrun_la-last_func.lo
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_49 = $(am__objects_48)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_49 = $(am__objects_47)
+@UNW_LIBUNW_TRUE@am__objects_49 = $(am__objects_46)
+am_libhpcrun_la_OBJECTS = $(am__objects_14) $(am__objects_15) \
+	$(am__objects_17) $(am__objects_19) $(am__objects_21) \
+	$(am__objects_23) $(am__objects_25) $(am__objects_27) \
+	$(am__objects_30) $(am__objects_32) $(am__objects_34) \
+	$(am__objects_35) $(am__objects_37) $(am__objects_39) \
+	$(am__objects_41) $(am__objects_43) $(am__objects_49) \
+	utilities/libhpcrun_la-last_func.lo
 libhpcrun_la_OBJECTS = $(am_libhpcrun_la_OBJECTS)
 libhpcrun_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libhpcrun_la_CFLAGS) \
@@ -933,12 +931,6 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/perf/perfmon-util-dummy.c \
 	sample-sources/perf/kernel_blocking.c \
 	sample-sources/perf/kernel_blocking_stub.c \
-	sample-sources/opencl.c gpu/opencl/opencl-intercept.c \
-	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
-	gpu/opencl/opencl-activity-translate.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-	gpu/instrumentation/opencl-instrumentation.c \
 	fnbounds/fnbounds_static.c custom-init-static.c \
 	unwind/common/default_validation_summary.c \
 	trampoline/ppc64/ppc64-tramp.s \
@@ -983,27 +975,19 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	unwind/x86-family/manual-intervals/x86-fail-intervals.c \
 	unwind/x86-family/manual-intervals/x86-pgi-mp_pexit.c \
 	utilities/last_func.c
-@HOST_CPU_PPC_TRUE@am__objects_47 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
-@HOST_CPU_PPC_FALSE@am__objects_48 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_49 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
+@HOST_CPU_PPC_TRUE@am__objects_50 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
+@HOST_CPU_PPC_FALSE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-linux_perf.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_event_open.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf-util.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_mmap.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_skid.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_50 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_51 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
-@ENABLE_OPENCL_TRUE@am__objects_54 = sample-sources/libhpcrun_o-opencl.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-intercept.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-api.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-memory-manager.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/instrumentation/libhpcrun_o-opencl-instrumentation.$(OBJEXT)
-am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_55 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
+am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-main.$(OBJEXT) libhpcrun_o-disabled.$(OBJEXT) \
 	libhpcrun_o-closure-registry.$(OBJEXT) \
 	libhpcrun_o-cct_insert_backtrace.$(OBJEXT) \
@@ -1113,29 +1097,28 @@ am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	utilities/libhpcrun_o-line_wrapping.$(OBJEXT) \
 	utilities/libhpcrun_o-timer.$(OBJEXT) \
 	utilities/libhpcrun_o-tokenize.$(OBJEXT) \
-	utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_47) \
-	$(am__objects_48) $(am__objects_49) $(am__objects_50) \
+	utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_50) \
 	$(am__objects_51) $(am__objects_52) $(am__objects_53) \
-	$(am__objects_54)
-am__objects_56 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
+	$(am__objects_54) $(am__objects_55) $(am__objects_56)
+am__objects_58 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
 	libhpcrun_o-custom-init-static.$(OBJEXT)
-am__objects_57 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-@HOST_CPU_MIPS_TRUE@am__objects_58 = $(am__objects_57)
-am__objects_59 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
+am__objects_59 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
+@HOST_CPU_MIPS_TRUE@am__objects_60 = $(am__objects_59)
+am__objects_61 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
 	utilities/arch/ppc64/libhpcrun_o-ppc64-context-pc.$(OBJEXT)
-@HOST_CPU_PPC_TRUE@am__objects_60 = $(am__objects_59)
-am__objects_61 =  \
+@HOST_CPU_PPC_TRUE@am__objects_62 = $(am__objects_61)
+am__objects_63 =  \
 	trampoline/x86-family/libhpcrun_o-x86-tramp.$(OBJEXT) \
 	utilities/arch/x86-family/libhpcrun_o-x86-context-pc.$(OBJEXT)
-@HOST_CPU_X86_FAMILY_TRUE@am__objects_62 = $(am__objects_61)
-am__objects_63 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
+@HOST_CPU_X86_FAMILY_TRUE@am__objects_64 = $(am__objects_63)
+am__objects_65 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
 	utilities/arch/ia64/libhpcrun_o-ia64-context-pc.$(OBJEXT)
-@HOST_CPU_IA64_TRUE@am__objects_64 = $(am__objects_63)
-am__objects_65 =  \
+@HOST_CPU_IA64_TRUE@am__objects_66 = $(am__objects_65)
+am__objects_67 =  \
 	trampoline/aarch64/libhpcrun_o-aarch64-tramp.$(OBJEXT) \
 	utilities/arch/libunwind/libhpcrun_o-libunwind-context-pc.$(OBJEXT)
-@HOST_CPU_AARCH64_TRUE@am__objects_66 = $(am__objects_65)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_67 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
+@HOST_CPU_AARCH64_TRUE@am__objects_68 = $(am__objects_67)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-hash-map.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-id-map.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-symbols.$(OBJEXT) \
@@ -1145,33 +1128,33 @@ am__objects_65 =  \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-analysis.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-api.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-gpu-api.$(OBJEXT)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_68 = $(am__objects_67)
-@OPT_PAPI_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
-@OPT_PAPI_COMPONENT_FALSE@am__objects_70 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_69)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_70 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = $(am__objects_69)
+@OPT_PAPI_CUPTI_TRUE@am__objects_71 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
+@OPT_PAPI_COMPONENT_FALSE@am__objects_72 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_71)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/libhpcrun_o-papi-c-extended-info.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_69)
-@OPT_PAPI_STATIC_TRUE@am__objects_71 = $(am__objects_70)
-am__objects_72 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
-@OPT_ENABLE_UPC_TRUE@am__objects_73 = $(am__objects_72)
-am__objects_74 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_71)
+@OPT_PAPI_STATIC_TRUE@am__objects_73 = $(am__objects_72)
+am__objects_74 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
+@OPT_ENABLE_UPC_TRUE@am__objects_75 = $(am__objects_74)
+am__objects_76 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
 	unwind/common/libhpcrun_o-unw-throw.$(OBJEXT)
-am__objects_75 = $(am__objects_74) \
+am__objects_77 = $(am__objects_76) \
 	unwind/common/libhpcrun_o-binarytree_uwi.$(OBJEXT) \
 	unwind/common/libhpcrun_o-interval_t.$(OBJEXT) \
 	unwind/common/libhpcrun_o-libunw_intervals.$(OBJEXT) \
 	unwind/common/libhpcrun_o-stack_troll.$(OBJEXT) \
 	unwind/common/libhpcrun_o-uw_hash.$(OBJEXT) \
 	unwind/common/libhpcrun_o-uw_recipe_map.$(OBJEXT)
-am__objects_76 = $(am__objects_75) \
+am__objects_78 = $(am__objects_77) \
 	unwind/generic-libunwind/libhpcrun_o-libunw-unwind.$(OBJEXT) \
 	unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_77 = $(am__objects_75) \
+am__objects_79 = $(am__objects_77) \
 	unwind/ppc64/libhpcrun_o-ppc64-unwind.$(OBJEXT) \
 	unwind/ppc64/libhpcrun_o-ppc64-unwind-interval.$(OBJEXT) \
 	unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_78 = $(am__objects_75) \
+am__objects_80 = $(am__objects_77) \
 	unwind/x86-family/libhpcrun_o-x86-all.$(OBJEXT) \
 	unwind/x86-family/libhpcrun_o-amd-xop.$(OBJEXT) \
 	unwind/x86-family/libhpcrun_o-x86-cold-path.$(OBJEXT) \
@@ -1191,17 +1174,17 @@ am__objects_78 = $(am__objects_75) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-32bit-icc-variant.$(OBJEXT) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-fail-intervals.$(OBJEXT) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-pgi-mp_pexit.$(OBJEXT)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_79 = $(am__objects_78)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_79 = $(am__objects_77)
-@UNW_LIBUNW_TRUE@am__objects_79 = $(am__objects_76)
-am_libhpcrun_o_OBJECTS = $(am__objects_55) $(am__objects_56) \
-	$(am__objects_58) $(am__objects_60) $(am__objects_62) \
-	$(am__objects_64) $(am__objects_66) $(am__objects_68) \
-	$(am__objects_71) $(am__objects_73) $(am__objects_36) \
-	$(am__objects_79) utilities/libhpcrun_o-last_func.$(OBJEXT)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_81 = $(am__objects_80)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_81 = $(am__objects_79)
+@UNW_LIBUNW_TRUE@am__objects_81 = $(am__objects_78)
+am_libhpcrun_o_OBJECTS = $(am__objects_57) $(am__objects_58) \
+	$(am__objects_60) $(am__objects_62) $(am__objects_64) \
+	$(am__objects_66) $(am__objects_68) $(am__objects_70) \
+	$(am__objects_73) $(am__objects_75) $(am__objects_35) \
+	$(am__objects_81) utilities/libhpcrun_o-last_func.$(OBJEXT)
 libhpcrun_o_OBJECTS = $(am_libhpcrun_o_OBJECTS)
 libhpcrun_o_DEPENDENCIES = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_24)
+	$(am__append_23)
 libhpcrun_o_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libhpcrun_o_CFLAGS) \
 	$(CFLAGS) $(libhpcrun_o_LDFLAGS) $(LDFLAGS) -o $@
@@ -1494,9 +1477,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
@@ -1682,10 +1674,10 @@ bin_SCRIPTS = $(am__append_4) $(am__append_6)
 pkglibexec_SCRIPTS = $(am__append_1)
 include_HEADERS = $(am__append_2)
 pkglib_LIBRARIES = $(am__append_5)
-pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_134) \
-	$(am__append_135)
-BUILT_SOURCES = $(am__append_21)
-CLEANFILES = $(am__append_22)
+pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_141) \
+	$(am__append_142)
+BUILT_SOURCES = $(am__append_20)
+CLEANFILES = $(am__append_21)
 PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@ 
 PAPI_LD_FLGS = @OPT_PAPI_LDFLAGS@
 CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@
@@ -1771,10 +1763,10 @@ UNW_MIPS_INCLUDE_DIRS = \
 
 UNW_MIPS_LD_FLAGS = 
 MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
-	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_18) \
-	$(am__append_103) $(am__append_107) $(am__append_109) \
-	$(am__append_113) $(am__append_128) $(am__append_132) \
-	$(am__append_133)
+	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_17) \
+	$(am__append_102) $(am__append_106) $(am__append_108) \
+	$(am__append_112) $(am__append_127) $(am__append_131) \
+	$(am__append_135) $(am__append_140)
 MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -1829,7 +1821,7 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	utilities/tokenize.h utilities/tokenize.c utilities/unlink.h \
 	utilities/unlink.c $(am__append_8) $(am__append_9) \
 	$(am__append_10) $(am__append_12) $(am__append_13) \
-	$(am__append_14) $(am__append_15) $(am__append_17)
+	$(am__append_14) $(am__append_15)
 MY_DYNAMIC_FILES = \
 	fnbounds/fnbounds_client.c	\
 	fnbounds/fnbounds_dynamic.c	\
@@ -1881,6 +1873,17 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cupti-api.c			\
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cupti-gpu-api.c		
 
+@OPT_ENABLE_OPENCL_TRUE@MY_OPENCL_FILES = sample-sources/opencl.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c 
+
+@OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/opencl-instrumentation.c
+
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
@@ -1932,15 +1935,16 @@ MY_AARCH64_INCLUDE_DIRS = \
 	-I$(srcdir)/utilities/arch/aarch64
 
 libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \
-	$(am__append_25) $(am__append_26) $(am__append_39) \
-	$(am__append_54) $(am__append_72) $(am__append_85) \
-	$(am__append_100) $(am__append_104) $(am__append_114) \
-	$(am__append_121) $(am__append_125) $(am__append_129) \
-	$(UNW_SOURCE_FILES) utilities/last_func.c
+	$(am__append_24) $(am__append_25) $(am__append_38) \
+	$(am__append_53) $(am__append_71) $(am__append_84) \
+	$(am__append_99) $(am__append_103) $(am__append_113) \
+	$(am__append_120) $(am__append_124) $(am__append_128) \
+	$(am__append_132) $(am__append_136) $(UNW_SOURCE_FILES) \
+	utilities/last_func.c
 libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \
-	$(am__append_27) $(am__append_40) $(am__append_55) \
-	$(am__append_73) $(am__append_86) $(am__append_105) \
-	$(am__append_110) $(am__append_115) $(am__append_124) \
+	$(am__append_26) $(am__append_39) $(am__append_54) \
+	$(am__append_72) $(am__append_85) $(am__append_104) \
+	$(am__append_109) $(am__append_114) $(am__append_123) \
 	$(UNW_SOURCE_FILES) utilities/last_func.c
 libhpcrun_wrap_a_SOURCES = \
 	monitor-exts/openmp.c
@@ -1985,58 +1989,58 @@ libhpctoolkit_a_SOURCES = \
 # cppflags
 #-----------------------------------------------------------
 libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(LIBUNWIND_CPPFLAGS_DYN) \
-	$(MY_INCLUDE_DIRS) $(am__append_19) $(am__append_28) \
-	$(am__append_41) $(am__append_56) $(am__append_74) \
-	$(am__append_87) $(am__append_101) $(am__append_106) \
-	$(am__append_108) $(am__append_116) $(am__append_119) \
-	$(am__append_122) $(am__append_126) $(am__append_130) \
-	$(UNW_INCLUDE_DIRS)
+	$(MY_INCLUDE_DIRS) $(am__append_18) $(am__append_27) \
+	$(am__append_40) $(am__append_55) $(am__append_73) \
+	$(am__append_86) $(am__append_100) $(am__append_105) \
+	$(am__append_107) $(am__append_115) $(am__append_118) \
+	$(am__append_121) $(am__append_125) $(am__append_129) \
+	$(am__append_133) $(am__append_137) $(UNW_INCLUDE_DIRS)
 libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(LIBUNWIND_CPPFLAGS_STAT) $(MY_INCLUDE_DIRS) $(am__append_20) \
-	$(am__append_29) $(am__append_42) $(am__append_57) \
-	$(am__append_75) $(am__append_88) $(am__append_111) \
-	$(am__append_117) $(am__append_120) $(UNW_INCLUDE_DIRS)
+	$(LIBUNWIND_CPPFLAGS_STAT) $(MY_INCLUDE_DIRS) $(am__append_19) \
+	$(am__append_28) $(am__append_41) $(am__append_56) \
+	$(am__append_74) $(am__append_87) $(am__append_110) \
+	$(am__append_116) $(am__append_119) $(UNW_INCLUDE_DIRS)
 libhpcrun_wrap_a_CPPFLAGS = \
 	-DHPCRUN_STATIC_LINK		\
 	$(MY_CPP_DEFINES)		\
 	$(MY_INCLUDE_DIRS)
 
 libhpcrun_ga_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_30) $(am__append_43) $(am__append_60) \
-	$(am__append_76) $(am__append_89) $(UNW_INCLUDE_DIRS)
+	$(am__append_29) $(am__append_42) $(am__append_59) \
+	$(am__append_75) $(am__append_88) $(UNW_INCLUDE_DIRS)
 libhpcrun_ga_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_31) $(am__append_44) \
-	$(am__append_61) $(am__append_77) $(am__append_90) \
+	$(MY_INCLUDE_DIRS) $(am__append_30) $(am__append_43) \
+	$(am__append_60) $(am__append_76) $(am__append_89) \
 	$(UNW_INCLUDE_DIRS)
 libhpcrun_gprof_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_45) $(am__append_62) $(am__append_91)
+	$(am__append_44) $(am__append_61) $(am__append_90)
 libhpcrun_gprof_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_46) \
-	$(am__append_63) $(am__append_92)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_45) \
+	$(am__append_62) $(am__append_91)
 libhpcrun_io_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_32) $(am__append_47) $(am__append_64) \
-	$(am__append_78) $(am__append_93) $(UNW_INCLUDE_DIRS)
+	$(am__append_31) $(am__append_46) $(am__append_63) \
+	$(am__append_77) $(am__append_92) $(UNW_INCLUDE_DIRS)
 libhpcrun_io_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_33) $(am__append_48) \
-	$(am__append_65) $(am__append_79) $(am__append_94) \
+	$(MY_INCLUDE_DIRS) $(am__append_32) $(am__append_47) \
+	$(am__append_64) $(am__append_78) $(am__append_93) \
 	$(UNW_INCLUDE_DIRS)
 libhpcrun_memleak_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_34) $(am__append_49) $(am__append_66) \
-	$(am__append_80) $(am__append_95) $(UNW_INCLUDE_DIRS)
+	$(am__append_33) $(am__append_48) $(am__append_65) \
+	$(am__append_79) $(am__append_94) $(UNW_INCLUDE_DIRS)
 libhpcrun_memleak_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_35) \
-	$(am__append_50) $(am__append_67) $(am__append_81) \
-	$(am__append_96) $(UNW_INCLUDE_DIRS)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_34) \
+	$(am__append_49) $(am__append_66) $(am__append_80) \
+	$(am__append_95) $(UNW_INCLUDE_DIRS)
 libhpcrun_pthread_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_36) $(am__append_51) $(am__append_68) \
-	$(am__append_82) $(am__append_97) $(UNW_INCLUDE_DIRS)
+	$(am__append_35) $(am__append_50) $(am__append_67) \
+	$(am__append_81) $(am__append_96) $(UNW_INCLUDE_DIRS)
 libhpcrun_pthread_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_37) \
-	$(am__append_52) $(am__append_69) $(am__append_83) \
-	$(am__append_98) $(UNW_INCLUDE_DIRS)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_36) \
+	$(am__append_51) $(am__append_68) $(am__append_82) \
+	$(am__append_97) $(UNW_INCLUDE_DIRS)
 libhpcrun_mpi_la_CPPFLAGS = $(MY_CPP_DEFINES) -I$(MPI_INC) \
-	$(MY_INCLUDE_DIRS) $(am__append_38) $(am__append_53) \
-	$(am__append_70) $(am__append_84) $(am__append_99) \
+	$(MY_INCLUDE_DIRS) $(am__append_37) $(am__append_52) \
+	$(am__append_69) $(am__append_83) $(am__append_98) \
 	$(UNW_INCLUDE_DIRS)
 libhpctoolkit_la_CPPFLAGS = \
 	$(MY_CPP_DEFINES)		\
@@ -2052,8 +2056,8 @@ libhpctoolkit_a_CPPFLAGS = \
 # cflags
 #-----------------------------------------------------------
 libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
-	$(am__append_123) $(am__append_127) $(am__append_131) \
-	$(GOTCHA_IFLAGS)
+	$(am__append_122) $(am__append_126) $(am__append_130) \
+	$(am__append_134) $(am__append_138) $(GOTCHA_IFLAGS)
 libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
 libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2072,14 +2076,15 @@ libhpcrun_mpi_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 # ldflags
 #-----------------------------------------------------------
 libhpcrun_la_LIBADD = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_23)
+	$(am__append_22)
 libhpcrun_o_LDADD = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_24)
+	$(am__append_23)
 libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \
 	-lpthread -lrt -lelf -L$(LIBELF_LIB) $(LIBUNWIND_LDFLAGS_DYN) \
 	$(LZMA_LDFLAGS_DYN) $(PERFMON_LDFLAGS_DYN) $(MBEDTLS_LIBS) \
-	$(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \
-	$(am__append_118) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
+	$(OPT_ROCM_LDFLAGS) $(am__append_57) $(am__append_101) \
+	$(am__append_117) $(am__append_139) $(GOTCHA_LDFLAGS) \
+	$(UNW_DYNAMIC_LD_FLAGS)
 libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_gprof_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_io_la_LDFLAGS = -Wl,-Bsymbolic
@@ -2087,9 +2092,9 @@ libhpcrun_memleak_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_pthread_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_mpi_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_o_LDFLAGS = $(LIBUNWIND_LDFLAGS_STAT) \
-	$(PERFMON_LDFLAGS_STAT) $(am__append_59) $(am__append_112) \
+	$(PERFMON_LDFLAGS_STAT) $(am__append_58) $(am__append_111) \
 	$(UNW_STATIC_LD_FLAGS)
-MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \
+MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_70) \
 	$(UNW_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
 @HOST_CPU_PPC_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
@@ -2629,40 +2634,6 @@ sample-sources/perf/libhpcrun_la-kernel_blocking.lo:  \
 sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo:  \
 	sample-sources/perf/$(am__dirstamp) \
 	sample-sources/perf/$(DEPDIR)/$(am__dirstamp)
-sample-sources/libhpcrun_la-opencl.lo: sample-sources/$(am__dirstamp) \
-	sample-sources/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/$(am__dirstamp):
-	@$(MKDIR_P) gpu/opencl
-	@: > gpu/opencl/$(am__dirstamp)
-gpu/opencl/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) gpu/opencl/$(DEPDIR)
-	@: > gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-intercept.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/$(am__dirstamp):
-	@$(MKDIR_P) gpu/instrumentation
-	@: > gpu/instrumentation/$(am__dirstamp)
-gpu/instrumentation/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) gpu/instrumentation/$(DEPDIR)
-	@: > gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo:  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo:  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo:  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_la-fnbounds_client.lo: fnbounds/$(am__dirstamp) \
 	fnbounds/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_la-fnbounds_dynamic.lo: fnbounds/$(am__dirstamp) \
@@ -2835,6 +2806,40 @@ gpu/level0/libhpcrun_la-level0-event-map.lo:  \
 gpu/level0/libhpcrun_la-level0-handle-map.lo:  \
 	gpu/level0/$(am__dirstamp) \
 	gpu/level0/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-opencl.lo: sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/$(am__dirstamp):
+	@$(MKDIR_P) gpu/opencl
+	@: > gpu/opencl/$(am__dirstamp)
+gpu/opencl/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/opencl/$(DEPDIR)
+	@: > gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-intercept.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/$(am__dirstamp):
+	@$(MKDIR_P) gpu/instrumentation
+	@: > gpu/instrumentation/$(am__dirstamp)
+gpu/instrumentation/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/instrumentation/$(DEPDIR)
+	@: > gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_la-backtrace.lo:  \
 	unwind/common/$(am__dirstamp) \
 	unwind/common/$(DEPDIR)/$(am__dirstamp)
@@ -3268,30 +3273,6 @@ sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT):  \
 sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT):  \
 	sample-sources/perf/$(am__dirstamp) \
 	sample-sources/perf/$(DEPDIR)/$(am__dirstamp)
-sample-sources/libhpcrun_o-opencl.$(OBJEXT):  \
-	sample-sources/$(am__dirstamp) \
-	sample-sources/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-intercept.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-api.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-memory-manager.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.$(OBJEXT):  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.$(OBJEXT):  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_o-opencl-instrumentation.$(OBJEXT):  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT):  \
 	fnbounds/$(am__dirstamp) fnbounds/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT):  \
@@ -3754,9 +3735,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-list-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-process.Plo@am__quote@
@@ -3785,10 +3763,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_cilk_la-agent-cilk.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_pthread_la-agent-pthread.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_tbb_la-agent-tbb.Plo@am__quote@
@@ -3884,7 +3858,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-nvidia.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-idle.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-mutex.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-cupti.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-extended-info.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po@am__quote@
@@ -5139,62 +5112,6 @@ sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo: sample-sources/perf/ke
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo `test -f 'sample-sources/perf/kernel_blocking_stub.c' || echo '$(srcdir)/'`sample-sources/perf/kernel_blocking_stub.c
 
-sample-sources/libhpcrun_la-opencl.lo: sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-opencl.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_la-opencl.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-
-gpu/opencl/libhpcrun_la-opencl-intercept.lo: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-intercept.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_la-opencl-intercept.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-
-gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-api.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_la-opencl-api.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-
-gpu/opencl/libhpcrun_la-opencl-memory-manager.lo: gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-memory-manager.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_la-opencl-memory-manager.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-
-gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-activity-translate.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_la-opencl-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-
-gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo: gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
-
 fnbounds/libhpcrun_la-fnbounds_client.lo: fnbounds/fnbounds_client.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT fnbounds/libhpcrun_la-fnbounds_client.lo -MD -MP -MF fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Tpo -c -o fnbounds/libhpcrun_la-fnbounds_client.lo `test -f 'fnbounds/fnbounds_client.c' || echo '$(srcdir)/'`fnbounds/fnbounds_client.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Tpo fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Plo
@@ -5454,6 +5371,62 @@ gpu/level0/libhpcrun_la-level0-handle-map.lo: gpu/level0/level0-handle-map.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/level0/libhpcrun_la-level0-handle-map.lo `test -f 'gpu/level0/level0-handle-map.c' || echo '$(srcdir)/'`gpu/level0/level0-handle-map.c
 
+sample-sources/libhpcrun_la-opencl.lo: sample-sources/opencl.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-opencl.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_la-opencl.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
+
+gpu/opencl/libhpcrun_la-opencl-intercept.lo: gpu/opencl/opencl-intercept.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-intercept.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_la-opencl-intercept.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
+
+gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-api.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_la-opencl-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
+
+gpu/opencl/libhpcrun_la-opencl-memory-manager.lo: gpu/opencl/opencl-memory-manager.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-memory-manager.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_la-opencl-memory-manager.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
+
+gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-activity-translate.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_la-opencl-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
+
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+
+gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo: gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+
 unwind/common/libhpcrun_la-backtrace.lo: unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT unwind/common/libhpcrun_la-backtrace.lo -MD -MP -MF unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo -c -o unwind/common/libhpcrun_la-backtrace.lo `test -f 'unwind/common/backtrace.c' || echo '$(srcdir)/'`unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Plo
@@ -7526,118 +7499,6 @@ sample-sources/perf/libhpcrun_o-kernel_blocking_stub.obj: sample-sources/perf/ke
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/perf/libhpcrun_o-kernel_blocking_stub.obj `if test -f 'sample-sources/perf/kernel_blocking_stub.c'; then $(CYGPATH_W) 'sample-sources/perf/kernel_blocking_stub.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/perf/kernel_blocking_stub.c'; fi`
 
-sample-sources/libhpcrun_o-opencl.o: sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-opencl.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo -c -o sample-sources/libhpcrun_o-opencl.o `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_o-opencl.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-opencl.o `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-
-sample-sources/libhpcrun_o-opencl.obj: sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-opencl.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo -c -o sample-sources/libhpcrun_o-opencl.obj `if test -f 'sample-sources/opencl.c'; then $(CYGPATH_W) 'sample-sources/opencl.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/opencl.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_o-opencl.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-opencl.obj `if test -f 'sample-sources/opencl.c'; then $(CYGPATH_W) 'sample-sources/opencl.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/opencl.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-intercept.o: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-intercept.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-intercept.o `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_o-opencl-intercept.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-intercept.o `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-
-gpu/opencl/libhpcrun_o-opencl-intercept.obj: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-intercept.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-intercept.obj `if test -f 'gpu/opencl/opencl-intercept.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-intercept.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-intercept.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_o-opencl-intercept.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-intercept.obj `if test -f 'gpu/opencl/opencl-intercept.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-intercept.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-intercept.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-api.o: gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-api.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-api.o `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_o-opencl-api.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-api.o `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-
-gpu/opencl/libhpcrun_o-opencl-api.obj: gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-api.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-api.obj `if test -f 'gpu/opencl/opencl-api.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-api.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_o-opencl-api.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-api.obj `if test -f 'gpu/opencl/opencl-api.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-api.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-memory-manager.o: gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-memory-manager.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.o `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_o-opencl-memory-manager.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.o `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-
-gpu/opencl/libhpcrun_o-opencl-memory-manager.obj: gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-memory-manager.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.obj `if test -f 'gpu/opencl/opencl-memory-manager.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-memory-manager.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-memory-manager.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_o-opencl-memory-manager.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.obj `if test -f 'gpu/opencl/opencl-memory-manager.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-memory-manager.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-memory-manager.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-activity-translate.o: gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-activity-translate.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.o `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_o-opencl-activity-translate.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.o `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-
-gpu/opencl/libhpcrun_o-opencl-activity-translate.obj: gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-activity-translate.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.obj `if test -f 'gpu/opencl/opencl-activity-translate.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-activity-translate.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_o-opencl-activity-translate.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.obj `if test -f 'gpu/opencl/opencl-activity-translate.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-activity-translate.c'; fi`
-
-gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-
-gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-data-map.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-data-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c'; fi`
-
-gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.o `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-
-gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-gtpin-instrumentation-kernel-memory-map.obj `if test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; then $(CYGPATH_W) 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c'; fi`
-
-gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o: gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.o `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
-
-gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj: gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj `if test -f 'gpu/instrumentation/opencl-instrumentation.c'; then $(CYGPATH_W) 'gpu/instrumentation/opencl-instrumentation.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/opencl-instrumentation.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_o-opencl-instrumentation.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_o-opencl-instrumentation.obj `if test -f 'gpu/instrumentation/opencl-instrumentation.c'; then $(CYGPATH_W) 'gpu/instrumentation/opencl-instrumentation.c'; else $(CYGPATH_W) '$(srcdir)/gpu/instrumentation/opencl-instrumentation.c'; fi`
-
 fnbounds/libhpcrun_o-fnbounds_static.o: fnbounds/fnbounds_static.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT fnbounds/libhpcrun_o-fnbounds_static.o -MD -MP -MF fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Tpo -c -o fnbounds/libhpcrun_o-fnbounds_static.o `test -f 'fnbounds/fnbounds_static.c' || echo '$(srcdir)/'`fnbounds/fnbounds_static.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Tpo fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Po
@@ -8873,9 +8734,6 @@ endef
 @OPT_ENABLE_MPI_WRAP_TRUE@mpi-overrides.c: $(srcdir)/sample-sources/$(MPI_PROTO_FILE)
 @OPT_ENABLE_MPI_WRAP_TRUE@	$(PYTHON) $(srcdir)/sample-sources/make-wrappers.py \
 @OPT_ENABLE_MPI_WRAP_TRUE@		--f77symbol $(F77_SYMBOLS) $(srcdir)/sample-sources/$(MPI_PROTO_FILE)
-@ENABLE_OPENCL_TRUE@	libhpcrun_la_CFLAGS += $(OPENCL_IFLAGS)
-@ENABLE_OPENCL_TRUE@	libhpcrun_la_LDFLAGS += "-L/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64 -lgtpin"
-@ENABLE_OPENCL_TRUE@	libhpcrun_la_LDFLAGS +=	"-Wl,-rpath='/home/aarontcopal2/Documents/inteloneapi/gtpin/Profilers/Lib/intel64'"
 
 #-----------------------------------------------------------
 # local hooks
diff --git a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
index 695da3069e..fe77a33bf2 100644
--- a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
+++ b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
@@ -311,9 +311,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcserver/Makefile.in b/src/tool/hpcserver/Makefile.in
index 5c1d265a5a..71e3486625 100644
--- a/src/tool/hpcserver/Makefile.in
+++ b/src/tool/hpcserver/Makefile.in
@@ -365,9 +365,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcserver/mpi/Makefile.in b/src/tool/hpcserver/mpi/Makefile.in
index 467606c9a5..586726c2b0 100644
--- a/src/tool/hpcserver/mpi/Makefile.in
+++ b/src/tool/hpcserver/mpi/Makefile.in
@@ -373,9 +373,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcstruct/Makefile.in b/src/tool/hpcstruct/Makefile.in
index a39dec41d1..18e5d3e025 100644
--- a/src/tool/hpcstruct/Makefile.in
+++ b/src/tool/hpcstruct/Makefile.in
@@ -405,9 +405,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpctracedump/Makefile.in b/src/tool/hpctracedump/Makefile.in
index 61d5b3f16d..cd2eef79a8 100644
--- a/src/tool/hpctracedump/Makefile.in
+++ b/src/tool/hpctracedump/Makefile.in
@@ -351,9 +351,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/misc/Makefile.in b/src/tool/misc/Makefile.in
index de8f4f89f4..704457677d 100644
--- a/src/tool/misc/Makefile.in
+++ b/src/tool/misc/Makefile.in
@@ -306,9 +306,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/xprof/Makefile.in b/src/tool/xprof/Makefile.in
index edef2ccc2c..0a4a968e9b 100644
--- a/src/tool/xprof/Makefile.in
+++ b/src/tool/xprof/Makefile.in
@@ -372,9 +372,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@

From 08bd009fb90df3e7fee8c10b2f10376912298243 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
Date: Sun, 13 Sep 2020 16:58:48 +0000
Subject: [PATCH 022/177] added source code dumps for opencl

---
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 3a33d0748c..44901c4302 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -118,8 +118,6 @@ setDebugInfoFullFileName
 )
 {
 	if (debugInfoFullFileName == NULL) {
-		//size_t fileNameLength = strlen(fileName);
-		//debugInfoFullFileName = (char*) malloc(sizeof(fileNameLength));
 		debugInfoFullFileName = fileName;	
 	}
 }
@@ -184,29 +182,31 @@ clCreateProgramWithSource_wrapper
  cl_int* errcode_ret
 )
 {
-	clcreateprogramwithsource_t clCreateProgramWithSource_wrappee =
-		GOTCHA_GET_TYPED_WRAPPEE(clCreateProgramWithSource_handle, clcreateprogramwithsource_t);
-	return clCreateProgramWithSource_wrappee(context, count, strings, lengths, errcode_ret);
-	/*
 	ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
 
 	FILE *f_ptr;
 	for (int i = 0; i < (int)count; i++) {
 		// what if a single file has multiple kernels?
-		char *filename = "add.src"; // we need to add logic to get filenames by reading the strings contents
-		//char *filename = getKernelNameFromSourceCode(strings[i]);
+		// we need to add logic to get filenames by reading the strings contents
+		char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
+		// using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+		char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
+		*filename = fileno + '\0';
 		f_ptr = fopen(filename, "w");
 		fwrite(strings[i], lengths[i], 1, f_ptr);
 	}
 	fclose(f_ptr);
-	*/
+	
+	clcreateprogramwithsource_t clCreateProgramWithSource_wrappee =
+		GOTCHA_GET_TYPED_WRAPPEE(clCreateProgramWithSource_handle, clcreateprogramwithsource_t);
+	return clCreateProgramWithSource_wrappee(context, count, strings, lengths, errcode_ret);
 }
 
 
 // we are dumping the debuginfo temporarily since the binary does not have debugsection
 // poorly written code: FIXME
 static char*
-dumpIntelGPUBinary(cl_program program, size_t *fileNameSize) {
+dumpIntelGPUBinary(cl_program program) {
 	int device_count = 1;
 	cl_int status = CL_SUCCESS;
 	size_t *binary_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
@@ -239,7 +239,6 @@ dumpIntelGPUBinary(cl_program program, size_t *fileNameSize) {
 	assert(status == CL_SUCCESS);
 
 	char *debuginfoFileName = "opencl_main.debuginfo";
-	*fileNameSize = strlen(debuginfoFileName);
 	bin_ptr = fopen(debuginfoFileName, "wb");
 	fwrite(debug_info[0], debug_info_size[0], 1, bin_ptr);
 	fclose(bin_ptr);
@@ -255,8 +254,7 @@ clBuildProgramCallback
 	void* user_data
 )
 {
-	size_t fileNameSize;
-	char* debugInfoFullFileName = dumpIntelGPUBinary(program, &fileNameSize);
+	char* debugInfoFullFileName = dumpIntelGPUBinary(program);
 	setDebugInfoFullFileName(debugInfoFullFileName);
 }
 

From e2b007d34938d8d1423f2e22fc844d0b02fc6a2a Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Mon, 14 Sep 2020 01:24:32 +0000
Subject: [PATCH 023/177] Line formatting

---
 .../instrumentation/opencl-instrumentation.c  | 338 +++++++++---------
 1 file changed, 171 insertions(+), 167 deletions(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 64d8176029..c4c32a309d 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -89,8 +89,8 @@ static atomic_long correlation_id;
 static void
 knobAddBool
 (
-	const char *name,
-	bool value
+ const char *name,
+ bool value
 )
 {
   GTPinKnob knob = KNOB_FindArg(name);
@@ -106,7 +106,7 @@ knobAddBool
 static uint32_t
 getCorrelationId
 (
-  void
+ void
 )
 {
   return atomic_fetch_add(&correlation_id, 1);
@@ -116,22 +116,22 @@ getCorrelationId
 static void
 createKernelNode
 (
-	uint64_t correlation_id
+ uint64_t correlation_id
 )
 {
-	cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
-	gpu_correlation_id_map_insert(correlation_id, correlation_id);
+  cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
+  gpu_correlation_id_map_insert(correlation_id, correlation_id);
 
-	gpu_op_ccts_t gpu_op_ccts;
-	gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
-	gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, gpu_placeholder_type_kernel);
+  gpu_op_ccts_t gpu_op_ccts;
+  gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
+  gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, gpu_placeholder_type_kernel);
 
-	hpcrun_safe_enter();
-	gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
-	hpcrun_safe_exit();
+  hpcrun_safe_enter();
+  gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
+  hpcrun_safe_exit();
 
-	gpu_activity_channel_consume(gpu_metrics_attribute);
-	uint64_t cpu_submit_time = hpcrun_nanotime();
+  gpu_activity_channel_consume(gpu_metrics_attribute);
+  uint64_t cpu_submit_time = hpcrun_nanotime();
   gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
 }
 
@@ -139,80 +139,84 @@ createKernelNode
 static uint32_t
 findKernelAndInsertToLoadMap
 (
-	uint8_t *debuginfo,
-	char *input_kernel_name
+ uint8_t *debuginfo,
+ char *input_kernel_name
 )
 {
-	const uint8_t* ptr = debuginfo;
-	const SProgramDebugDataHeaderIGC* header = (const SProgramDebugDataHeaderIGC*)(ptr);
-	ptr += sizeof(SProgramDebugDataHeaderIGC);
-
-	ETMSG(OPENCL, "Number of kernels: %d", header->NumberOfKernels);
-	for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
-		const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
-		ptr += sizeof(SKernelDebugDataHeaderIGC);
-
-		const char* kernel_name = (const char*)(ptr);
-		char *file_name = (char*)hpcrun_malloc(sizeof(kernel_name));
-		strcpy(file_name, kernel_name);
-		strcat(file_name, ".gpubin");
-
-		unsigned kernel_name_size_aligned = sizeof(uint32_t) *
-			(1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
-		ptr += kernel_name_size_aligned;
-
-		if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
-			FILE *fptr = fopen(file_name, "wb");
-			fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, fptr);
-
-			uint32_t hpctoolkit_module_id;
-			load_module_t *module = NULL;
-			hpcrun_loadmap_lock();
-			char *absoluteKernelName = realpath(file_name, NULL); 
-			if ((module = hpcrun_loadmap_findByName(absoluteKernelName)) == NULL) {
-				hpctoolkit_module_id = hpcrun_loadModule_add(absoluteKernelName);
-			} else {
-				hpctoolkit_module_id = module->id;
-			}
-			hpcrun_loadmap_unlock();
-			fclose(fptr);
-			return hpctoolkit_module_id;
-		}
-		// Should be zero for newest drivers
-		assert(kernel_header->SizeGenIsaDbgInBytes == 0);
-
-		ptr += kernel_header->SizeVisaDbgInBytes;
-		ptr += kernel_header->SizeGenIsaDbgInBytes;
-	}
-	return -1;
+  const uint8_t* ptr = debuginfo;
+  const SProgramDebugDataHeaderIGC* header = (const SProgramDebugDataHeaderIGC*)(ptr);
+  ptr += sizeof(SProgramDebugDataHeaderIGC);
+
+  ETMSG(OPENCL, "Number of kernels: %d", header->NumberOfKernels);
+  for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
+    const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
+    ptr += sizeof(SKernelDebugDataHeaderIGC);
+
+    const char* kernel_name = (const char*)(ptr);
+    char *file_name = (char*)hpcrun_malloc(sizeof(kernel_name));
+    strcpy(file_name, kernel_name);
+    strcat(file_name, ".gpubin");
+
+    unsigned kernel_name_size_aligned = sizeof(uint32_t) *
+      (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
+    ptr += kernel_name_size_aligned;
+
+    if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
+      FILE *fptr = fopen(file_name, "wb");
+      fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, fptr);
+
+      uint32_t hpctoolkit_module_id;
+      load_module_t *module = NULL;
+      char *absoluteKernelName = realpath(file_name, NULL); 
+
+      hpcrun_loadmap_lock();
+      if ((module = hpcrun_loadmap_findByName(absoluteKernelName)) == NULL) {
+        hpctoolkit_module_id = hpcrun_loadModule_add(absoluteKernelName);
+      } else {
+        hpctoolkit_module_id = module->id;
+      }
+      hpcrun_loadmap_unlock();
+
+      fclose(fptr);
+      return hpctoolkit_module_id;
+    }
+    // Should be zero for newest drivers
+    assert(kernel_header->SizeGenIsaDbgInBytes == 0);
+
+    ptr += kernel_header->SizeVisaDbgInBytes;
+    ptr += kernel_header->SizeGenIsaDbgInBytes;
+  }
+  return -1;
 }
 
 
 static uint32_t
 add_opencl_binary_to_loadmap 
 (
-	char *kernel_name
+ char *kernel_name
 )
 {
-	char *debuginfoFileName = getDebugInfoFullFileName();
-	if (debuginfoFileName == NULL) {
-		ETMSG(OPENCL, "debug file not found");
-		return -1;	
-	}
-	FILE *fptr = fopen(debuginfoFileName, "rb");
-	fseek(fptr, 0L, SEEK_END);
-	size_t debug_info_size = ftell(fptr);
-	rewind(fptr);
-	uint8_t *debug_info = (uint8_t*)hpcrun_malloc(debug_info_size);
-	fread(debug_info, debug_info_size, 1, fptr);
-	return findKernelAndInsertToLoadMap(debug_info, kernel_name);
+  char *debuginfoFileName = getDebugInfoFullFileName();
+  ETMSG(OPENCL, "OpenCL binary name %s", debuginfoFileName);
+
+  if (debuginfoFileName == NULL) {
+    ETMSG(OPENCL, "debug file not found");
+    return -1;  
+  }
+  FILE *fptr = fopen(debuginfoFileName, "rb");
+  fseek(fptr, 0L, SEEK_END);
+  size_t debug_info_size = ftell(fptr);
+  rewind(fptr);
+  uint8_t *debug_info = (uint8_t*)hpcrun_malloc(debug_info_size);
+  fread(debug_info, debug_info_size, 1, fptr);
+  return findKernelAndInsertToLoadMap(debug_info, kernel_name);
 }
 
 
 static void
 opencl_activity_notify
 (
-  void
+ void
 )
 {
   gpu_monitoring_thread_activities_ready();
@@ -222,20 +226,20 @@ opencl_activity_notify
 static void
 opencl_kernel_block_activity_translate
 (
-	gpu_activity_t *ga,
-	uint32_t correlation_id,
-	uint32_t loadmap_module_id,
-	uint64_t offset,
-	uint64_t execution_count
+ gpu_activity_t *ga,
+ uint32_t correlation_id,
+ uint32_t loadmap_module_id,
+ uint64_t offset,
+ uint64_t execution_count
 )
 {
-	memset(&ga->details.kernel_block, 0, sizeof(gpu_kernel_block_t));
-	ga->details.kernel_block.correlation_id = correlation_id;
-	ga->details.kernel_block.pc.lm_id = (uint16_t)loadmap_module_id;
-	ga->details.kernel_block.pc.lm_ip = (uintptr_t)offset;
-	ga->details.kernel_block.offset = offset;
-	ga->details.kernel_block.execution_count = execution_count;
-	ga->kind = GPU_ACTIVITY_KERNEL_BLOCK;
+  memset(&ga->details.kernel_block, 0, sizeof(gpu_kernel_block_t));
+  ga->details.kernel_block.correlation_id = correlation_id;
+  ga->details.kernel_block.pc.lm_id = (uint16_t)loadmap_module_id;
+  ga->details.kernel_block.pc.lm_ip = (uintptr_t)offset;
+  ga->details.kernel_block.offset = offset;
+  ga->details.kernel_block.execution_count = execution_count;
+  ga->kind = GPU_ACTIVITY_KERNEL_BLOCK;
 
   cstack_ptr_set(&(ga->next), 0);
 }
@@ -244,45 +248,45 @@ opencl_kernel_block_activity_translate
 static void
 opencl_kernel_block_activity_process
 (
-	gpu_activity_t *ga,
-	uint32_t correlation_id,
-	uint32_t loadmap_module_id,
-	uint64_t offset,
-	uint64_t execution_count
+ gpu_activity_t *ga,
+ uint32_t correlation_id,
+ uint32_t loadmap_module_id,
+ uint64_t offset,
+ uint64_t execution_count
 )
 {
-	opencl_kernel_block_activity_translate(ga, correlation_id, loadmap_module_id, offset, execution_count);
-	gpu_activity_process(ga);
+  opencl_kernel_block_activity_translate(ga, correlation_id, loadmap_module_id, offset, execution_count);
+  gpu_activity_process(ga);
 }
 
 
 static void
 onKernelBuild
 (
-	GTPinKernel kernel,
-	void *v
+ GTPinKernel kernel,
+ void *v
 )
 {
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
 
   assert(kernel_memory_map_lookup1((uint64_t)kernel) == 0);
   assert(kernel_data_map_lookup1((uint64_t)kernel) == 0);
-	
+
   KernelData data;
 
-	uint32_t correlation_id = getCorrelationId();
-	data.kernel_cct_correlation_id = correlation_id;
-	createKernelNode(correlation_id);
+  uint32_t correlation_id = getCorrelationId();
+  data.kernel_cct_correlation_id = correlation_id;
+  createKernelNode(correlation_id);
 
-	mem_pair_node *h;
-	mem_pair_node *current;
-	bool isHeadNull = true;
+  mem_pair_node *h;
+  mem_pair_node *current;
+  bool isHeadNull = true;
 
   for (GTPinBBL block = GTPin_BBLHead(kernel); GTPin_BBLValid(block); block = GTPin_BBLNext(block)) {
     GTPinINS head = GTPin_InsHead(block);
     assert(GTPin_InsValid(head));
-    
-		int32_t offset =  GTPin_InsOffset(head);
+
+    int32_t offset = GTPin_InsOffset(head);
 
     GTPinMem mem = NULL;
     status = GTPin_MemClaim(kernel, sizeof(uint32_t), &mem);
@@ -291,61 +295,61 @@ onKernelBuild
     status = GTPin_OpcodeprofInstrument(head, mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-		mem_pair_node *m = hpcrun_malloc(sizeof(mem_pair_node));
-		m->offset = offset;
-		m->mem = mem;
-		m->next = NULL;
-
-		if (isHeadNull == true) {
-			h = m;
-			current = m;
-			isHeadNull = false;
-		} else {
-			current->next = m;
-			current = current->next;
-		}
+    mem_pair_node *m = hpcrun_malloc(sizeof(mem_pair_node));
+    m->offset = offset;
+    m->mem = mem;
+    m->next = NULL;
+
+    if (isHeadNull == true) {
+      h = m;
+      current = m;
+      isHeadNull = false;
+    } else {
+      current->next = m;
+      current = current->next;
+    }
+  }
+  if (h != NULL) {
+    kernel_memory_map_insert1((uint64_t)kernel, h);
   }
-	if (h != NULL) {
-		kernel_memory_map_insert1((uint64_t)kernel, h);
-	}
 
-	gpu_activity_channel_consume(gpu_metrics_attribute);
+  gpu_activity_channel_consume(gpu_metrics_attribute);
 
   char kernel_name[MAX_STR_SIZE];
   status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-	// 
-	// m->next = NULL;
-	// add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
+  // 
+  // m->next = NULL;
+  // add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
   data.name = kernel_name;
   data.call_count = 0;
-	data.loadmap_module_id = add_opencl_binary_to_loadmap(kernel_name);
-	
-	kernel_data_map_insert1((uint64_t)kernel, data);
+  data.loadmap_module_id = add_opencl_binary_to_loadmap(kernel_name);
+
+  kernel_data_map_insert1((uint64_t)kernel, data);
   ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
 }
 
 
 static void
-onKernelRun
+  onKernelRun
 (
-	GTPinKernelExec kernelExec,
-	void *v
-)
+ GTPinKernelExec kernelExec,
+ void *v
+ )
 {
-	GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+  GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPin_KernelProfilingActive(kernelExec, 1);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
 }
 
 
 static void
-onKernelComplete
+  onKernelComplete
 (
-	GTPinKernelExec kernelExec,
-	void *v
-)
+ GTPinKernelExec kernelExec,
+ void *v
+ )
 {
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
@@ -353,43 +357,43 @@ onKernelComplete
   assert(kernel_data_map_lookup1((uint64_t)kernel) != 0);
   assert(kernel_memory_map_lookup1((uint64_t)kernel) != 0);
 
-	kernel_data_map_t *kernel_data_list = kernel_data_map_lookup1((uint64_t)kernel);
+  kernel_data_map_t *kernel_data_list = kernel_data_map_lookup1((uint64_t)kernel);
   KernelData data = kernel_data_list->data;
-	kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
-	mem_pair_node *block = kernel_memory_list->head;
-
- 	// get kernel cct root node from correlation_id
-	uint32_t correlation_id = data.kernel_cct_correlation_id;
-
-	while (block != NULL) {
-		/*!
-		 * @return sampling size for mem handle
-		 * @ingroup MEM
-		 * @param[in]   mem     the memory handle
-		 *
-		 * @par Availability:
-		 * - all callbacks
-		 */
+  kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
+  mem_pair_node *block = kernel_memory_list->head;
+
+  // get kernel cct root node from correlation_id
+  uint32_t correlation_id = data.kernel_cct_correlation_id;
+
+  while (block != NULL) {
+    /*!
+     * @return sampling size for mem handle
+     * @ingroup MEM
+     * @param[in]   mem     the memory handle
+     *
+     * @par Availability:
+     * - all callbacks
+     */
     uint32_t thread_count = GTPin_MemSampleLength(block->mem);
     assert(thread_count > 0);
 
     uint32_t total = 0, value = 0;
     for (uint32_t tid = 0; tid < thread_count; ++tid) {
-    	status = GTPin_MemRead(block->mem, tid, sizeof(uint32_t), (char*)(&value), NULL);
-    	assert(status == GTPINTOOL_STATUS_SUCCESS);
-    	total += value;
+      status = GTPin_MemRead(block->mem, tid, sizeof(uint32_t), (char*)(&value), NULL);
+      assert(status == GTPINTOOL_STATUS_SUCCESS);
+      total += value;
     }
 
     //block_map_t *bm = block_map_lookup1(data.block_map_root, block->offset);
     //assert(bm != 0);
-		uint64_t execution_count = total; // + bm->val 
+    uint64_t execution_count = total; // + bm->val 
     //block_map_insert1(data.block_map_root, block->offset, execution_count);
-	
-		opencl_activity_notify();	
-		gpu_activity_t gpu_activity;
-		opencl_kernel_block_activity_process(&gpu_activity, correlation_id, data.loadmap_module_id, block->offset, execution_count);
-		block = block->next;
-		//how to make offset the primary key within the cct and += the execution value for existing ccts?
+
+    opencl_activity_notify();  
+    gpu_activity_t gpu_activity;
+    opencl_kernel_block_activity_process(&gpu_activity, correlation_id, data.loadmap_module_id, block->offset, execution_count);
+    block = block->next;
+    //how to make offset the primary key within the cct and += the execution value for existing ccts?
   }
 
   ++(data.call_count);
@@ -404,26 +408,26 @@ onKernelComplete
 void
 opencl_instrumentation_initialize
 (
-  void
+ void
 )
 {
-  atomic_store(&correlation_id, 5000);	// to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 5000 (TODO:FIX)
+  atomic_store(&correlation_id, 5000);  // to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 5000 (TODO:FIX)
 }
 
 
 void enableProfiling
 (
-  void
+ void
 )
 {
   ETMSG(OPENCL, "inside enableProfiling");
-	opencl_instrumentation_initialize();
-	knobAddBool("silent_warnings", true);
+  opencl_instrumentation_initialize();
+  knobAddBool("silent_warnings", true);
 
-	/*if (utils::GetEnv("PTI_GEN12") != nullptr) {
+  /*if (utils::GetEnv("PTI_GEN12") != nullptr) {
     std::cout << "[INFO] Experimental GTPin mode: GEN12" << std::endl;
     KnobAddBool("gen12_1", true);
-  }*/
+    }*/
 
   GTPin_OnKernelBuild(onKernelBuild, NULL);
   GTPin_OnKernelRun(onKernelRun, NULL);

From c993e27f952db6079674ba8ce82f186a8708f223 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris07.ftm.alcf.anl.gov>
Date: Mon, 14 Sep 2020 15:37:57 +0000
Subject: [PATCH 024/177] added code for generating intel kernel cfg
 representation

---
 src/lib/binutils/InputFile.cpp                |   2 +-
 src/lib/binutils/Makefile.am                  |   3 +-
 src/lib/binutils/Makefile.in                  |  22 +-
 .../binutils/intel/.IntelGPUbinutils.cpp.swp  | Bin 0 -> 16384 bytes
 src/lib/binutils/intel/CreateCFG.cpp          | 215 ++++++++++++++++++
 src/lib/binutils/intel/CreateCFG.hpp          |  89 ++++++++
 .../binutils/{ => intel}/IntelGPUbinutils.cpp |  30 ++-
 .../binutils/{ => intel}/IntelGPUbinutils.hpp |   0
 8 files changed, 351 insertions(+), 10 deletions(-)
 create mode 100644 src/lib/binutils/intel/.IntelGPUbinutils.cpp.swp
 create mode 100644 src/lib/binutils/intel/CreateCFG.cpp
 create mode 100644 src/lib/binutils/intel/CreateCFG.hpp
 rename src/lib/binutils/{ => intel}/IntelGPUbinutils.cpp (89%)
 rename src/lib/binutils/{ => intel}/IntelGPUbinutils.hpp (100%)

diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index e15a4bf75f..cd284512fc 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -75,7 +75,7 @@
 
 #include "ElfHelper.hpp"
 #include "Fatbin.hpp"
-#include "IntelGPUbinutils.hpp"
+#include <intel/IntelGPUbinutils.hpp>
 #include "InputFile.hpp"
 
 
diff --git a/src/lib/binutils/Makefile.am b/src/lib/binutils/Makefile.am
index 8a35ff4e79..11af1e932d 100644
--- a/src/lib/binutils/Makefile.am
+++ b/src/lib/binutils/Makefile.am
@@ -95,7 +95,8 @@ MYSOURCES = \
 	BinUtils.hpp BinUtils.cpp \
 	VMAInterval.hpp VMAInterval.cpp \
 	Fatbin.cpp \
-	IntelGPUbinutils.cpp \
+	intel/IntelGPUbinutils.cpp \
+	intel/CreateCFG.cpp \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp 
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index bb29f47868..174ab13da7 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -147,7 +147,8 @@ am__objects_1 = libHPCbinutils_la-LM.lo libHPCbinutils_la-Seg.lo \
 	libHPCbinutils_la-BinUtils.lo libHPCbinutils_la-VMAInterval.lo \
 	libHPCbinutils_la-Fatbin.lo \
 	libHPCbinutils_la-IntelGPUbinutils.lo \
-	libHPCbinutils_la-ElfHelper.lo libHPCbinutils_la-InputFile.lo \
+	libHPCbinutils_la-CreateCFG.lo libHPCbinutils_la-ElfHelper.lo \
+	libHPCbinutils_la-InputFile.lo \
 	libHPCbinutils_la-RelocateCubin.lo
 am_libHPCbinutils_la_OBJECTS = $(am__objects_1)
 libHPCbinutils_la_OBJECTS = $(am_libHPCbinutils_la_OBJECTS)
@@ -550,7 +551,8 @@ MYSOURCES = \
 	BinUtils.hpp BinUtils.cpp \
 	VMAInterval.hpp VMAInterval.cpp \
 	Fatbin.cpp \
-	IntelGPUbinutils.cpp \
+	intel/IntelGPUbinutils.cpp \
+	intel/CreateCFG.cpp \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp 
@@ -643,6 +645,7 @@ distclean-compile:
 	-rm -f *.tab.c
 
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-BinUtils.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-CreateCFG.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Dbg-LM.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-Dbg-Proc.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-ElfHelper.Plo@am__quote@
@@ -764,12 +767,19 @@ libHPCbinutils_la-Fatbin.lo: Fatbin.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-Fatbin.lo `test -f 'Fatbin.cpp' || echo '$(srcdir)/'`Fatbin.cpp
 
-libHPCbinutils_la-IntelGPUbinutils.lo: IntelGPUbinutils.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-IntelGPUbinutils.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'IntelGPUbinutils.cpp' || echo '$(srcdir)/'`IntelGPUbinutils.cpp
+libHPCbinutils_la-IntelGPUbinutils.lo: intel/IntelGPUbinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-IntelGPUbinutils.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo $(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='IntelGPUbinutils.cpp' object='libHPCbinutils_la-IntelGPUbinutils.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelGPUbinutils.cpp' object='libHPCbinutils_la-IntelGPUbinutils.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'IntelGPUbinutils.cpp' || echo '$(srcdir)/'`IntelGPUbinutils.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
+
+libHPCbinutils_la-CreateCFG.lo: intel/CreateCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-CreateCFG.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-CreateCFG.Tpo -c -o libHPCbinutils_la-CreateCFG.lo `test -f 'intel/CreateCFG.cpp' || echo '$(srcdir)/'`intel/CreateCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libHPCbinutils_la-CreateCFG.Tpo $(DEPDIR)/libHPCbinutils_la-CreateCFG.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/CreateCFG.cpp' object='libHPCbinutils_la-CreateCFG.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-CreateCFG.lo `test -f 'intel/CreateCFG.cpp' || echo '$(srcdir)/'`intel/CreateCFG.cpp
 
 libHPCbinutils_la-ElfHelper.lo: ElfHelper.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbinutils_la-ElfHelper.lo -MD -MP -MF $(DEPDIR)/libHPCbinutils_la-ElfHelper.Tpo -c -o libHPCbinutils_la-ElfHelper.lo `test -f 'ElfHelper.cpp' || echo '$(srcdir)/'`ElfHelper.cpp
diff --git a/src/lib/binutils/intel/.IntelGPUbinutils.cpp.swp b/src/lib/binutils/intel/.IntelGPUbinutils.cpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..225242dc58b5f46a454b4a06430f6bec2cbdfc1f
GIT binary patch
literal 16384
zcmeI3U5q4E6~_zEg;o%PCc=wc8r<!k?dh3aOoZK?k=@y5c3{7l+1&?)bgHXvcNbe-
z)z+<^*<o4r$;9{|8l#CJL0OfU7{7q{=!*o24<;Bf5ET=n%NsF4F@{8c=T`Mh_v|bX
z*old$&iuQ&?#H?3{?EBzr)pZ$rxuQ=Pu4%-@VUWpuDG_@{L-CIUiti;j^nyX9Hm~|
zcf;M1eW}kwq0wR738J?w<v5=k6*y($cDmvwdaB#^(l`!J2kB(T^Cs`A-%+2QGKn`8
z2Cb=95Dn5GG*dy8>Tqg7zUPmh946Pje*aQx73l>71p{LYT<6@ed)Kri-B!L;efY+c
zW4Xmm!9c-4!9c-4!9c-4!9c-4!9c;l|2G3^|0?J6bn(i(n-AsJD>hvJlYbt`(@$^6
zzbd~Uo+vJC$j@(!OTj?FK*2!4K*2!4K*2!4K*2!4K*2!4K*2!4!2f^&*LR#U?fy*;
z{5=03p8tP#z2p2CJORE1z6lmU8QcPX{~^cuDwqRzgB!uWuXCI?z)wL3G{FPlCh)s!
z9p?q`WpDue>w}K-5AYMv1{Lse@X7~h51atkgI|>#=L=v4ync=2{1CLjE>H)5zM4AV
zPvBAT6)+2Sf|s{B&P(7~@Dz9qd=VJX1owe^!2~FQYrxfD8+hjZj&mL?g8RWvunoNO
zKF9e5_&)d^_%1jP7QrF#27!=&f>*%>a2`AXz5~7uj)A+ts{~vwfS15?;92k#_&Vr<
z`@s>gAAAbTf*J5p@CLSe9f-~TDt3F@k4fD$@RSK2*3HzJn&MBL=mtjhlQ`90YLwe?
zgUF<+v2gUnVqNn057M|N)!Z;#Rf$GMk}A`U`A)R!n>x*&a}+n}Ac@rUUVh8{T8e4a
zP1=-3B`avF3PqRQP|eJ!X|;X3T52vVEjH#WS;1cH>uOb<m9mkvk(E_#(Z165Yihe`
zRYjCZlYxgWq$|_f>zsAkL8xs@6{*_xtIl|r>CnVw{Rl(-Bv^J+t>V5;AS#Z`<!=U#
zZ}W9Kh_u2ojl)e|erX<T95^}OTx!gnSU7gHd367ghAP`PV=9YPb*!(Wxx?z9UJg9X
zr8Vf3*M%HBFt2twmtRll=(%1JZxIDS<b?xYtMVMKm+HBD=Ih;lzwF$$mT}RWLF4)R
z<Ygi===X8osYUIE$KAAhq?5=CM6>H+#)|oY+lgYs5HR)b#zMn4kfs`8`;ZQC0nuo7
zYp4xlMj@lZlCZ@T@9XHOX;yo!I5bUPdyGoDHroB5<2C8No2-uJkCUjw_IRa16qwW>
zPiuQo8jdHKRWl_+_u|xawXZkWfq5W^I%BF~E<B{SMRq{DRQI;h5SMkHV)9^IzRP(}
zH6nkJDP%70uO>mKo2m(<i9lv@*Iy2d_Ep?gX;-U$Y)sG!!XRChBs1{3%6EHiN86<&
zPE<^>WQ96a<tBl_Y_O=@XjQGa)ET5hQ3Jz-DpFaUhOY?LlwQ`6G`-T*B1r0byqlP~
zURsmoAYeHN12<Elrf4vgb&AzMhK#fp=}s8n+MX6UtpO@%V-S?;BpLK&`mU;Oyh2Nf
z@?6yLQI8&GTGDEq*v_ptBZADJ#RQQKSOz+A>>Fv?=tSmTvsZ=iBE3Oc4MI}as>Y(*
z4pOq?L`2XLqdz;)Q5rDFGuq6E>NT`YRTN7PdqIj`X<U<)+Z!ahM>SdyIigsoRT~Y2
zS3gKt*=zK0{g)T`ly^fo>uZnZVD^me^+U$8s3v0ig9LMG%%qWFywDAL+OMmH2py~;
zWEa4uKrJrJHKf1R5&=raiBT=fZ-C4IO?kmHap{=0H1u@hb-B&$Wch^&YY3~h>a)iP
zyaD=%D*8;HeTU`rVu8}L+_;-FgvmsR-RBHFNo9MUWq9yaYc=bFn0c%ZWf#XVWvd@l
zB1HtLai>%(LE06^wN$p*Ua=p5jg=%wQynQ*gkHd~h$A}&)OZs;yuPKyF~pooM?U1?
z`H2{|$EHlKKD!*Fn16$1inYlQJtovqP1Cq3Un+8YcEHF`RVO1#CPs!~MXXcBWb4JO
z4RE}*{`k0%IFQ}ENN=YoZAU7LIpYEcv0KE9F6pmr60cyyVej?6;hnT!%NPpD7q5p?
zKd}2E%cN-Aj>9lsk@2|ZT6Qq!wwh3jdQ`fMPn9h-;@3UZ8z4H-88_N$<F?{uO<Vaa
zwbr@!qLQ`P#mdOk77r4LmQ``6Ov=vrEi~!rbdQ*g?QMXUSgvb!bXgC<SA4N`#&OzZ
zOFK@~D2s{p=y8rLw=!#@MXsGV^ayLFW_?whEBD!)r2PzHJGC1)m062%E~>6i`IU;t
zmVEd0^zO;s)4Qi@YJ;2Qf+jE9N18u$eC|Z{Lbwc92-&gzig-OvIy-Ea>F$oj#{Ppx
z8uk6&=_p<awcinMHal!579#PO+_hVFpjzA6QIPKh?PMukcbS;1RG;}jjQi`;swi|^
zPCgiaM6t@aN>wR#*+`hoX}x{Hzg-01eu(qd$;HERBli>PfaZ1k4Xg9JS?3)&|3AdJ
z^<~bsa{eDazyASe{3WnGKkwhn^&{X-&iQ`_FM=1qZ@|yNGvI0P6nGMR4Lk~#K?)LZ
z2iOIs!4#;2KN1^w7No#{2s{Wr0}g>nuoG0lHt;I3gA3p}@EGvGF>nj`7`OranmEGK
z;BgRvQ{ZOsJn@A4z`dXjeo0K>8(<l{Ml9h+;5_JoPlIXjCUJxdK;jH9gBQW`;4F9$
z+zoC5e<5b@TktptfW#DbfY;#l4?y^RNjNS(1p@^G1p`~oz({D0^BBhw4jjbJTCA*c
zK$8SDqa<*4$Tc=hw8VLVEuBQ>^cj1$TFNUrrE^ZHG@eN1JfV47Cy@?0@|%h;@up_L
z+FP@-mysDKMD0qYwnPm_3e@F5RH<?>TB~D2cr~?)y8Se(MbKc+o=nQV0jIS)o9XOG
zjS+davTZ5RY%%@HnhZAPQQEGQH#D?Ief;6U15x=R4XltXSUS>_l`oX>FKFSq@V3$V
zw7spV>BBX()M(t-Y#cq9^?4I|XB6Q}LZT&!h~9b$16YC57{SG;QkHKoZ1&h{)@?6@
zVH7VXt#Z`{O-uJI9B!xy>GOIZR;*S!H!&rbQrqJ=sw->^n?rI*l;eu7n9TGC?Ge><
z{bX|1bl0shUMIg>*Uqr5^t$yFe-NEk&`Atl%5}=i+oW@9YC`k>f;Jq?F~aNhhmAMo
zC`0v3PSr+<B^)e*TKLZCOy+y^!cyt&JSx#PD8SJ(ZF;Urhi*49JJeKY!f)SULA8_f
z@(80jH=7xL%{8RYk37Qmca8&3+{<FR2W-r3sDz~}GkwXrQe~|b36%+bONUN0j~#Cu
zojXj(_*51)meIW_?{5`e&W$0aVEdU{AR8J)JM65%BNWNB<a&fu)V_VHoafm?>K;{=
zv^>dbZ_Ct&NlMhtOJ)8}Xmu%*euAv={#C6onH)Gfht@SXDRDX^!jPzmv1XX5!giBh
z<M$PQ67{eK;gLa?)W|<WKxAkl@xshRW0J(?GUctQG_O-htJp^mtwFo3lT1oh*$!?I
zTD1p%{*6_Je5o`~r<zNoqoaJ!sE#Pt(W%^4j2Fps4t_S5GePSWvQ#U@3h^Rj@HN9F
zkU_8Pd8XnLiQ!of={XB#a<(N0hDvp>q~AK40ee~U&Yk(Q7iJ})NA8emn{SAw6OuZj
zuiR#AZkL}{-_i0}qUeRTxgAg0<zUS`vI5b2s2s}|snYq(03!rv#qVKKot%ZSBf3XW
z4)wD(&@gOoC_>Yc@09GichTI@Cc+_hvALU%$h^o|r|OiDCXEk!Ar-QvX)Vj@AZ2P!
zg{M-Hzp(xv%f^vrw&n?eowC|4p$Uf0Rz8gy*;Z6Gokz;`&8D4lK847`I*7c~P=&P3
z=Dgw4yo^7dI+pT>j^hT!kh0?0Rc|~8uNa!TbPkte0XmyaSopv4+8st=l=CL(Rz&_i
zyp-%Vf=5pa(ZM*KPuzZ2o;K=m(OsF>Q8qh^+vcBlzC2|6r)HP^k!Iyx;)&_0z_<rn
corUPYYO2ke)J@l}i<ICwtM!MsU&5UK0^^(W6#xJL

literal 0
HcmV?d00001

diff --git a/src/lib/binutils/intel/CreateCFG.cpp b/src/lib/binutils/intel/CreateCFG.cpp
new file mode 100644
index 0000000000..d0edcefee4
--- /dev/null
+++ b/src/lib/binutils/intel/CreateCFG.cpp
@@ -0,0 +1,215 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <kv.hpp>
+#include <igc_binary_decoder.h>
+
+#include <iostream>
+#include <stack>
+#include <algorithm>
+#include <vector>
+#include <sstream>
+#include <set>
+#include <map> 
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "CreateCFG.hpp"
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+#define MAX_STR_SIZE 1024
+
+std::vector<int32_t> block_offsets;
+std::map<int32_t, bool> visitedBlockOffsets;
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static std::set<Edge>
+get_cfg_edges
+(
+	std::vector<uint8_t> binary,
+	size_t binary_size
+)
+{
+	KernelView kv(IGA_GEN9, binary.data(), binary.size(),
+			iga::SWSB_ENCODE_MODE::SingleDistPipe);
+	std::set<Edge> cfg_edges;
+
+	int32_t offset = 0;
+	int32_t size;
+	while (offset < binary_size) {
+		int32_t prev_block_start_offset;
+		int32_t prev_block_end_offset;
+		int32_t block_start_offset;
+		bool isStartOfBasicBlock = kv.isInstTarget(offset);
+		if (isStartOfBasicBlock) {
+			block_offsets.push_back(offset);
+			visitedBlockOffsets.insert({offset, false});
+			block_start_offset = offset;	
+		}
+		size = kv.getInstSize(offset);
+		while (!kv.isInstTarget(offset + size) && (offset + size < binary_size)) {
+			offset += size;	
+			size = kv.getInstSize(offset);
+			if (size == 0) {
+				// this is a weird edge case, what to do?
+				break;
+			}
+		}
+
+		int32_t *jump_targets = new int32_t[KV_MAX_TARGETS_PER_INSTRUCTION];
+		size_t jump_targets_count = kv.getInstTargets(offset, jump_targets);
+		int32_t next_block_start_offset = offset + size;
+		bool isFallThroughEdgeAdded = false;
+
+		for (size_t i = 0; i < jump_targets_count; i++) {
+			if (jump_targets[i] == next_block_start_offset) {
+				isFallThroughEdgeAdded = true;
+			} else if (jump_targets[i] == block_start_offset) {
+				if (block_offsets.size() >= 2) {
+					int32_t from = block_offsets[block_offsets.size() - 2];
+					int32_t from_blockEndOffset;
+					for (Edge edge: cfg_edges) {
+						if (edge.from == from && edge.to == block_start_offset) {
+							from_blockEndOffset	 = edge.from_blockEndOffset;
+						}
+					}	
+					cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 2], block_start_offset, from_blockEndOffset));
+				}
+			}
+			cfg_edges.insert(Edge(block_start_offset, jump_targets[i], next_block_start_offset - size));
+		}
+		if(!isFallThroughEdgeAdded) {
+			cfg_edges.insert(Edge(block_start_offset, next_block_start_offset, next_block_start_offset - size));
+		}
+		prev_block_start_offset = block_start_offset;
+		prev_block_end_offset = offset; 
+		offset += size;
+	}
+	cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 1], binary_size, binary_size - size));
+	return cfg_edges;
+}
+
+
+static void
+printCFGEdges
+(
+	std::set<Edge> cfg_edges
+)
+{
+	for (Edge edge: cfg_edges) {
+		std::cout << edge.from << "->" << edge.to << std::endl;	
+	}	
+}
+
+
+static void
+printBasicBlocks
+(
+	std::vector<uint8_t> binary,
+	std::set<Edge> cfg_edges
+)
+{
+	KernelView kv(IGA_GEN9, binary.data(), binary.size(), iga::SWSB_ENCODE_MODE::SingleDistPipe);
+	int32_t offset;
+	char text[MAX_STR_SIZE] = { 0 };
+	size_t length;
+	int32_t size;
+
+	for (Edge edge: cfg_edges) {
+		offset = edge.from;
+		if(edge.from == edge.to) {
+			// skip self-loops
+			continue;
+		}
+		auto it = visitedBlockOffsets.find(offset);
+		if (it->second) {
+			continue;
+		} else {
+			it->second = true;
+		}
+		std::cout << offset << " [ label=\"\\\n"; 
+		while (offset < edge.to) {
+			size = kv.getInstSize(offset);
+			length = kv.getInstSyntax(offset, text, MAX_STR_SIZE);
+			assert(length > 0);
+			std::cout << offset << ": " << text << "\\\l";
+			offset += size;
+		}
+		std::cout << "\" shape=\"box\"]; \n" << std::endl;
+	}	
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// pass Intel kernel's raw gen binary
+// kernel's text region is a raw gen binary
+// you  can find kernel nested in [debug section of GPU binary/separate debug section dump]
+void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary) {
+	std::cout << "digraph GEMM_iga {" << std::endl;
+	std::set<Edge> edges = get_cfg_edges(intelRawGenBinary, intelRawGenBinary.size());
+	printBasicBlocks(intelRawGenBinary, edges);
+	printCFGEdges(edges);
+	std::cout << "}" << std::endl;
+}
diff --git a/src/lib/binutils/intel/CreateCFG.hpp b/src/lib/binutils/intel/CreateCFG.hpp
new file mode 100644
index 0000000000..8ae82eeb17
--- /dev/null
+++ b/src/lib/binutils/intel/CreateCFG.hpp
@@ -0,0 +1,89 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <vector>
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+class Edge {
+	public:
+		int32_t from;	
+		int32_t to;
+		int32_t from_blockEndOffset;
+
+		Edge(int32_t f, int32_t t, int32_t from_b) {
+			from = f;
+			to = t;
+			from_blockEndOffset = from_b;
+		}
+
+		bool operator == (const Edge &that) const 
+		{
+			return((this->from == that.from) && (this->to == that.to));
+		}
+		
+		bool operator<(const Edge& that) const 
+		{
+			if (this->from == that.from) {
+				return (this->to < that.to);
+			} else {
+				return (this->from < that.from);
+			}
+		}
+};
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary);
diff --git a/src/lib/binutils/IntelGPUbinutils.cpp b/src/lib/binutils/intel/IntelGPUbinutils.cpp
similarity index 89%
rename from src/lib/binutils/IntelGPUbinutils.cpp
rename to src/lib/binutils/intel/IntelGPUbinutils.cpp
index 51a3e94869..6d8d0e5ebe 100644
--- a/src/lib/binutils/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUbinutils.cpp
@@ -57,8 +57,9 @@
 #include <unistd.h>
 #include <libelf.h>
 
-#include "igc_binary_decoder.h"
-#include "gen_symbols_decoder.h"
+#include <igc_binary_decoder.h>
+#include <gen_symbols_decoder.h>
+
 
 
 //******************************************************************************
@@ -69,6 +70,7 @@
 #include <lib/support/diagnostics.h>
 #include <lib/support/RealPathMgr.cpp>
 #include "IntelGPUbinutils.hpp"
+#include "CreateCFG.hpp"
 
 
 
@@ -209,6 +211,30 @@ extract_kernelelfs
 			bool result = elfFile->open(file_buffer, f_size, file_name);
 
 			filevector->push_back(elfFile);
+			
+			// start cfg generation
+			Elf *elf = elfFile->getElf();
+			file_buffer = elfFile->getMemory();
+			ElfSectionVector *sections = elfGetSectionVector(elf);
+			GElf_Ehdr ehdr_v;
+			GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
+
+			if (ehdr) {
+				for (auto si = sections->begin(); si != sections->end(); si++) {
+					Elf_Scn *scn = *si;
+					GElf_Shdr shdr_v;
+					GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
+					if (!shdr) continue;
+					char *sectionData = elfSectionGetData(file_buffer, shdr);
+					const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+					if (strcmp(section_name, ".text") == 0) {
+						std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
+								reinterpret_cast<uint8_t*>(sectionData) + kernel_header->SizeVisaDbgInBytes);
+						printCFGInDotGraph(intelRawGenBinary);
+					}
+				}
+			}
+			//end cfg generation
 		} else {
 			extractSuccess = false;
 		}
diff --git a/src/lib/binutils/IntelGPUbinutils.hpp b/src/lib/binutils/intel/IntelGPUbinutils.hpp
similarity index 100%
rename from src/lib/binutils/IntelGPUbinutils.hpp
rename to src/lib/binutils/intel/IntelGPUbinutils.hpp

From c3a0fbbdc59327056d9e86d1a2873a028656d0b4 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 15 Sep 2020 21:09:50 -0500
Subject: [PATCH 025/177] opencl writing trace files enabled

---
 src/tool/hpcrun/sample-sources/opencl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index dfe1f6c5c0..f7e9ca0468 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -66,7 +66,7 @@
 
 #define GPU_STRING "gpu=opencl"
 static device_finalizer_fn_entry_t device_finalizer_shutdown;
-
+static device_finalizer_fn_entry_t device_trace_finalizer_shutdown;
 
 
 //******************************************************************************
@@ -156,6 +156,10 @@ METHOD_FN(finalize_event_list)
   opencl_api_initialize();
   device_finalizer_shutdown.fn = opencl_api_finalize;
   device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_shutdown);
+
+  // Register shutdown functions to write trace files
+  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
+  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
 }
 
 

From 697e738da63a9e2a1cfd1db5598a78ac22a63e42 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 17 Sep 2020 03:32:27 +0000
Subject: [PATCH 026/177] Init CFG creation; at least it compiles

---
 src/lib/banal/Makefile.am                     |   7 +-
 src/lib/banal/Makefile.in                     |  74 +++--
 src/lib/banal/Struct.cpp                      |  38 ++-
 src/lib/banal/cuda/DotCFG.hpp                 |   8 +-
 src/lib/banal/intel/IntelBlock.cpp            |  34 +++
 src/lib/banal/intel/IntelBlock.hpp            |  26 ++
 src/lib/banal/intel/IntelCFGFactory.cpp       |  92 +++++++
 src/lib/banal/intel/IntelCFGFactory.hpp       |  32 +++
 src/lib/banal/intel/IntelCodeSource.cpp       |  15 ++
 src/lib/banal/intel/IntelCodeSource.hpp       |  55 ++++
 src/lib/banal/intel/IntelCreateCFG.cpp        | 213 ---------------
 src/lib/banal/intel/IntelCreateCFG.hpp        |  94 -------
 src/lib/banal/intel/IntelFunction.cpp         |  12 +
 src/lib/banal/intel/IntelFunction.hpp         |  25 ++
 src/lib/banal/intel/IntelGPUbanal.cpp         | 122 ---------
 src/lib/banal/intel/ReadIntelCFG.cpp          | 254 ++++++++++++++++++
 .../{IntelGPUbanal.hpp => ReadIntelCFG.hpp}   |  27 +-
 src/lib/binutils/ElfHelper.cpp                |  31 +++
 src/lib/binutils/ElfHelper.hpp                |   5 +-
 src/lib/binutils/Makefile.am                  |   3 +-
 src/lib/binutils/Makefile.in                  |  16 +-
 src/lib/binutils/intel/CreateCFG.cpp          | 215 ---------------
 src/lib/binutils/intel/CreateCFG.hpp          |  94 -------
 src/lib/binutils/intel/IntelGPUbinutils.cpp   |  42 +--
 24 files changed, 701 insertions(+), 833 deletions(-)
 create mode 100644 src/lib/banal/intel/IntelBlock.cpp
 create mode 100644 src/lib/banal/intel/IntelBlock.hpp
 create mode 100644 src/lib/banal/intel/IntelCFGFactory.cpp
 create mode 100644 src/lib/banal/intel/IntelCFGFactory.hpp
 create mode 100644 src/lib/banal/intel/IntelCodeSource.cpp
 create mode 100644 src/lib/banal/intel/IntelCodeSource.hpp
 delete mode 100644 src/lib/banal/intel/IntelCreateCFG.cpp
 delete mode 100644 src/lib/banal/intel/IntelCreateCFG.hpp
 create mode 100644 src/lib/banal/intel/IntelFunction.cpp
 create mode 100644 src/lib/banal/intel/IntelFunction.hpp
 delete mode 100644 src/lib/banal/intel/IntelGPUbanal.cpp
 create mode 100644 src/lib/banal/intel/ReadIntelCFG.cpp
 rename src/lib/banal/intel/{IntelGPUbanal.hpp => ReadIntelCFG.hpp} (89%)
 delete mode 100644 src/lib/binutils/intel/CreateCFG.cpp
 delete mode 100644 src/lib/binutils/intel/CreateCFG.hpp

diff --git a/src/lib/banal/Makefile.am b/src/lib/banal/Makefile.am
index addd576daf..a9248b52c4 100644
--- a/src/lib/banal/Makefile.am
+++ b/src/lib/banal/Makefile.am
@@ -78,8 +78,11 @@ MYSOURCES = \
 	cuda/CudaBlock.cpp  \
 	cuda/CudaCodeSource.cpp  \
 	cuda/ReadCubinCFG.cpp \
-	intel/IntelGPUbanal.cpp \
-	intel/IntelCreateCFG.cpp \
+	intel/IntelCFGFactory.cpp  \
+	intel/IntelFunction.cpp  \
+	intel/IntelBlock.cpp  \
+	intel/IntelCodeSource.cpp  \
+	intel/ReadIntelCFG.cpp \
 	Struct.cpp  \
 	Struct-Inline.cpp  \
 	Struct-Output.cpp
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index ebc687f19f..d1567b5143 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -143,9 +143,12 @@ am__objects_1 = cuda/libHPCbanal_la-CFGParser.lo \
 	cuda/libHPCbanal_la-CudaBlock.lo \
 	cuda/libHPCbanal_la-CudaCodeSource.lo \
 	cuda/libHPCbanal_la-ReadCubinCFG.lo \
-	intel/libHPCbanal_la-IntelGPUbanal.lo \
-	intel/libHPCbanal_la-IntelCreateCFG.lo \
-	libHPCbanal_la-Struct.lo libHPCbanal_la-Struct-Inline.lo \
+	intel/libHPCbanal_la-IntelCFGFactory.lo \
+	intel/libHPCbanal_la-IntelFunction.lo \
+	intel/libHPCbanal_la-IntelBlock.lo \
+	intel/libHPCbanal_la-IntelCodeSource.lo \
+	intel/libHPCbanal_la-ReadIntelCFG.lo libHPCbanal_la-Struct.lo \
+	libHPCbanal_la-Struct-Inline.lo \
 	libHPCbanal_la-Struct-Output.lo
 am_libHPCbanal_la_OBJECTS = $(am__objects_1)
 libHPCbanal_la_OBJECTS = $(am_libHPCbanal_la_OBJECTS)
@@ -537,8 +540,11 @@ MYSOURCES = \
 	cuda/CudaBlock.cpp  \
 	cuda/CudaCodeSource.cpp  \
 	cuda/ReadCubinCFG.cpp \
-	intel/IntelGPUbanal.cpp \
-	intel/IntelCreateCFG.cpp \
+	intel/IntelCFGFactory.cpp  \
+	intel/IntelFunction.cpp  \
+	intel/IntelBlock.cpp  \
+	intel/IntelCodeSource.cpp  \
+	intel/ReadIntelCFG.cpp \
 	Struct.cpp  \
 	Struct-Inline.cpp  \
 	Struct-Output.cpp
@@ -646,9 +652,15 @@ intel/$(am__dirstamp):
 intel/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) intel/$(DEPDIR)
 	@: > intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-IntelGPUbanal.lo: intel/$(am__dirstamp) \
+intel/libHPCbanal_la-IntelCFGFactory.lo: intel/$(am__dirstamp) \
 	intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-IntelCreateCFG.lo: intel/$(am__dirstamp) \
+intel/libHPCbanal_la-IntelFunction.lo: intel/$(am__dirstamp) \
+	intel/$(DEPDIR)/$(am__dirstamp)
+intel/libHPCbanal_la-IntelBlock.lo: intel/$(am__dirstamp) \
+	intel/$(DEPDIR)/$(am__dirstamp)
+intel/libHPCbanal_la-IntelCodeSource.lo: intel/$(am__dirstamp) \
+	intel/$(DEPDIR)/$(am__dirstamp)
+intel/libHPCbanal_la-ReadIntelCFG.lo: intel/$(am__dirstamp) \
 	intel/$(DEPDIR)/$(am__dirstamp)
 
 libHPCbanal.la: $(libHPCbanal_la_OBJECTS) $(libHPCbanal_la_DEPENDENCIES) $(EXTRA_libHPCbanal_la_DEPENDENCIES) 
@@ -678,8 +690,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CudaFunction.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-GraphReader.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-ReadCubinCFG.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelCreateCFG.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Plo@am__quote@
 
 .cpp.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
@@ -754,19 +769,40 @@ cuda/libHPCbanal_la-ReadCubinCFG.lo: cuda/ReadCubinCFG.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-ReadCubinCFG.lo `test -f 'cuda/ReadCubinCFG.cpp' || echo '$(srcdir)/'`cuda/ReadCubinCFG.cpp
 
-intel/libHPCbanal_la-IntelGPUbanal.lo: intel/IntelGPUbanal.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelGPUbanal.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Tpo -c -o intel/libHPCbanal_la-IntelGPUbanal.lo `test -f 'intel/IntelGPUbanal.cpp' || echo '$(srcdir)/'`intel/IntelGPUbanal.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelGPUbanal.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelGPUbanal.cpp' object='intel/libHPCbanal_la-IntelGPUbanal.lo' libtool=yes @AMDEPBACKSLASH@
+intel/libHPCbanal_la-IntelCFGFactory.lo: intel/IntelCFGFactory.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelCFGFactory.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Tpo -c -o intel/libHPCbanal_la-IntelCFGFactory.lo `test -f 'intel/IntelCFGFactory.cpp' || echo '$(srcdir)/'`intel/IntelCFGFactory.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelCFGFactory.cpp' object='intel/libHPCbanal_la-IntelCFGFactory.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelCFGFactory.lo `test -f 'intel/IntelCFGFactory.cpp' || echo '$(srcdir)/'`intel/IntelCFGFactory.cpp
+
+intel/libHPCbanal_la-IntelFunction.lo: intel/IntelFunction.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelFunction.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Tpo -c -o intel/libHPCbanal_la-IntelFunction.lo `test -f 'intel/IntelFunction.cpp' || echo '$(srcdir)/'`intel/IntelFunction.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelFunction.cpp' object='intel/libHPCbanal_la-IntelFunction.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelFunction.lo `test -f 'intel/IntelFunction.cpp' || echo '$(srcdir)/'`intel/IntelFunction.cpp
+
+intel/libHPCbanal_la-IntelBlock.lo: intel/IntelBlock.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelBlock.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Tpo -c -o intel/libHPCbanal_la-IntelBlock.lo `test -f 'intel/IntelBlock.cpp' || echo '$(srcdir)/'`intel/IntelBlock.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelBlock.cpp' object='intel/libHPCbanal_la-IntelBlock.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelBlock.lo `test -f 'intel/IntelBlock.cpp' || echo '$(srcdir)/'`intel/IntelBlock.cpp
+
+intel/libHPCbanal_la-IntelCodeSource.lo: intel/IntelCodeSource.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelCodeSource.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Tpo -c -o intel/libHPCbanal_la-IntelCodeSource.lo `test -f 'intel/IntelCodeSource.cpp' || echo '$(srcdir)/'`intel/IntelCodeSource.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelCodeSource.cpp' object='intel/libHPCbanal_la-IntelCodeSource.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelGPUbanal.lo `test -f 'intel/IntelGPUbanal.cpp' || echo '$(srcdir)/'`intel/IntelGPUbanal.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelCodeSource.lo `test -f 'intel/IntelCodeSource.cpp' || echo '$(srcdir)/'`intel/IntelCodeSource.cpp
 
-intel/libHPCbanal_la-IntelCreateCFG.lo: intel/IntelCreateCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelCreateCFG.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelCreateCFG.Tpo -c -o intel/libHPCbanal_la-IntelCreateCFG.lo `test -f 'intel/IntelCreateCFG.cpp' || echo '$(srcdir)/'`intel/IntelCreateCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelCreateCFG.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelCreateCFG.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelCreateCFG.cpp' object='intel/libHPCbanal_la-IntelCreateCFG.lo' libtool=yes @AMDEPBACKSLASH@
+intel/libHPCbanal_la-ReadIntelCFG.lo: intel/ReadIntelCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-ReadIntelCFG.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Tpo -c -o intel/libHPCbanal_la-ReadIntelCFG.lo `test -f 'intel/ReadIntelCFG.cpp' || echo '$(srcdir)/'`intel/ReadIntelCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Tpo intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/ReadIntelCFG.cpp' object='intel/libHPCbanal_la-ReadIntelCFG.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelCreateCFG.lo `test -f 'intel/IntelCreateCFG.cpp' || echo '$(srcdir)/'`intel/IntelCreateCFG.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-ReadIntelCFG.lo `test -f 'intel/ReadIntelCFG.cpp' || echo '$(srcdir)/'`intel/ReadIntelCFG.cpp
 
 libHPCbanal_la-Struct.lo: Struct.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbanal_la-Struct.lo -MD -MP -MF $(DEPDIR)/libHPCbanal_la-Struct.Tpo -c -o libHPCbanal_la-Struct.lo `test -f 'Struct.cpp' || echo '$(srcdir)/'`Struct.cpp
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index bfce07ecd4..210a31d52e 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -116,8 +116,7 @@
 #include "Struct-Skel.hpp"
 
 #include "cuda/ReadCubinCFG.hpp"
-
-#include "intel/IntelGPUbanal.hpp"
+#include "intel/ReadIntelCFG.hpp"
 
 #ifdef ENABLE_OPENMP
 #include <omp.h>
@@ -173,6 +172,10 @@ static Symtab * the_symtab = NULL;
 static int cuda_arch = 0;
 static size_t cubin_size = 0;
 
+// FIXME: temporary until instruction size problem is fixed
+static int intel_arch = 0;
+static std::map<int, int> inst_size;
+
 static BAnal::Struct::Options opts;
 
 //----------------------------------------------------------------------
@@ -552,20 +555,6 @@ printTime(const char *label, struct timeval *tv_prev, struct rusage *ru_prev,
   cout << endl;
 }
 
-static string
-getFileNameFromAbsolutePath(string str)
-{
-	vector <string> tokens; 
-	stringstream str_stream(str); 
-	string intermediate; 
-
-	// Tokenizing w.r.t. '/'
-	while(getline(str_stream, intermediate, '/')) { 
-		tokens.push_back(intermediate); 
-	} 
-	return tokens[tokens.size() - 1];
-}
-
 
 //
 // makeStructure -- the main entry point for hpcstruct realmain().
@@ -661,17 +650,16 @@ makeStructure(string filename,
     omp_set_num_threads(opts.jobs_parse);
 #endif
 
+    // TODO(Aaron): determine these variables
 		bool isIntelArch = true;
 		bool cfgNotPresent = true;
 		if (isIntelArch && cfgNotPresent) {
 			//std::cerr << "executing intel-gen9 specific code." << std::endl;
-			add_custom_function_object(symtab, getFileNameFromAbsolutePath(elfFile->getFileName())); //adds a dummy function object
-			code_src = new SymtabCodeSource(symtab);
-		  code_obj = new CodeObject(code_src, NULL, NULL, false, true); //last param is bool ignoreParse
-      //code_obj->parse();
-			parsable = false;
-		}
-    else if (! cuda_file) { // don't run parseapi on cuda binary
+      // TODO(Aaron): does instruction size change with different generations?
+      intel_arch = 1;
+      parsable = readIntelCFG(search_path, elfFile, the_symtab, inst_size,
+        structOpts.compute_gpu_cfg, &code_src, &code_obj);
+		} else if (! cuda_file) { // don't run parseapi on cuda binary
       code_src = new SymtabCodeSource(symtab);
       code_obj = new CodeObject(code_src);
       code_obj->parse();
@@ -1829,6 +1817,8 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
   if (cuda_arch > 0) {
     device = "NVIDIA sm_" + std::to_string(cuda_arch);
     len = (cuda_arch >= 70) ? 16 : 8;
+  } else if (intel_arch > 0) {
+    device = "INTEL GPU";
   }
   
   for (auto iit = imap.begin(); iit != imap.end(); ++iit) {
@@ -1843,6 +1833,8 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
 #else
       len = iit->second.size();
 #endif
+    } else if (intel_arch) {
+      len = inst_size.at(vma);
     }
 
     lmcache.getLineInfo(vma, filenm, line);
diff --git a/src/lib/banal/cuda/DotCFG.hpp b/src/lib/banal/cuda/DotCFG.hpp
index a0e9bd72b8..d18a0e2dc2 100644
--- a/src/lib/banal/cuda/DotCFG.hpp
+++ b/src/lib/banal/cuda/DotCFG.hpp
@@ -15,6 +15,7 @@ namespace CudaParse {
 
 struct Inst {
   int offset;
+  int size;
   bool dual_first;
   bool dual_second;
   bool is_call;
@@ -26,10 +27,13 @@ struct Inst {
   std::string target;
   std::vector<std::string> operands;
 
-  // constructor for dummy inst
-  explicit Inst(int offset) : offset(offset), dual_first(false), dual_second(false),
+  // Constructor for dummy inst
+  Inst(int offset, int size) : offset(offset), size(size), dual_first(false), dual_second(false),
     is_call(false), is_jump(false), is_sync(false) {}
 
+  explicit Inst(int offset) : Inst(offset, 0) {}
+
+  // Cuda instruction constructor
   Inst(std::string &inst_str) : offset(0), dual_first(false), dual_second(false),
     is_call(false), is_jump(false), is_sync(false) {
     if (inst_str.find("{") != std::string::npos) {  // Dual first
diff --git a/src/lib/banal/intel/IntelBlock.cpp b/src/lib/banal/intel/IntelBlock.cpp
new file mode 100644
index 0000000000..9786e6e6e9
--- /dev/null
+++ b/src/lib/banal/intel/IntelBlock.cpp
@@ -0,0 +1,34 @@
+#include "IntelBlock.hpp"
+#include <Instruction.h>
+
+
+namespace Dyninst {
+namespace ParseAPI {
+
+// Creates a block at `start` and keeps a private copy of its instruction offsets.
+IntelBlock::IntelBlock(CodeObject * o, CodeRegion * r,
+  Address start, std::vector<Offset> &offsets)
+  : Block(o, r, start),
+    _inst_offsets(offsets.begin(), offsets.end()) {
+}
+
+// Address of the last instruction; a block without instructions reports start().
+Address IntelBlock::last() const {
+  return _inst_offsets.empty() ? start() : _inst_offsets.back();
+}
+
+// Adds one placeholder entry to insns for every stored instruction offset.
+void IntelBlock::getInsns(Insns &insns) const {
+  for (auto offset : _inst_offsets) {
+#ifdef DYNINST_INSTRUCTION_PTR
+    insns.insert(std::pair<long unsigned int, 
+      InstructionAPI::InstructionPtr>(offset, NULL));  // placeholder: a null instruction pointer
+#else
+    InstructionAPI::Instruction inst;    
+    insns[offset] = inst;  // placeholder: a default-constructed instruction
+#endif
+  }
+}
+
+}
+}
diff --git a/src/lib/banal/intel/IntelBlock.hpp b/src/lib/banal/intel/IntelBlock.hpp
new file mode 100644
index 0000000000..fbdaa732c7
--- /dev/null
+++ b/src/lib/banal/intel/IntelBlock.hpp
@@ -0,0 +1,26 @@
+#ifndef BANAL_INTEL_INTEL_BLOCK_HPP
+#define BANAL_INTEL_INTEL_BLOCK_HPP
+
+#include <CFG.h>
+
+namespace Dyninst {
+namespace ParseAPI {
+// ParseAPI block whose instructions are given as an explicit list of offsets.
+class PARSER_EXPORT IntelBlock : public Block {
+ public:
+  IntelBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<Offset> &offsets);
+
+  virtual ~IntelBlock() {}
+  // Fills insns with one entry per stored instruction offset.
+  virtual void getInsns(Insns &insns) const;
+  // Returns the last stored instruction offset.
+  virtual Address last() const;
+
+ private:
+  std::vector<Offset> _inst_offsets;  // instruction offsets, in the order passed to the constructor
+};
+
+}
+}
+
+#endif
diff --git a/src/lib/banal/intel/IntelCFGFactory.cpp b/src/lib/banal/intel/IntelCFGFactory.cpp
new file mode 100644
index 0000000000..21459620bf
--- /dev/null
+++ b/src/lib/banal/intel/IntelCFGFactory.cpp
@@ -0,0 +1,92 @@
+#include "IntelCFGFactory.hpp"
+#include "IntelFunction.hpp"
+#include <iostream>
+
+#define DEBUG_CUDA_CFGFACTORY 0
+
+namespace Dyninst {
+namespace ParseAPI {
+
+// Translates the parsed function called `name` into a ParseAPI Function.
+// The first match in _functions is used; its blocks become IntelBlocks that are
+// shared through _block_filter, and each block target becomes an installed Edge.
+// Returns NULL when no parsed function carries that name.
+Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
+  std::string name, CodeObject * obj, CodeRegion * region,
+  Dyninst::InstructionSource * isrc) {
+  for (auto *parsed_func : _functions) {
+    if (parsed_func->name != name) {
+      continue;
+    }
+    IntelFunction *result = new IntelFunction(parsed_func->address, name, obj, region, isrc);
+    if (DEBUG_CUDA_CFGFACTORY) {
+      std::cout << "Function: " << parsed_func->name << " addr: 0x" <<
+        std::hex << addr << std::dec << std::endl;
+    }
+    bool entry_pending = true;  // the first block visited becomes the function entry
+    for (auto *parsed_block : parsed_func->blocks) {
+      // Reuse the block if an earlier call or edge already created it.
+      IntelBlock *source_block = NULL;
+      auto known_source = _block_filter.find(parsed_block->id);
+      if (known_source != _block_filter.end()) {
+        if (DEBUG_CUDA_CFGFACTORY) {
+          std::cout << "Old block: " << parsed_block->name << " id: " << parsed_block->id << std::endl;
+        }
+        source_block = known_source->second;
+      } else {
+        if (DEBUG_CUDA_CFGFACTORY) {
+          std::cout << "New block: " << parsed_block->name << " id: " << parsed_block->id << std::endl;
+        }
+        std::vector<Offset> source_offsets;
+        for (auto *inst : parsed_block->insts) {
+          source_offsets.push_back(inst->offset);
+        }
+        source_block = new IntelBlock(obj, region, parsed_block->address, source_offsets);
+        _block_filter[parsed_block->id] = source_block;
+        blocks_.add(source_block);
+      }
+      result->add_block(source_block);
+
+      if (entry_pending) {
+        result->setEntry(source_block);
+        entry_pending = false;
+      }
+
+      // One edge per target; target blocks are created on demand.
+      for (auto *target : parsed_block->targets) {
+        IntelBlock *target_block = NULL;
+        auto known_target = _block_filter.find(target->block->id);
+        if (known_target != _block_filter.end()) {
+          if (DEBUG_CUDA_CFGFACTORY) {
+            std::cout << "Old block: " << target->block->name << " id: " << target->block->id << std::endl;
+          }
+          target_block = known_target->second;
+        } else {
+          if (DEBUG_CUDA_CFGFACTORY) {
+            std::cout << "New block: " << target->block->name << " id: " << target->block->id << std::endl;
+          }
+          std::vector<Offset> target_offsets;
+          for (auto *inst : target->block->insts) {
+            target_offsets.push_back(inst->offset);
+          }
+          target_block = new IntelBlock(obj, region, target->block->address, target_offsets);
+          _block_filter[target->block->id] = target_block;
+          blocks_.add(target_block);
+        }
+
+        Edge *edge = new Edge(source_block, target_block, target->type);
+        edge->ignore_index();
+        if (DEBUG_CUDA_CFGFACTORY) {
+          std::cout << "Edge: "<< " -> " << target->block->name << std::endl;
+        }
+        edge->install();
+        edges_.add(edge);
+      }
+    }
+    return result;
+  }
+  return NULL;
+}
+
+}
+}
diff --git a/src/lib/banal/intel/IntelCFGFactory.hpp b/src/lib/banal/intel/IntelCFGFactory.hpp
new file mode 100644
index 0000000000..939154b16c
--- /dev/null
+++ b/src/lib/banal/intel/IntelCFGFactory.hpp
@@ -0,0 +1,32 @@
+#ifndef BANAL_INTEL_INTEL_CFG_FACTORY_HPP
+#define BANAL_INTEL_INTEL_CFG_FACTORY_HPP
+
+#include <CFGFactory.h>
+#include <unordered_map>
+
+#include "IntelBlock.hpp"
+#include "../cuda/DotCFG.hpp"
+
+namespace Dyninst {
+namespace ParseAPI {
+// CFGFactory whose mkfunc builds functions from already-parsed CudaParse::Function data.
+class PARSER_EXPORT IntelCFGFactory : public CFGFactory {   
+ public:
+  IntelCFGFactory(std::vector<CudaParse::Function *> &functions) :
+    _functions(functions) {}  // stores a reference, not a copy: the vector must outlive the factory
+  virtual ~IntelCFGFactory() {}
+
+ protected:
+  virtual Function * mkfunc(Address addr, FuncSource src, 
+    std::string name, CodeObject * obj, CodeRegion * region,
+    Dyninst::InstructionSource * isrc);
+
+ private:
+  std::vector<CudaParse::Function *> &_functions;  // parsed functions supplied by the caller
+  std::unordered_map<size_t, IntelBlock *> _block_filter;  // IntelBlocks created so far, keyed by block id
+};
+
+}
+}
+
+#endif
diff --git a/src/lib/banal/intel/IntelCodeSource.cpp b/src/lib/banal/intel/IntelCodeSource.cpp
new file mode 100644
index 0000000000..eaf2346c8b
--- /dev/null
+++ b/src/lib/banal/intel/IntelCodeSource.cpp
@@ -0,0 +1,15 @@
+#include "IntelCodeSource.hpp"
+
+namespace Dyninst {
+namespace ParseAPI {
+// Seeds _hints with one entry (entry address and name) per parsed function; `s` is not used.
+IntelCodeSource::IntelCodeSource(
+  std::vector<CudaParse::Function *> &functions, Dyninst::SymtabAPI::Symtab *s) {
+  for (auto it = functions.begin(); it != functions.end(); ++it) {
+    const Address entry = (*it)->address;
+    _hints.push_back(Hint(entry, 0, 0, (*it)->name));
+  }
+}
+
+}
+}
diff --git a/src/lib/banal/intel/IntelCodeSource.hpp b/src/lib/banal/intel/IntelCodeSource.hpp
new file mode 100644
index 0000000000..ed97cbdfd2
--- /dev/null
+++ b/src/lib/banal/intel/IntelCodeSource.hpp
@@ -0,0 +1,55 @@
+#ifndef BANAL_INTEL_INTEL_CODE_SOURCE_HPP
+#define BANAL_INTEL_INTEL_CODE_SOURCE_HPP
+
+#include <dyn_regs.h>
+#include <CodeSource.h>
+#include <Symtab.h>
+
+#include "../cuda/DotCFG.hpp"
+
+namespace Dyninst {
+namespace ParseAPI {
+  class PARSER_EXPORT IntelCodeSource : public /*Symtab */ CodeSource {  // CodeSource with stubbed-out queries
+ public:
+  IntelCodeSource(std::vector<CudaParse::Function *> &functions, 
+		Dyninst::SymtabAPI::Symtab *s);
+  ~IntelCodeSource() {}
+
+ public:
+  /** InstructionSource implementation: every query returns a fixed value (false, NULL, 0 or Arch_cuda) **/
+  virtual bool isValidAddress(const Address) const { return false; }
+  virtual void* getPtrToInstruction(const Address) const { return NULL; }
+  virtual void* getPtrToData(const Address) const { return NULL; }
+  virtual unsigned int getAddressWidth() const { return 0; }
+  virtual bool isCode(const Address) const { return false; }
+  virtual bool isData(const Address) const { return false; }
+  virtual bool isReadOnly(const Address) const { return false; }
+  virtual Address offset() const { return 0; }
+  virtual Address length() const { return 0; }
+  virtual Architecture getArch() const { return Arch_cuda; }  // NOTE(review): Arch_cuda on an Intel code source -- confirm intended
+
+  virtual bool nonReturning(Address /*func_entry*/) { return false; }                                                                                  
+	virtual bool nonReturningSyscall(int /*number*/) { return false; }
+
+	/* If the binary file type supplies per-function
+	 * TOC's (e.g. ppc64 Linux), override.
+	 */
+  virtual Address getTOC(Address) const { return _table_of_contents; }  // _table_of_contents is not declared in this class; presumably inherited -- verify
+
+  // statistics accessor (nothing is printed and have_stats() reports false)
+  virtual void print_stats() const { return; }                                                                                                         
+  virtual bool have_stats() const { return false; }
+
+  // manage statistics (every function below has an empty body)
+  virtual void incrementCounter(const std::string& /*name*/) const { return; } 
+  virtual void addCounter(const std::string& /*name*/, int /*num*/) const { return; }
+  virtual void decrementCounter(const std::string& /*name*/) const { return; }
+  virtual void startTimer(const std::string& /*name*/) const { return; } 
+  virtual void stopTimer(const std::string& /*name*/) const { return; }
+  virtual bool findCatchBlockByTryRange(Address /*given try address*/, std::set<Address> & /* catch start */)  const { return false; }
+};
+
+}
+}
+
+#endif
diff --git a/src/lib/banal/intel/IntelCreateCFG.cpp b/src/lib/banal/intel/IntelCreateCFG.cpp
deleted file mode 100644
index c5fd3abe1f..0000000000
--- a/src/lib/banal/intel/IntelCreateCFG.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <iga/kv.hpp>
-
-#include <iostream>
-#include <stack>
-#include <algorithm>
-#include <vector>
-#include <sstream>
-#include <set>
-#include <map> 
-
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <lib/binutils/intel/igc_binary_decoder.h>
-
-#include "IntelCreateCFG.hpp"
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-#define MAX_STR_SIZE 1024
-
-std::vector<int32_t> block_offsets;
-std::map<int32_t, bool> visitedBlockOffsets;
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static std::set<Edge>
-get_cfg_edges
-(
-	std::vector<uint8_t> binary,
-	size_t binary_size
-)
-{
-	KernelView kv(IGA_GEN9, binary.data(), binary.size(),
-			iga::SWSB_ENCODE_MODE::SingleDistPipe);
-	std::set<Edge> cfg_edges;
-
-	int32_t offset = 0;
-	int32_t size;
-	while (offset < binary_size) {
-		int32_t prev_block_start_offset;
-		int32_t prev_block_end_offset;
-		int32_t block_start_offset;
-		bool isStartOfBasicBlock = kv.isInstTarget(offset);
-		if (isStartOfBasicBlock) {
-			block_offsets.push_back(offset);
-			visitedBlockOffsets.insert({offset, false});
-			block_start_offset = offset;	
-		}
-		size = kv.getInstSize(offset);
-		while (!kv.isInstTarget(offset + size) && (offset + size < binary_size)) {
-			offset += size;	
-			size = kv.getInstSize(offset);
-			if (size == 0) {
-				// this is a weird edge case, what to do?
-				break;
-			}
-		}
-
-		int32_t *jump_targets = new int32_t[KV_MAX_TARGETS_PER_INSTRUCTION];
-		size_t jump_targets_count = kv.getInstTargets(offset, jump_targets);
-		int32_t next_block_start_offset = offset + size;
-		bool isFallThroughEdgeAdded = false;
-
-		for (size_t i = 0; i < jump_targets_count; i++) {
-			if (jump_targets[i] == next_block_start_offset) {
-				isFallThroughEdgeAdded = true;
-			} else if (jump_targets[i] == block_start_offset) {
-				if (block_offsets.size() >= 2) {
-					int32_t from = block_offsets[block_offsets.size() - 2];
-					int32_t from_blockEndOffset;
-					for (Edge edge: cfg_edges) {
-						if (edge.from == from && edge.to == block_start_offset) {
-							from_blockEndOffset	 = edge.from_blockEndOffset;
-						}
-					}	
-					cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 2], block_start_offset, from_blockEndOffset));
-				}
-			}
-			cfg_edges.insert(Edge(block_start_offset, jump_targets[i], next_block_start_offset - size));
-		}
-		if(!isFallThroughEdgeAdded) {
-			cfg_edges.insert(Edge(block_start_offset, next_block_start_offset, next_block_start_offset - size));
-		}
-		prev_block_start_offset = block_start_offset;
-		prev_block_end_offset = offset; 
-		offset += size;
-	}
-	cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 1], binary_size, binary_size - size));
-	return cfg_edges;
-}
-
-
-static void
-printCFGEdges
-(
-	std::set<Edge> cfg_edges
-)
-{
-	for (Edge edge: cfg_edges) {
-		std::cout << edge.from << "->" << edge.to << std::endl;	
-	}	
-}
-
-
-static void
-printBasicBlocks
-(
-	std::vector<uint8_t> binary,
-	std::set<Edge> cfg_edges
-)
-{
-	KernelView kv(IGA_GEN9, binary.data(), binary.size(), iga::SWSB_ENCODE_MODE::SingleDistPipe);
-	int32_t offset;
-	char text[MAX_STR_SIZE] = { 0 };
-	size_t length;
-	int32_t size;
-
-	for (Edge edge: cfg_edges) {
-		offset = edge.from;
-		if(edge.from == edge.to) {
-			// skip self-loops
-			continue;
-		}
-		auto it = visitedBlockOffsets.find(offset);
-		if (it->second) {
-			continue;
-		} else {
-			it->second = true;
-		}
-		std::cout << offset << " [ label=\"\\\n"; 
-		while (offset < edge.to) {
-			size = kv.getInstSize(offset);
-			length = kv.getInstSyntax(offset, text, MAX_STR_SIZE);
-			assert(length > 0);
-			std::cout << offset << ": " << text << "\\\l";
-			offset += size;
-		}
-		std::cout << "\" shape=\"box\"]; \n" << std::endl;
-	}	
-}
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-// pass Intel kernel's raw gen binary
-// kernel's text region is a raw gen binary
-// you  can find kernel nested in [debug section of GPU binary/separate debug section dump]
-void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary) {
-	std::cout << "digraph GEMM_iga {" << std::endl;
-	std::set<Edge> edges = get_cfg_edges(intelRawGenBinary, intelRawGenBinary.size());
-	printBasicBlocks(intelRawGenBinary, edges);
-	printCFGEdges(edges);
-	std::cout << "}" << std::endl;
-}
diff --git a/src/lib/banal/intel/IntelCreateCFG.hpp b/src/lib/banal/intel/IntelCreateCFG.hpp
deleted file mode 100644
index 03dac10980..0000000000
--- a/src/lib/banal/intel/IntelCreateCFG.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#ifndef BANAL_INTEL_INTEL_CREATE_CFG
-#define BANAL_INTEL_INTEL_CREATE_CFG
-
-#include <vector>
-
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-class Edge {
-	public:
-		int32_t from;	
-		int32_t to;
-		int32_t from_blockEndOffset;
-
-		Edge(int32_t f, int32_t t, int32_t from_b) {
-			from = f;
-			to = t;
-			from_blockEndOffset = from_b;
-		}
-
-		bool operator == (const Edge &that) const 
-		{
-			return((this->from == that.from) && (this->to == that.to));
-		}
-		
-		bool operator<(const Edge& that) const 
-		{
-			if (this->from == that.from) {
-				return (this->to < that.to);
-			} else {
-				return (this->from < that.from);
-			}
-		}
-};
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary);
-
-#endif
diff --git a/src/lib/banal/intel/IntelFunction.cpp b/src/lib/banal/intel/IntelFunction.cpp
new file mode 100644
index 0000000000..8184f5f9ae
--- /dev/null
+++ b/src/lib/banal/intel/IntelFunction.cpp
@@ -0,0 +1,12 @@
+#include "IntelFunction.hpp"
+
+namespace Dyninst {
+namespace ParseAPI {
+
+void IntelFunction::setEntry(Block *entry) {  // install entry as this function's entry block
+  _entry = entry;
+  _region = entry->region();  // the function adopts the region reported by its entry block
+}
+
+}
+}
diff --git a/src/lib/banal/intel/IntelFunction.hpp b/src/lib/banal/intel/IntelFunction.hpp
new file mode 100644
index 0000000000..72305a3af0
--- /dev/null
+++ b/src/lib/banal/intel/IntelFunction.hpp
@@ -0,0 +1,25 @@
+#ifndef BANAL_INTEL_INTEL_FUNCTION_HPP
+#define BANAL_INTEL_INTEL_FUNCTION_HPP
+
+#include <CFG.h>
+
+namespace Dyninst {
+namespace ParseAPI {
+
+// ParseAPI::Function subclass whose entry block can be assigned with setEntry().
+class PARSER_EXPORT IntelFunction : public ParseAPI::Function {
+ public:
+  // Forwards every argument to the base class, then sets _cache_valid to true.
+  IntelFunction(Address addr, std::string name, CodeObject *obj,
+                CodeRegion *region, InstructionSource *isource)
+      : Function(addr, name, obj, region, isource) {
+    _cache_valid = true;
+  }
+  virtual ~IntelFunction() {}
+  void setEntry(Block *entry);  // defined in IntelFunction.cpp
+};
+
+}
+}
+
+#endif
diff --git a/src/lib/banal/intel/IntelGPUbanal.cpp b/src/lib/banal/intel/IntelGPUbanal.cpp
deleted file mode 100644
index 152ba3ce9d..0000000000
--- a/src/lib/banal/intel/IntelGPUbanal.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// * BeginRiceCopyright *****************************************************
-//
-// $HeadURL$
-// $Id$
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-
-//***************************************************************************
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <iostream>
-#include <string>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <libelf.h>
-
-#include <Symtab.h>
-
-using namespace Dyninst;
-using namespace SymtabAPI;
-
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <lib/support/diagnostics.h>
-#include "IntelGPUbanal.hpp"
-
-
-
-//******************************************************************************
-// macros
-//******************************************************************************
-
-#define DBG 1
-
-#define INTEL_GPU_DEBUG_SECTION_NAME "Intel(R) OpenCL Device Debug"
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-void 
-add_custom_function_object
-(
-	Symtab* symtab,
-	std::string func_obj_name
-)
-{
-	const std::string& name = func_obj_name;
-
-	Region *reg = NULL;
-	bool status = symtab->findRegion(reg, ".text");
-	assert(status == true);
-	unsigned long reg_size = reg->getMemSize();
-
-	Symbol *custom_symbol = new Symbol(
-			name, 
-			SymtabAPI::Symbol::ST_FUNCTION, // SymbolType
-			Symbol::SL_LOCAL, //SymbolLinkage
-			SymtabAPI::Symbol::SV_DEFAULT, //SymbolVisibility
-			0, //Offset,
-			NULL, //Module *module 
-			reg, //Region *r
-			reg_size, //unsigned s
-			false, //bool d
-			false, //bool a
-			-1, //int index
-			-1, //int strindex
-			false //bool cs
-	);
-	
-	//adding the custom symbol into the symtab object
-	status = symtab->addSymbol(custom_symbol); //(Symbol *newsym)
-	assert(status == true);
-}
diff --git a/src/lib/banal/intel/ReadIntelCFG.cpp b/src/lib/banal/intel/ReadIntelCFG.cpp
new file mode 100644
index 0000000000..e5914597ad
--- /dev/null
+++ b/src/lib/banal/intel/ReadIntelCFG.cpp
@@ -0,0 +1,254 @@
+// * BeginRiceCopyright *****************************************************
+//
+// $HeadURL$
+// $Id$
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//***************************************************************************
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <iostream>
+#include <string>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <libelf.h>
+
+#include <Symtab.h>
+#include <CodeSource.h>
+#include <CodeObject.h>
+
+#include <iga/kv.hpp>
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/binutils/ElfHelper.hpp>
+#include <lib/support/diagnostics.h>
+
+#include "../cuda/DotCFG.hpp"
+#include "IntelCFGFactory.hpp"
+#include "IntelFunction.hpp"
+#include "IntelBlock.hpp"
+#include "IntelCodeSource.hpp"
+#include "ReadIntelCFG.hpp"
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define DEBUG 1
+
+#define MAX_STR_SIZE 1024
+#define INTEL_GPU_DEBUG_SECTION_NAME "Intel(R) OpenCL Device Debug"
+
+using namespace Dyninst;
+using namespace ParseAPI;
+using namespace SymtabAPI;
+using namespace InstructionAPI;
+
+// Add a local function symbol named func_obj_name to symtab; the symbol starts
+// at offset 0 of the ".text" region and its size is that region's memory size.
+static void
+addCustomFunctionObject
+(
+ const std::string &func_obj_name,
+ Symtab *symtab
+)
+{
+  // look up the region the synthetic symbol will be attached to
+  Region *text_region = NULL;
+  bool found = symtab->findRegion(text_region, ".text");
+  assert(found == true);
+  unsigned long text_size = text_region->getMemSize();
+
+  Symbol *kernel_symbol = new Symbol(
+      func_obj_name,
+      SymtabAPI::Symbol::ST_FUNCTION,  // SymbolType
+      Symbol::SL_LOCAL,                // SymbolLinkage
+      SymtabAPI::Symbol::SV_DEFAULT,   // SymbolVisibility
+      0,                               // Offset
+      NULL,                            // Module *module
+      text_region,                     // Region *r
+      text_size,                       // unsigned s
+      false,                           // bool d
+      false,                           // bool a
+      -1,                              // int index
+      -1,                              // int strindex
+      false);                          // bool cs
+  bool added = symtab->addSymbol(kernel_symbol);  // register the symbol with symtab
+  assert(added == true);
+}
+
+
+// Return the last '/'-separated component of str ("" for an empty path).
+// A single trailing '/' is ignored, so "a/b/" yields "b".
+static std::string
+getFileNameFromAbsolutePath(const std::string &str) {
+  std::string path = str;
+  // drop one trailing separator, matching '/'-delimited tokenization
+  if (!path.empty() && path[path.size() - 1] == '/') {
+    path.erase(path.size() - 1);
+  }
+  // npos + 1 wraps to 0, so a path without '/' is returned whole
+  const std::string::size_type slash = path.rfind('/');
+  return path.substr(slash + 1);
+}
+
+
+// Build basic blocks and control-flow targets for one Intel GPU kernel.
+// text_section/text_section_size give the bytes handed to KernelView; the
+// heap-allocated blocks, instructions and targets are stored in function.
+static void
+parseIntelCFG
+(
+ char *text_section,
+ int text_section_size,
+ CudaParse::Function &function
+)
+{
+  KernelView kv(IGA_GEN9, text_section, text_section_size, iga::SWSB_ENCODE_MODE::SingleDistPipe);
+  std::map<int, CudaParse::Block *> block_offset_map;
+  int offset = 0;
+  int block_id = 0;
+
+  // Construct basic blocks
+  while (offset < text_section_size) {
+    int size = kv.getInstSize(offset);
+    if (size == 0) {
+      break;  // a zero-sized instruction cannot be stepped over; stop instead of spinning
+    }
+    // each block gets its own id, and therefore its own name
+    auto *block = new CudaParse::Block(block_id, function.name + "_" + std::to_string(block_id));
+    ++block_id;
+    function.blocks.push_back(block);
+    block_offset_map[offset] = block;
+    auto *inst = new CudaParse::Inst(offset, size);
+    block->insts.push_back(inst);
+
+    // grow the block until the next instruction is a target or lies past the end
+    while (!kv.isInstTarget(offset + size) && (offset + size < text_section_size)) {
+      offset += size;
+      size = kv.getInstSize(offset);
+      if (size == 0) {
+        break;
+      }
+      inst = new CudaParse::Inst(offset, size);
+      block->insts.push_back(inst);
+    }
+
+    // inst is now the last instruction recorded in the block
+    if (kv.getOpcode(inst->offset) == iga::Op::CALL || kv.getOpcode(inst->offset) == iga::Op::CALLA) {
+      inst->is_call = true;
+    } else {
+      inst->is_jump = true;
+    }
+    offset = inst->offset + inst->size;  // move on to the first byte after this block
+  }
+
+  // Construct targets
+  std::array<int, KV_MAX_TARGETS_PER_INSTRUCTION> jump_targets;
+  for (size_t i = 0; i < function.blocks.size(); ++i) {
+    auto *block = function.blocks[i];
+    auto *inst = block->insts.back();
+    size_t jump_targets_count = kv.getInstTargets(inst->offset, jump_targets.data());
+    int next_block_start_offset = -1;  // matches no offset: the last block has no fall-through
+    if (i + 1 < function.blocks.size()) {
+      next_block_start_offset = function.blocks[i + 1]->insts.front()->offset;
+    }
+    for (size_t t = 0; t < jump_targets_count; ++t) {
+      auto *target_block = block_offset_map.at(jump_targets[t]);
+      if (jump_targets[t] != next_block_start_offset) {
+        block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::DIRECT));
+      } else if (inst->is_call) {
+        block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::CALL_FT));
+      } else {
+        block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::FALLTHROUGH));
+      }
+    }
+  }
+}
+
+
+// Parse the kernel held by elfFile: registers a synthetic function symbol in
+// the_symtab, builds *code_src / *code_obj, and fills inst_size (offset -> size).
+// Returns false, leaving both outputs untouched, when the text section size is 0.
+bool
+readIntelCFG
+(
+ const std::string &search_path,
+ ElfFile *elfFile,
+ Dyninst::SymtabAPI::Symtab *the_symtab, 
+ std::map<int, int> &inst_size,
+ bool cfg_wanted,
+ Dyninst::ParseAPI::CodeSource **code_src, 
+ Dyninst::ParseAPI::CodeObject **code_obj
+)
+{
+  // the kernel is named after the last path component of the ELF file name
+  std::string kernel_name = getFileNameFromAbsolutePath(elfFile->getFileName());
+  addCustomFunctionObject(kernel_name, the_symtab);
+
+  char *text_bytes = NULL;
+  auto text_length = elfFile->getTextSection(text_bytes);
+  if (text_length == 0) return false;
+
+  CudaParse::Function function(0, kernel_name);
+  parseIntelCFG(text_bytes, text_length, function);
+  std::vector<CudaParse::Function *> functions = {&function};
+
+  CFGFactory *cfg_fact = new IntelCFGFactory(functions);
+  *code_src = new IntelCodeSource(functions, the_symtab);
+  *code_obj = new CodeObject(*code_src, cfg_fact);
+  (*code_obj)->parse();
+
+  // record the size of every parsed instruction, keyed by its offset
+  for (auto *block : function.blocks)
+    for (auto *inst : block->insts)
+      inst_size[inst->offset] = inst->size;
+  return true;
+}
diff --git a/src/lib/banal/intel/IntelGPUbanal.hpp b/src/lib/banal/intel/ReadIntelCFG.hpp
similarity index 89%
rename from src/lib/banal/intel/IntelGPUbanal.hpp
rename to src/lib/banal/intel/ReadIntelCFG.hpp
index f2d160aaa2..5ed98fd6dd 100644
--- a/src/lib/banal/intel/IntelGPUbanal.hpp
+++ b/src/lib/banal/intel/ReadIntelCFG.hpp
@@ -46,14 +46,16 @@
 // system includes
 //******************************************************************************
 
+#ifndef BANAL_INTEL_READ_INTEL_CFG_HPP
+#define BANAL_INTEL_READ_INTEL_CFG_HPP
+
 #include <iostream>
 #include <string>
 #include <Symtab.h>
+#include <CodeSource.h>
+#include <CodeObject.h>
 
-using namespace Dyninst;
-using namespace SymtabAPI;
-
-
+#include <lib/binutils/ElfHelper.hpp>
 
 //******************************************************************************
 // type definitions
@@ -75,15 +77,20 @@ enum SHT_OPENCL : uint32_t {
     SHT_OPENCL_SPIRV_SC_VALUES = 0xff00000c          // Specialization Constants values
 };
 
-
-
 //******************************************************************************
 // interface functions
 //******************************************************************************
 
-void 
-add_custom_function_object
+bool
+readIntelCFG
 (
-	Symtab* symtab,
-	std::string func_obj_name
+ const std::string &search_path,
+ ElfFile *elfFile,
+ Dyninst::SymtabAPI::Symtab *the_symtab, 
+ std::map<int, int> &inst_size,
+ bool cfg_wanted,
+ Dyninst::ParseAPI::CodeSource **code_src, 
+ Dyninst::ParseAPI::CodeObject **code_obj
 );
+
+#endif
diff --git a/src/lib/binutils/ElfHelper.cpp b/src/lib/binutils/ElfHelper.cpp
index ff89558659..b067d20f02 100644
--- a/src/lib/binutils/ElfHelper.cpp
+++ b/src/lib/binutils/ElfHelper.cpp
@@ -175,3 +175,34 @@ elfSectionGetData
   char *sectionData = obj_ptr + shdr->sh_offset;
   return sectionData;
 }
+
+
+// Find the first section named ".text". On success, store a pointer to its
+// bytes in text_section and return its size; otherwise return 0 and leave
+// text_section unchanged.
+size_t
+ElfFile::getTextSection
+(
+ char *&text_section  // by reference so the caller actually receives the pointer
+)
+{
+  GElf_Ehdr ehdr_v;
+  GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
+  if (!ehdr) return 0;
+
+  // NOTE(review): ownership of the returned vector is not visible here -- confirm whether it must be freed
+  ElfSectionVector *sections = elfGetSectionVector(elf);
+  for (auto si = sections->begin(); si != sections->end(); si++) {
+    GElf_Shdr shdr_v;
+    GElf_Shdr *shdr = gelf_getshdr(*si, &shdr_v);
+    if (!shdr) continue;
+    const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+    // elf_strptr returns NULL on failure; strcmp must not see it
+    if (section_name && strcmp(section_name, ".text") == 0) {
+      // TODO(Aaron): can a intel GPU binary has two text sections?
+      text_section = elfSectionGetData(memPtr, shdr);
+      return shdr->sh_size;
+    }
+  }
+  return 0;
+}
diff --git a/src/lib/binutils/ElfHelper.hpp b/src/lib/binutils/ElfHelper.hpp
index 6f33a9eb18..f3ff77d60a 100644
--- a/src/lib/binutils/ElfHelper.hpp
+++ b/src/lib/binutils/ElfHelper.hpp
@@ -95,6 +95,7 @@ class ElfFile {
   char *getMemoryOriginal() { return origPtr; }
   size_t getLength() { return memLen; }
   std::string getFileName() { return fileName; }
+  size_t getTextSection(char *&text_section);
 private:
   int arch;
   char *origPtr;
@@ -104,14 +105,10 @@ class ElfFile {
   std::string fileName;
 };
 
-
 class ElfFileVector : public std::vector<ElfFile *> {};
 
-
 class ElfSectionVector : public std::vector<Elf_Scn *> {};
 
-
-
 //******************************************************************************
 // interface functions
 //******************************************************************************
diff --git a/src/lib/binutils/Makefile.am b/src/lib/binutils/Makefile.am
index 1a8b3c2cf0..1223a53e40 100644
--- a/src/lib/binutils/Makefile.am
+++ b/src/lib/binutils/Makefile.am
@@ -106,8 +106,7 @@ MYSOURCES = \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp \
-	intel/IntelGPUbinutils.cpp \
-	intel/CreateCFG.cpp
+	intel/IntelGPUbinutils.cpp
 
 
 #############################################################################
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index 96fe555963..ff79979420 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -149,8 +149,7 @@ am__objects_1 = libHPCbinutils_la-LM.lo libHPCbinutils_la-Seg.lo \
 	libHPCbinutils_la-Fatbin.lo libHPCbinutils_la-ElfHelper.lo \
 	libHPCbinutils_la-InputFile.lo \
 	libHPCbinutils_la-RelocateCubin.lo \
-	intel/libHPCbinutils_la-IntelGPUbinutils.lo \
-	intel/libHPCbinutils_la-CreateCFG.lo
+	intel/libHPCbinutils_la-IntelGPUbinutils.lo
 am_libHPCbinutils_la_OBJECTS = $(am__objects_1)
 libHPCbinutils_la_OBJECTS = $(am_libHPCbinutils_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
@@ -566,8 +565,7 @@ MYSOURCES = \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp \
-	intel/IntelGPUbinutils.cpp \
-	intel/CreateCFG.cpp
+	intel/IntelGPUbinutils.cpp
 
 
 #############################################################################
@@ -656,8 +654,6 @@ intel/$(DEPDIR)/$(am__dirstamp):
 	@: > intel/$(DEPDIR)/$(am__dirstamp)
 intel/libHPCbinutils_la-IntelGPUbinutils.lo: intel/$(am__dirstamp) \
 	intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbinutils_la-CreateCFG.lo: intel/$(am__dirstamp) \
-	intel/$(DEPDIR)/$(am__dirstamp)
 
 libHPCbinutils.la: $(libHPCbinutils_la_OBJECTS) $(libHPCbinutils_la_DEPENDENCIES) $(EXTRA_libHPCbinutils_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libHPCbinutils_la_LINK)  $(libHPCbinutils_la_OBJECTS) $(libHPCbinutils_la_LIBADD) $(LIBS)
@@ -685,7 +681,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-SimpleSymbols.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-SimpleSymbolsFactories.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-VMAInterval.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbinutils_la-CreateCFG.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo@am__quote@
 
 .cpp.o:
@@ -824,13 +819,6 @@ intel/libHPCbinutils_la-IntelGPUbinutils.lo: intel/IntelGPUbinutils.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
 
-intel/libHPCbinutils_la-CreateCFG.lo: intel/CreateCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbinutils_la-CreateCFG.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbinutils_la-CreateCFG.Tpo -c -o intel/libHPCbinutils_la-CreateCFG.lo `test -f 'intel/CreateCFG.cpp' || echo '$(srcdir)/'`intel/CreateCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbinutils_la-CreateCFG.Tpo intel/$(DEPDIR)/libHPCbinutils_la-CreateCFG.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/CreateCFG.cpp' object='intel/libHPCbinutils_la-CreateCFG.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbinutils_la-CreateCFG.lo `test -f 'intel/CreateCFG.cpp' || echo '$(srcdir)/'`intel/CreateCFG.cpp
-
 mostlyclean-libtool:
 	-rm -f *.lo
 
diff --git a/src/lib/binutils/intel/CreateCFG.cpp b/src/lib/binutils/intel/CreateCFG.cpp
deleted file mode 100644
index c4f6ffb97d..0000000000
--- a/src/lib/binutils/intel/CreateCFG.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <iga/kv.hpp>
-#include "igc_binary_decoder.h"
-
-#include <iostream>
-#include <stack>
-#include <algorithm>
-#include <vector>
-#include <sstream>
-#include <set>
-#include <map> 
-
-
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include "CreateCFG.hpp"
-
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-#define MAX_STR_SIZE 1024
-
-std::vector<int32_t> block_offsets;
-std::map<int32_t, bool> visitedBlockOffsets;
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static std::set<Edge>
-get_cfg_edges
-(
-	std::vector<uint8_t> binary,
-	size_t binary_size
-)
-{
-	KernelView kv(IGA_GEN9, binary.data(), binary.size(),
-			iga::SWSB_ENCODE_MODE::SingleDistPipe);
-	std::set<Edge> cfg_edges;
-
-	int32_t offset = 0;
-	int32_t size;
-	while (offset < binary_size) {
-		int32_t prev_block_start_offset;
-		int32_t prev_block_end_offset;
-		int32_t block_start_offset;
-		bool isStartOfBasicBlock = kv.isInstTarget(offset);
-		if (isStartOfBasicBlock) {
-			block_offsets.push_back(offset);
-			visitedBlockOffsets.insert({offset, false});
-			block_start_offset = offset;	
-		}
-		size = kv.getInstSize(offset);
-		while (!kv.isInstTarget(offset + size) && (offset + size < binary_size)) {
-			offset += size;	
-			size = kv.getInstSize(offset);
-			if (size == 0) {
-				// this is a weird edge case, what to do?
-				break;
-			}
-		}
-
-		int32_t *jump_targets = new int32_t[KV_MAX_TARGETS_PER_INSTRUCTION];
-		size_t jump_targets_count = kv.getInstTargets(offset, jump_targets);
-		int32_t next_block_start_offset = offset + size;
-		bool isFallThroughEdgeAdded = false;
-
-		for (size_t i = 0; i < jump_targets_count; i++) {
-			if (jump_targets[i] == next_block_start_offset) {
-				isFallThroughEdgeAdded = true;
-			} else if (jump_targets[i] == block_start_offset) {
-				if (block_offsets.size() >= 2) {
-					int32_t from = block_offsets[block_offsets.size() - 2];
-					int32_t from_blockEndOffset;
-					for (Edge edge: cfg_edges) {
-						if (edge.from == from && edge.to == block_start_offset) {
-							from_blockEndOffset	 = edge.from_blockEndOffset;
-						}
-					}	
-					cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 2], block_start_offset, from_blockEndOffset));
-				}
-			}
-			cfg_edges.insert(Edge(block_start_offset, jump_targets[i], next_block_start_offset - size));
-		}
-		if(!isFallThroughEdgeAdded) {
-			cfg_edges.insert(Edge(block_start_offset, next_block_start_offset, next_block_start_offset - size));
-		}
-		prev_block_start_offset = block_start_offset;
-		prev_block_end_offset = offset; 
-		offset += size;
-	}
-	cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 1], binary_size, binary_size - size));
-	return cfg_edges;
-}
-
-
-static void
-printCFGEdges
-(
-	std::set<Edge> cfg_edges
-)
-{
-	for (Edge edge: cfg_edges) {
-		std::cout << edge.from << "->" << edge.to << std::endl;	
-	}	
-}
-
-
-static void
-printBasicBlocks
-(
-	std::vector<uint8_t> binary,
-	std::set<Edge> cfg_edges
-)
-{
-	KernelView kv(IGA_GEN9, binary.data(), binary.size(), iga::SWSB_ENCODE_MODE::SingleDistPipe);
-	int32_t offset;
-	char text[MAX_STR_SIZE] = { 0 };
-	size_t length;
-	int32_t size;
-
-	for (Edge edge: cfg_edges) {
-		offset = edge.from;
-		if(edge.from == edge.to) {
-			// skip self-loops
-			continue;
-		}
-		auto it = visitedBlockOffsets.find(offset);
-		if (it->second) {
-			continue;
-		} else {
-			it->second = true;
-		}
-		std::cout << offset << " [ label=\"\\\n"; 
-		while (offset < edge.to) {
-			size = kv.getInstSize(offset);
-			length = kv.getInstSyntax(offset, text, MAX_STR_SIZE);
-			assert(length > 0);
-			std::cout << offset << ": " << text << "\\\l";
-			offset += size;
-		}
-		std::cout << "\" shape=\"box\"]; \n" << std::endl;
-	}	
-}
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-// pass Intel kernel's raw gen binary
-// kernel's text region is a raw gen binary
-// you  can find kernel nested in [debug section of GPU binary/separate debug section dump]
-void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary) {
-	std::cout << "digraph GEMM_iga {" << std::endl;
-	std::set<Edge> edges = get_cfg_edges(intelRawGenBinary, intelRawGenBinary.size());
-	printBasicBlocks(intelRawGenBinary, edges);
-	printCFGEdges(edges);
-	std::cout << "}" << std::endl;
-}
diff --git a/src/lib/binutils/intel/CreateCFG.hpp b/src/lib/binutils/intel/CreateCFG.hpp
deleted file mode 100644
index 4c478a672b..0000000000
--- a/src/lib/binutils/intel/CreateCFG.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#ifndef BINUTILS_INTEL_CREATE_CFG
-#define BINUTILS_INTEL_CREATE_CFG
-
-#include <vector>
-
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-class Edge {
-	public:
-		int32_t from;	
-		int32_t to;
-		int32_t from_blockEndOffset;
-
-		Edge(int32_t f, int32_t t, int32_t from_b) {
-			from = f;
-			to = t;
-			from_blockEndOffset = from_b;
-		}
-
-		bool operator == (const Edge &that) const 
-		{
-			return((this->from == that.from) && (this->to == that.to));
-		}
-		
-		bool operator<(const Edge& that) const 
-		{
-			if (this->from == that.from) {
-				return (this->to < that.to);
-			} else {
-				return (this->from < that.from);
-			}
-		}
-};
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary);
-
-#endif
diff --git a/src/lib/binutils/intel/IntelGPUbinutils.cpp b/src/lib/binutils/intel/IntelGPUbinutils.cpp
index d947475661..fb5b6673f8 100644
--- a/src/lib/binutils/intel/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUbinutils.cpp
@@ -70,8 +70,6 @@
 #include <lib/support/diagnostics.h>
 #include <lib/support/RealPathMgr.cpp>
 #include "IntelGPUbinutils.hpp"
-#include "CreateCFG.hpp"
-
 
 
 //******************************************************************************
@@ -166,7 +164,7 @@ openclElfSectionType
 static bool
 extract_kernelelfs
 (
-	std::vector<uint8_t> symbols,
+	std::vector<uint8_t> &symbols,
 	ElfFileVector *filevector
 )
 {
@@ -178,6 +176,7 @@ extract_kernelelfs
 
 	if (header->NumberOfKernels == 0) {
 		extractSuccess = false;
+    return extractSuccess;
 	}
 	
 	for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
@@ -186,9 +185,7 @@ extract_kernelelfs
 		ptr += sizeof(SKernelDebugDataHeaderIGC);
 
 		const char* kernel_name = reinterpret_cast<const char*>(ptr);
-		char *file_name = (char*)malloc(sizeof(kernel_name));
-		strcpy(file_name, kernel_name);
-		strcat(file_name, ".gpubin");
+    std::string file_name = std::string(kernel_name) + ".gpubin";
 
 		unsigned kernel_name_size_aligned = sizeof(uint32_t) *
 			(1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
@@ -202,16 +199,21 @@ extract_kernelelfs
 			std::string file_contents((std::istreambuf_iterator<char>(in)), 
 			    std::istreambuf_iterator<char>());
 
-
-			ElfFile *elfFile = new ElfFile;
-			int file_fd = open(file_name, O_RDONLY);
-			size_t f_size = file_size(file_fd);
-			char  *file_buffer = (char *) malloc(f_size);
-			size_t bytes = read_all(file_fd, file_buffer, f_size);
-			bool result = elfFile->open(file_buffer, f_size, file_name);
-
-			filevector->push_back(elfFile);
+      ElfFile *elfFile = new ElfFile;
+      int file_fd = open(file_name.c_str(), O_RDONLY);
+      size_t f_size = file_size(file_fd);
+      char *file_buffer = (char *)malloc(f_size);
+      size_t bytes = read_all(file_fd, file_buffer, f_size);
+
+      if (elfFile->open(file_buffer, f_size, file_name)) {
+        extractSuccess = true;
+        filevector->push_back(elfFile);
+      } else {
+        extractSuccess = false;
+        break;
+      }
 			
+      /*
 			// start cfg generation
 			Elf *elf = elfFile->getElf();
 			file_buffer = elfFile->getMemory();
@@ -230,15 +232,18 @@ extract_kernelelfs
 					if (strcmp(section_name, ".text") == 0) {
 						std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
 								reinterpret_cast<uint8_t*>(sectionData) + kernel_header->SizeVisaDbgInBytes);
-						printCFGInDotGraph(intelRawGenBinary);
+						printCFGInDotGraph(kernel_name, intelRawGenBinary);
 					}
 				}
 			}
+      */
 			//end cfg generation
 		} else {
 			extractSuccess = false;
+      break;
 		}
 	}
+
 	return extractSuccess;
 }
 
@@ -252,8 +257,6 @@ isCustomOpenCLBinary
   return (strcmp(section_name, ".SHT_OPENCL_DEV_DEBUG") == 0);
 }
 
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -299,7 +302,8 @@ findIntelGPUbins
 			}*/
     }
   }
-	FILE *fptr;
+  // TODO(Aaron): why put this section here?
+  FILE *fptr;
 	if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
 		fileHasDebugSection = true;
 		fseek(fptr, 0L, SEEK_END);

From 4bf977e19616a504a86e458579c3ba906add276e Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 17 Sep 2020 14:46:54 +0000
Subject: [PATCH 027/177] Change tab to space

---
 src/lib/banal/intel/ReadIntelCFG.cpp        | 102 ++++-----
 src/lib/banal/intel/ReadIntelCFG.hpp        |  26 +--
 src/lib/binutils/intel/IntelGPUbinutils.cpp | 224 ++++++++++----------
 3 files changed, 176 insertions(+), 176 deletions(-)

diff --git a/src/lib/banal/intel/ReadIntelCFG.cpp b/src/lib/banal/intel/ReadIntelCFG.cpp
index e5914597ad..786bcfb6ce 100644
--- a/src/lib/banal/intel/ReadIntelCFG.cpp
+++ b/src/lib/banal/intel/ReadIntelCFG.cpp
@@ -98,30 +98,30 @@ addCustomFunctionObject
  Symtab *symtab
 )
 {
-	Region *reg = NULL;
-	bool status = symtab->findRegion(reg, ".text");
-	assert(status == true);
-
-	unsigned long reg_size = reg->getMemSize();
-	Symbol *custom_symbol = new Symbol(
-			func_obj_name, 
-			SymtabAPI::Symbol::ST_FUNCTION, // SymbolType
-			Symbol::SL_LOCAL, //SymbolLinkage
-			SymtabAPI::Symbol::SV_DEFAULT, //SymbolVisibility
-			0, //Offset,
-			NULL, //Module *module 
-			reg, //Region *r
-			reg_size, //unsigned s
-			false, //bool d
-			false, //bool a
-			-1, //int index
-			-1, //int strindex
-			false //bool cs
-	);
-	
-	//adding the custom symbol into the symtab object
-	status = symtab->addSymbol(custom_symbol); //(Symbol *newsym)
-	assert(status == true);
+  Region *reg = NULL;
+  bool status = symtab->findRegion(reg, ".text");
+  assert(status == true);
+
+  unsigned long reg_size = reg->getMemSize();
+  Symbol *custom_symbol = new Symbol(
+      func_obj_name, 
+      SymtabAPI::Symbol::ST_FUNCTION, // SymbolType
+      Symbol::SL_LOCAL, //SymbolLinkage
+      SymtabAPI::Symbol::SV_DEFAULT, //SymbolVisibility
+      0, //Offset,
+      NULL, //Module *module 
+      reg, //Region *r
+      reg_size, //unsigned s
+      false, //bool d
+      false, //bool a
+      -1, //int index
+      -1, //int strindex
+      false //bool cs
+  );
+  
+  //adding the custom symbol into the symtab object
+  status = symtab->addSymbol(custom_symbol); //(Symbol *newsym)
+  assert(status == true);
 }
 
 
@@ -132,11 +132,11 @@ getFileNameFromAbsolutePath(const std::string &str) {
   std::stringstream str_stream(str); 
   std::string intermediate; 
 
-	// Tokenizing w.r.t. '/'
-	while(std::getline(str_stream, intermediate, '/')) { 
-		tokens.push_back(intermediate); 
-	} 
-	return tokens[tokens.size() - 1];
+  // Tokenizing w.r.t. '/'
+  while(std::getline(str_stream, intermediate, '/')) { 
+    tokens.push_back(intermediate); 
+  } 
+  return tokens[tokens.size() - 1];
 }
 
 
@@ -148,34 +148,34 @@ parseIntelCFG
  CudaParse::Function &function
 )
 {
-	KernelView kv(IGA_GEN9, text_section, text_section_size, iga::SWSB_ENCODE_MODE::SingleDistPipe);
+  KernelView kv(IGA_GEN9, text_section, text_section_size, iga::SWSB_ENCODE_MODE::SingleDistPipe);
   std::map<int, CudaParse::Block *> block_offset_map;
 
-	int offset = 0;
-	int size = 0;
+  int offset = 0;
+  int size = 0;
   int block_id = 0;
 
   // Construct basic blocks
-	while (offset < text_section_size) {
+  while (offset < text_section_size) {
     auto *block = new CudaParse::Block(block_id, function.name + "_" + std::to_string(block_id)); 
     function.blocks.push_back(block);
     block_offset_map[offset] = block;
 
-		size = kv.getInstSize(offset);
+    size = kv.getInstSize(offset);
     auto *inst = new CudaParse::Inst(offset, size);
     block->insts.push_back(inst);
 
-		while (!kv.isInstTarget(offset + size) && (offset + size < text_section_size)) {
-			offset += size;	
-			size = kv.getInstSize(offset);
-			if (size == 0) {
-				// this is a weird edge case, what to do?
-				break;
-			}
+    while (!kv.isInstTarget(offset + size) && (offset + size < text_section_size)) {
+      offset += size;  
+      size = kv.getInstSize(offset);
+      if (size == 0) {
+        // this is a weird edge case, what to do?
+        break;
+      }
 
       inst = new CudaParse::Inst(offset, size);
       block->insts.push_back(inst);
-		}
+    }
 
     offset = block->insts.back()->offset;
     if (kv.getOpcode(offset) == iga::Op::CALL || kv.getOpcode(offset) == iga::Op::CALLA) {
@@ -190,27 +190,27 @@ parseIntelCFG
   for (size_t i = 0; i < function.blocks.size(); ++i) {
     auto *block = function.blocks[i];
     auto *inst = block->insts.back();
-		size_t jump_targets_count = kv.getInstTargets(inst->offset, jump_targets.data());
-		int next_block_start_offset = 0;
+    size_t jump_targets_count = kv.getInstTargets(inst->offset, jump_targets.data());
+    int next_block_start_offset = 0;
     if (i != function.blocks.size() - 1) {
       next_block_start_offset = function.blocks[i + 1]->insts.front()->offset;
     }
 
-		for (size_t i = 0; i < jump_targets_count; i++) {
+    for (size_t i = 0; i < jump_targets_count; i++) {
       auto *target_block = block_offset_map.at(jump_targets[i]);
-			if (jump_targets[i] == next_block_start_offset) {
+      if (jump_targets[i] == next_block_start_offset) {
         // Fall through
         if (inst->is_call) {
           block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::CALL_FT));
         } else {
           block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::FALLTHROUGH));
         }
-			} else {
+      } else {
         // Jump
         block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::DIRECT));
-			}
-		}
-	}
+      }
+    }
+  }
 }
 
 
@@ -230,7 +230,7 @@ readIntelCFG
   addCustomFunctionObject(function_name, the_symtab); //adds a dummy function object
 
   char *text_section = NULL;
-  auto text_section_size = elfFile->getTextSection(text_section);
+  auto text_section_size = elfFile->getTextSection(&text_section);
   if (text_section_size == 0) {
     return false;
   }
diff --git a/src/lib/banal/intel/ReadIntelCFG.hpp b/src/lib/banal/intel/ReadIntelCFG.hpp
index 5ed98fd6dd..3730ec98a2 100644
--- a/src/lib/banal/intel/ReadIntelCFG.hpp
+++ b/src/lib/banal/intel/ReadIntelCFG.hpp
@@ -62,19 +62,19 @@
 //*****************************************************************************
 
 enum SHT_OPENCL : uint32_t {
-    SHT_OPENCL_SOURCE = 0xff000000,                  // CL source to link into LLVM binary
-    SHT_OPENCL_HEADER = 0xff000001,                  // CL header to link into LLVM binary
-    SHT_OPENCL_LLVM_TEXT = 0xff000002,               // LLVM text
-    SHT_OPENCL_LLVM_BINARY = 0xff000003,             // LLVM byte code
-    SHT_OPENCL_LLVM_ARCHIVE = 0xff000004,            // LLVM archives(s)
-    SHT_OPENCL_DEV_BINARY = 0xff000005,              // Device binary (coherent by default)
-    SHT_OPENCL_OPTIONS = 0xff000006,                 // CL Options
-    SHT_OPENCL_PCH = 0xff000007,                     // PCH (pre-compiled headers)
-    SHT_OPENCL_DEV_DEBUG = 0xff000008,               // Device debug
-    SHT_OPENCL_SPIRV = 0xff000009,                   // SPIRV
-    SHT_OPENCL_NON_COHERENT_DEV_BINARY = 0xff00000a, // Non-coherent Device binary
-    SHT_OPENCL_SPIRV_SC_IDS = 0xff00000b,            // Specialization Constants IDs
-    SHT_OPENCL_SPIRV_SC_VALUES = 0xff00000c          // Specialization Constants values
+  SHT_OPENCL_SOURCE = 0xff000000,                  // CL source to link into LLVM binary
+  SHT_OPENCL_HEADER = 0xff000001,                  // CL header to link into LLVM binary
+  SHT_OPENCL_LLVM_TEXT = 0xff000002,               // LLVM text
+  SHT_OPENCL_LLVM_BINARY = 0xff000003,             // LLVM byte code
+  SHT_OPENCL_LLVM_ARCHIVE = 0xff000004,            // LLVM archives(s)
+  SHT_OPENCL_DEV_BINARY = 0xff000005,              // Device binary (coherent by default)
+  SHT_OPENCL_OPTIONS = 0xff000006,                 // CL Options
+  SHT_OPENCL_PCH = 0xff000007,                     // PCH (pre-compiled headers)
+  SHT_OPENCL_DEV_DEBUG = 0xff000008,               // Device debug
+  SHT_OPENCL_SPIRV = 0xff000009,                   // SPIRV
+  SHT_OPENCL_NON_COHERENT_DEV_BINARY = 0xff00000a, // Non-coherent Device binary
+  SHT_OPENCL_SPIRV_SC_IDS = 0xff00000b,            // Specialization Constants IDs
+  SHT_OPENCL_SPIRV_SC_VALUES = 0xff00000c          // Specialization Constants values
 };
 
 //******************************************************************************
diff --git a/src/lib/binutils/intel/IntelGPUbinutils.cpp b/src/lib/binutils/intel/IntelGPUbinutils.cpp
index fb5b6673f8..0a32fae64b 100644
--- a/src/lib/binutils/intel/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUbinutils.cpp
@@ -125,79 +125,79 @@ read_all(int fd, void *buf, size_t count)
 static const char*
 openclElfSectionType
 (
-	Elf64_Word sh_type
+  Elf64_Word sh_type
 )
 {
-	switch (sh_type) {
+  switch (sh_type) {
     case SHT_OPENCL_SOURCE:
-			return "SHT_OPENCL_SOURCE";
+      return "SHT_OPENCL_SOURCE";
     case SHT_OPENCL_HEADER:
-			return "SHT_OPENCL_HEADER";
+      return "SHT_OPENCL_HEADER";
     case SHT_OPENCL_LLVM_TEXT:
-			return "SHT_OPENCL_LLVM_TEXT";
+      return "SHT_OPENCL_LLVM_TEXT";
     case SHT_OPENCL_LLVM_BINARY:
-			return "SHT_OPENCL_LLVM_BINARY";
+      return "SHT_OPENCL_LLVM_BINARY";
     case SHT_OPENCL_LLVM_ARCHIVE:
-			return "SHT_OPENCL_LLVM_ARCHIVE";
+      return "SHT_OPENCL_LLVM_ARCHIVE";
     case SHT_OPENCL_DEV_BINARY:
-			return "SHT_OPENCL_DEV_BINARY";
+      return "SHT_OPENCL_DEV_BINARY";
     case SHT_OPENCL_OPTIONS:
-			return "SHT_OPENCL_OPTIONS";
+      return "SHT_OPENCL_OPTIONS";
     case SHT_OPENCL_PCH:
-			return "SHT_OPENCL_PCH";
+      return "SHT_OPENCL_PCH";
     case SHT_OPENCL_DEV_DEBUG:
-			return "SHT_OPENCL_DEV_DEBUG";
+      return "SHT_OPENCL_DEV_DEBUG";
     case SHT_OPENCL_SPIRV:
-			return "SHT_OPENCL_SPIRV";
+      return "SHT_OPENCL_SPIRV";
     case SHT_OPENCL_NON_COHERENT_DEV_BINARY:
-			return "SHT_OPENCL_NON_COHERENT_DEV_BINARY";
+      return "SHT_OPENCL_NON_COHERENT_DEV_BINARY";
     case SHT_OPENCL_SPIRV_SC_IDS:
-			return "SHT_OPENCL_SPIRV_SC_IDS";
+      return "SHT_OPENCL_SPIRV_SC_IDS";
     case SHT_OPENCL_SPIRV_SC_VALUES:
-			return "SHT_OPENCL_SPIRV_SC_VALUES";
-		default:
-			return "unknown type";
-	}
+      return "SHT_OPENCL_SPIRV_SC_VALUES";
+    default:
+      return "unknown type";
+  }
 }
 
 
 static bool
 extract_kernelelfs
 (
-	std::vector<uint8_t> &symbols,
-	ElfFileVector *filevector
+  std::vector<uint8_t> &symbols,
+  ElfFileVector *filevector
 )
 {
-	bool extractSuccess = true;
-	const uint8_t* ptr = symbols.data();
-	const SProgramDebugDataHeaderIGC* header =
-		reinterpret_cast<const SProgramDebugDataHeaderIGC*>(ptr);
-	ptr += sizeof(SProgramDebugDataHeaderIGC);
-
-	if (header->NumberOfKernels == 0) {
-		extractSuccess = false;
+  bool extractSuccess = true;
+  const uint8_t* ptr = symbols.data();
+  const SProgramDebugDataHeaderIGC* header =
+    reinterpret_cast<const SProgramDebugDataHeaderIGC*>(ptr);
+  ptr += sizeof(SProgramDebugDataHeaderIGC);
+
+  if (header->NumberOfKernels == 0) {
+    extractSuccess = false;
     return extractSuccess;
-	}
-	
-	for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
-		const SKernelDebugDataHeaderIGC* kernel_header =
-			reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
-		ptr += sizeof(SKernelDebugDataHeaderIGC);
-
-		const char* kernel_name = reinterpret_cast<const char*>(ptr);
+  }
+  
+  for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
+    const SKernelDebugDataHeaderIGC* kernel_header =
+      reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
+    ptr += sizeof(SKernelDebugDataHeaderIGC);
+
+    const char* kernel_name = reinterpret_cast<const char*>(ptr);
     std::string file_name = std::string(kernel_name) + ".gpubin";
 
-		unsigned kernel_name_size_aligned = sizeof(uint32_t) *
-			(1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
-		ptr += kernel_name_size_aligned;
+    unsigned kernel_name_size_aligned = sizeof(uint32_t) *
+      (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
+    ptr += kernel_name_size_aligned;
 
-		if (kernel_header->SizeVisaDbgInBytes > 0) {
-			/*FILE *f_ptr = fopen(kernel_name, "wb");
-			fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, f_ptr);
-			fclose(f_ptr);*/
-			std::ifstream in(kernel_name);
-			std::string file_contents((std::istreambuf_iterator<char>(in)), 
-			    std::istreambuf_iterator<char>());
+    if (kernel_header->SizeVisaDbgInBytes > 0) {
+      /*FILE *f_ptr = fopen(kernel_name, "wb");
+      fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, f_ptr);
+      fclose(f_ptr);*/
+      std::ifstream in(kernel_name);
+      std::string file_contents((std::istreambuf_iterator<char>(in)), 
+          std::istreambuf_iterator<char>());
 
       ElfFile *elfFile = new ElfFile;
       int file_fd = open(file_name.c_str(), O_RDONLY);
@@ -212,46 +212,46 @@ extract_kernelelfs
         extractSuccess = false;
         break;
       }
-			
+      
       /*
-			// start cfg generation
-			Elf *elf = elfFile->getElf();
-			file_buffer = elfFile->getMemory();
-			ElfSectionVector *sections = elfGetSectionVector(elf);
-			GElf_Ehdr ehdr_v;
-			GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
-
-			if (ehdr) {
-				for (auto si = sections->begin(); si != sections->end(); si++) {
-					Elf_Scn *scn = *si;
-					GElf_Shdr shdr_v;
-					GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
-					if (!shdr) continue;
-					char *sectionData = elfSectionGetData(file_buffer, shdr);
-					const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-					if (strcmp(section_name, ".text") == 0) {
-						std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
-								reinterpret_cast<uint8_t*>(sectionData) + kernel_header->SizeVisaDbgInBytes);
-						printCFGInDotGraph(kernel_name, intelRawGenBinary);
-					}
-				}
-			}
+      // start cfg generation
+      Elf *elf = elfFile->getElf();
+      file_buffer = elfFile->getMemory();
+      ElfSectionVector *sections = elfGetSectionVector(elf);
+      GElf_Ehdr ehdr_v;
+      GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
+
+      if (ehdr) {
+        for (auto si = sections->begin(); si != sections->end(); si++) {
+          Elf_Scn *scn = *si;
+          GElf_Shdr shdr_v;
+          GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
+          if (!shdr) continue;
+          char *sectionData = elfSectionGetData(file_buffer, shdr);
+          const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+          if (strcmp(section_name, ".text") == 0) {
+            std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
+                reinterpret_cast<uint8_t*>(sectionData) + kernel_header->SizeVisaDbgInBytes);
+            printCFGInDotGraph(kernel_name, intelRawGenBinary);
+          }
+        }
+      }
       */
-			//end cfg generation
-		} else {
-			extractSuccess = false;
+      //end cfg generation
+    } else {
+      extractSuccess = false;
       break;
-		}
-	}
+    }
+  }
 
-	return extractSuccess;
+  return extractSuccess;
 }
 
 
 static bool
 isCustomOpenCLBinary
 (
-	const char *section_name
+  const char *section_name
 )
 {
   return (strcmp(section_name, ".SHT_OPENCL_DEV_DEBUG") == 0);
@@ -264,56 +264,56 @@ isCustomOpenCLBinary
 bool
 findIntelGPUbins
 (
-	ElfFile *elfFile,
-	ElfFileVector *filevector
+  ElfFile *elfFile,
+  ElfFileVector *filevector
 )
 {
-	bool fileHasDebugSection = false;
-	bool extractSuccess = false;
+  bool fileHasDebugSection = false;
+  bool extractSuccess = false;
 
   Elf *elf = elfFile->getElf();
-	char *file_buffer = elfFile->getMemory();
+  char *file_buffer = elfFile->getMemory();
   ElfSectionVector *sections = elfGetSectionVector(elf);
   GElf_Ehdr ehdr_v;
   GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
 
   if (ehdr) {
     for (auto si = sections->begin(); si != sections->end(); si++) {
-			Elf_Scn *scn = *si;
-			GElf_Shdr shdr_v;
-			GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
-			if (!shdr) continue;
-			char *sectionData = elfSectionGetData(file_buffer, shdr);
-			const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-			//std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
-
-			// extract debug section
-			if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) == 0)
-					|| isCustomOpenCLBinary(section_name)) {
-				fileHasDebugSection = true;
-				std::vector<uint8_t> debug_info(reinterpret_cast<uint8_t*>(sectionData), reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
-				extractSuccess = extract_kernelelfs(debug_info, filevector);
-				break;
-			} /*else if (strcmp(section_name, ".text") == 0) {
-				FILE *bin_ptr;
-				bin_ptr = fopen("switch.text", "wb");
-				fwrite(sectionData, shdr->sh_size, 1, bin_ptr);
-				fclose(bin_ptr);
-			}*/
+      Elf_Scn *scn = *si;
+      GElf_Shdr shdr_v;
+      GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
+      if (!shdr) continue;
+      char *sectionData = elfSectionGetData(file_buffer, shdr);
+      const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+      //std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
+
+      // extract debug section
+      if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) == 0)
+          || isCustomOpenCLBinary(section_name)) {
+        fileHasDebugSection = true;
+        std::vector<uint8_t> debug_info(reinterpret_cast<uint8_t*>(sectionData), reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
+        extractSuccess = extract_kernelelfs(debug_info, filevector);
+        break;
+      } /*else if (strcmp(section_name, ".text") == 0) {
+        FILE *bin_ptr;
+        bin_ptr = fopen("switch.text", "wb");
+        fwrite(sectionData, shdr->sh_size, 1, bin_ptr);
+        fclose(bin_ptr);
+      }*/
     }
   }
   // TODO(Aaron): why put this section here?
   FILE *fptr;
-	if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
-		fileHasDebugSection = true;
-		fseek(fptr, 0L, SEEK_END);
-		size_t debug_info_size = ftell(fptr);
-		printf("debug_info_size: %zu\n", debug_info_size);
-		rewind(fptr);
-		std::vector<uint8_t> debug_info(debug_info_size);
-		fread(debug_info.data(), debug_info_size, 1, fptr);
-		extractSuccess = extract_kernelelfs(debug_info, filevector);
-	}
+  if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
+    fileHasDebugSection = true;
+    fseek(fptr, 0L, SEEK_END);
+    size_t debug_info_size = ftell(fptr);
+    printf("debug_info_size: %zu\n", debug_info_size);
+    rewind(fptr);
+    std::vector<uint8_t> debug_info(debug_info_size);
+    fread(debug_info.data(), debug_info_size, 1, fptr);
+    extractSuccess = extract_kernelelfs(debug_info, filevector);
+  }
   bool success = fileHasDebugSection && extractSuccess;
   return success; 
 }

From 302a20cfecfacf4669a05c89f84a55617846afcb Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 17 Sep 2020 16:15:15 +0000
Subject: [PATCH 028/177] Works but needs correctness check

---
 src/lib/banal/Struct.cpp                |  2 --
 src/lib/banal/cuda/DotCFG.hpp           |  4 ++-
 src/lib/banal/intel/IntelCFGFactory.cpp | 14 ++++----
 src/lib/banal/intel/ReadIntelCFG.cpp    | 46 +++++++++++++++++--------
 src/lib/binutils/ElfHelper.cpp          |  4 +--
 src/lib/binutils/ElfHelper.hpp          |  2 +-
 src/lib/binutils/InputFile.cpp          | 32 ++++++++---------
 7 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 210a31d52e..34f782878d 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -698,9 +698,7 @@ makeStructure(string filename,
 
     makeWorkList(fileMap, wlPrint, wlLaunch);
 		
-		std::cerr << elfFile->getArch() << std::endl;
 		char *elfFileRealPath = realpath(elfFile->getFileName().c_str(), NULL);
-		std::cerr << elfFileRealPath << std::endl;
     Output::printLoadModuleBegin(outFile, elfFileRealPath);
 
 #pragma omp parallel  default(none)				\
diff --git a/src/lib/banal/cuda/DotCFG.hpp b/src/lib/banal/cuda/DotCFG.hpp
index d18a0e2dc2..288229b7f7 100644
--- a/src/lib/banal/cuda/DotCFG.hpp
+++ b/src/lib/banal/cuda/DotCFG.hpp
@@ -150,7 +150,9 @@ struct Block {
   size_t id;
   std::string name;
 
-  Block(size_t id, const std::string &name) : id(id), name(name) {}
+  Block(size_t id, int address, const std::string &name) : id(id), address(address), name(name) {}
+
+  Block(size_t id, const std::string &name) : Block(id, 0, name) {}
 
   bool operator<(const Block &other) const {
     if (this->insts.size() == 0) {
diff --git a/src/lib/banal/intel/IntelCFGFactory.cpp b/src/lib/banal/intel/IntelCFGFactory.cpp
index 21459620bf..092ddc99d1 100644
--- a/src/lib/banal/intel/IntelCFGFactory.cpp
+++ b/src/lib/banal/intel/IntelCFGFactory.cpp
@@ -2,7 +2,7 @@
 #include "IntelFunction.hpp"
 #include <iostream>
 
-#define DEBUG_CUDA_CFGFACTORY 0
+#define DEBUG_INTEL_CFGFACTORY 0
 
 namespace Dyninst {
 namespace ParseAPI {
@@ -16,7 +16,7 @@ Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
       IntelFunction *ret_func = new IntelFunction(function->address, name, obj, region, isrc);
 
       bool first_entry = true;
-      if (DEBUG_CUDA_CFGFACTORY) {
+      if (DEBUG_INTEL_CFGFACTORY) {
         std::cout << "Function: " << function->name << " addr: 0x" <<
           std::hex << addr << std::dec << std::endl;
       }
@@ -25,7 +25,7 @@ Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
         // If a block has not been created by callers, create it
         // Otherwise get the block from _block_filter
         if (_block_filter.find(block->id) == _block_filter.end()) {
-          if (DEBUG_CUDA_CFGFACTORY) {
+          if (DEBUG_INTEL_CFGFACTORY) {
             std::cout << "New block: " << block->name << " id: " << block->id << std::endl;
           }
           std::vector<Offset> inst_offsets;
@@ -36,7 +36,7 @@ Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
           _block_filter[block->id] = ret_block;
           blocks_.add(ret_block);
         } else {
-          if (DEBUG_CUDA_CFGFACTORY) {
+          if (DEBUG_INTEL_CFGFACTORY) {
             std::cout << "Old block: " << block->name << " id: " << block->id << std::endl;
           }
           ret_block = _block_filter[block->id];
@@ -52,7 +52,7 @@ Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
         for (auto *target : block->targets) {
           IntelBlock *ret_target_block = NULL;
           if (_block_filter.find(target->block->id) == _block_filter.end()) {
-            if (DEBUG_CUDA_CFGFACTORY) {
+            if (DEBUG_INTEL_CFGFACTORY) {
               std::cout << "New block: " << target->block->name << " id: " << target->block->id << std::endl;
             }
             std::vector<Offset> inst_offsets;
@@ -63,7 +63,7 @@ Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
             _block_filter[target->block->id] = ret_target_block;
             blocks_.add(ret_target_block);
           } else {
-            if (DEBUG_CUDA_CFGFACTORY) {
+            if (DEBUG_INTEL_CFGFACTORY) {
               std::cout << "Old block: " << target->block->name << " id: " << target->block->id << std::endl;
             }
             ret_target_block = _block_filter[target->block->id];
@@ -71,7 +71,7 @@ Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src,
 
           Edge *ret_edge = new Edge(ret_block, ret_target_block, target->type);
           ret_edge->ignore_index();
-          if (DEBUG_CUDA_CFGFACTORY) {
+          if (DEBUG_INTEL_CFGFACTORY) {
             std::cout << "Edge: "<< " -> " << target->block->name << std::endl;
           }
           ret_edge->install();
diff --git a/src/lib/banal/intel/ReadIntelCFG.cpp b/src/lib/banal/intel/ReadIntelCFG.cpp
index 786bcfb6ce..44ce6006bd 100644
--- a/src/lib/banal/intel/ReadIntelCFG.cpp
+++ b/src/lib/banal/intel/ReadIntelCFG.cpp
@@ -157,7 +157,9 @@ parseIntelCFG
 
   // Construct basic blocks
   while (offset < text_section_size) {
-    auto *block = new CudaParse::Block(block_id, function.name + "_" + std::to_string(block_id)); 
+    auto *block = new CudaParse::Block(block_id, offset, function.name + "_" + std::to_string(block_id)); 
+    block_id++;
+
     function.blocks.push_back(block);
     block_offset_map[offset] = block;
 
@@ -177,38 +179,52 @@ parseIntelCFG
       block->insts.push_back(inst);
     }
 
-    offset = block->insts.back()->offset;
     if (kv.getOpcode(offset) == iga::Op::CALL || kv.getOpcode(offset) == iga::Op::CALLA) {
       inst->is_call = true;
     } else {
       inst->is_jump = true;
     }
+    offset += size;
   }
 
   // Construct targets
-  std::array<int, KV_MAX_TARGETS_PER_INSTRUCTION> jump_targets;
+  std::array<int, KV_MAX_TARGETS_PER_INSTRUCTION + 1> jump_targets;
   for (size_t i = 0; i < function.blocks.size(); ++i) {
     auto *block = function.blocks[i];
     auto *inst = block->insts.back();
     size_t jump_targets_count = kv.getInstTargets(inst->offset, jump_targets.data());
+    
+    // Add fall through edge
     int next_block_start_offset = 0;
     if (i != function.blocks.size() - 1) {
       next_block_start_offset = function.blocks[i + 1]->insts.front()->offset;
     }
-
-    for (size_t i = 0; i < jump_targets_count; i++) {
-      auto *target_block = block_offset_map.at(jump_targets[i]);
-      if (jump_targets[i] == next_block_start_offset) {
-        // Fall through
-        if (inst->is_call) {
-          block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::CALL_FT));
-        } else {
-          block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::FALLTHROUGH));
+    jump_targets[jump_targets_count] = next_block_start_offset;
+
+    for (size_t j = 0; j < jump_targets_count + 1; j++) {
+      auto *target_block = block_offset_map.at(jump_targets[j]);
+      auto type = CudaParse::TargetType::DIRECT;
+      // Jump
+      bool added = false;
+      for (auto *target : block->targets) {
+        if (target->block == target_block) {
+          added = true;
         }
-      } else {
-        // Jump
-        block->targets.push_back(new CudaParse::Target(inst, target_block, CudaParse::TargetType::DIRECT));
       }
+      if (!added) {
+        block->targets.push_back(new CudaParse::Target(inst, target_block, type));
+      }
+    }
+  }
+
+  if (DEBUG) {
+    for (auto *block : function.blocks) {
+      std::cout << std::hex;
+      std::cout << block->name << ": [" << block->insts.front()->offset << ", " << block->insts.back()->offset << "]" << std::endl;
+      for (auto *target : block->targets) {
+        std::cout << "\t" << block->name << "->" << target->block->name << std::endl;
+      }
+      std::cout << std::dec;
     }
   }
 }
diff --git a/src/lib/binutils/ElfHelper.cpp b/src/lib/binutils/ElfHelper.cpp
index b067d20f02..b8ca5bfbe1 100644
--- a/src/lib/binutils/ElfHelper.cpp
+++ b/src/lib/binutils/ElfHelper.cpp
@@ -180,7 +180,7 @@ elfSectionGetData
 size_t
 ElfFile::getTextSection
 (
- char *text_section
+ char **text_section
 )
 {
   // start cfg generation
@@ -198,7 +198,7 @@ ElfFile::getTextSection
       const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
       if (strcmp(section_name, ".text") == 0) {
         // TODO(Aaron): can a intel GPU binary has two text sections?
-        text_section = sectionData;
+        *text_section = sectionData;
         return shdr->sh_size;
       }
     }
diff --git a/src/lib/binutils/ElfHelper.hpp b/src/lib/binutils/ElfHelper.hpp
index f3ff77d60a..337e295d07 100644
--- a/src/lib/binutils/ElfHelper.hpp
+++ b/src/lib/binutils/ElfHelper.hpp
@@ -95,7 +95,7 @@ class ElfFile {
   char *getMemoryOriginal() { return origPtr; }
   size_t getLength() { return memLen; }
   std::string getFileName() { return fileName; }
-  size_t getTextSection(char *text_section);
+  size_t getTextSection(char **text_section);
 private:
   int arch;
   char *origPtr;
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index 9a25e84662..ac03b799c8 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -122,20 +122,20 @@ read_all(int fd, void *buf, size_t count)
 static bool
 isIntelGPUFile
 (
-	ElfFile *elfFile
+  ElfFile *elfFile
 )
 {
   Elf *elf = elfFile->getElf();
   GElf_Ehdr ehdr_v;
   GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
 
-	int intelGPUType = 0xff04;
-	std::cerr << "ehdr->e_type: " << ehdr->e_type << std::endl;
-	std::cerr << "ehdr->e_type == intelGPUType: " << (ehdr->e_type == intelGPUType) << std::endl;
+  int intelGPUType = 0xff04;
+  std::cerr << "ehdr->e_type: " << ehdr->e_type << std::endl;
+  std::cerr << "ehdr->e_type == intelGPUType: " << (ehdr->e_type == intelGPUType) << std::endl;
   if (ehdr && ehdr->e_type == intelGPUType) {
-		return true;
-	}
-	return false;
+    return true;
+  }
+  return false;
 }
 
 
@@ -160,7 +160,7 @@ InputFile::openFile
 
   if (file_fd < 0) {
     DIAG_MsgIf_GENERIC(tag, 1, "Unable to open input file: " 
-		       << filename << " (" << strerror(errno) << ")");
+           << filename << " (" << strerror(errno) << ")");
 
     if (errType != InputFileError_WarningNothrow) throw 1;
 
@@ -181,7 +181,7 @@ InputFile::openFile
 
   if (file_buffer == 0) {
     DIAG_MsgIf_GENERIC(tag, 1, "Unable to allocate file buffer of " 
-		       << f_size << " bytes");
+           << f_size << " bytes");
     if (errType != InputFileError_WarningNothrow) throw 1;
 
     return false;
@@ -191,7 +191,7 @@ InputFile::openFile
 
   if (f_size != bytes) {
     DIAG_MsgIf_GENERIC(tag, 1, "Read only " << bytes << " bytes of "
-		       << f_size << " bytes from file " << filename);
+           << f_size << " bytes from file " << filename);
 
     if (errType != InputFileError_WarningNothrow) throw 1;
 
@@ -205,12 +205,12 @@ InputFile::openFile
 
   if (result) {
     filevector = new ElfFileVector;
-		if (isIntelGPUFile(elfFile)) {
-			findIntelGPUbins(elfFile, filevector);
-		}
-		else {
-			filevector->push_back(elfFile);
-		}
+    if (isIntelGPUFile(elfFile)) {
+      findIntelGPUbins(elfFile, filevector);
+    }
+    else {
+      filevector->push_back(elfFile);
+    }
     //findCubins(elfFile, filevector);
   } else {
     DIAG_MsgIf_GENERIC(tag, 1, "Not an ELF binary " << filename);

From b90d4b1af0dfd22d105b7b43b3a556dc9c0b0a53 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Thu, 17 Sep 2020 15:01:18 -0500
Subject: [PATCH 029/177] include gpu-trace.h in opencl.c

---
 src/tool/hpcrun/sample-sources/opencl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index f7e9ca0468..581afa112f 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -51,6 +51,7 @@
 
 #include <hpcrun/device-finalizers.h>
 #include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-trace.h>
 #include <hpcrun/gpu/opencl/opencl-api.h>
 #include <hpcrun/thread_data.h>
 

From e255da4aab3ca677bf5ace4feb44a52f6a67c1fa Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 17 Sep 2020 22:28:50 +0000
Subject: [PATCH 030/177] Fix bugs

---
 src/lib/banal/Struct.cpp             | 9 +++++----
 src/lib/banal/intel/ReadIntelCFG.cpp | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 34f782878d..bd773125a4 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -656,6 +656,7 @@ makeStructure(string filename,
 		if (isIntelArch && cfgNotPresent) {
 			//std::cerr << "executing intel-gen9 specific code." << std::endl;
       // TODO(Aaron): does instruction size change with different generations?
+      inst_size.clear();
       intel_arch = 1;
       parsable = readIntelCFG(search_path, elfFile, the_symtab, inst_size,
         structOpts.compute_gpu_cfg, &code_src, &code_obj);
@@ -1825,15 +1826,15 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
     string filenm = "";
     uint line = 0;
 
-    if (cuda_arch == 0) {
+    if (intel_arch > 0) {
+      len = inst_size.at(vma);
+    } else if (cuda_arch == 0) {
 #ifdef DYNINST_INSTRUCTION_PTR
       len = iit->second->size();
 #else
       len = iit->second.size();
 #endif
-    } else if (intel_arch) {
-      len = inst_size.at(vma);
-    }
+    } 
 
     lmcache.getLineInfo(vma, filenm, line);
 
diff --git a/src/lib/banal/intel/ReadIntelCFG.cpp b/src/lib/banal/intel/ReadIntelCFG.cpp
index 44ce6006bd..c817751485 100644
--- a/src/lib/banal/intel/ReadIntelCFG.cpp
+++ b/src/lib/banal/intel/ReadIntelCFG.cpp
@@ -203,6 +203,7 @@ parseIntelCFG
 
     for (size_t j = 0; j < jump_targets_count + 1; j++) {
       auto *target_block = block_offset_map.at(jump_targets[j]);
+      // TODO(Aaron): call edge
       auto type = CudaParse::TargetType::DIRECT;
       // Jump
       bool added = false;

From 66ac87e46c4ed6e3a3e43af40f74d6503d9109f9 Mon Sep 17 00:00:00 2001
From: Keren Zhou <robinho364@gmail.com>
Date: Fri, 18 Sep 2020 14:09:59 -0500
Subject: [PATCH 031/177] Update dwarf.h

---
 src/lib/binutils/intel/dwarf.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/lib/binutils/intel/dwarf.h b/src/lib/binutils/intel/dwarf.h
index e033c284de..4f10645d11 100644
--- a/src/lib/binutils/intel/dwarf.h
+++ b/src/lib/binutils/intel/dwarf.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_DWARF_H_
@@ -38,4 +54,4 @@ struct Dwarf32Header {
 };
 #pragma pack(pop)
 
-#endif // PTI_SAMPLES_UTILS_DWARF_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_DWARF_H_

From 540e5036d23c063af461b1b0e5c3973e93518a43 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sun, 20 Sep 2020 04:47:00 +0000
Subject: [PATCH 032/177] Add Intel copyright for all pti samples

---
 src/lib/binutils/intel/dwarf_parser.h        | 20 ++++++++++++++++--
 src/lib/binutils/intel/dwarf_state_machine.h | 21 +++++++++++++++++--
 src/lib/binutils/intel/elf.h                 | 20 ++++++++++++++++--
 src/lib/binutils/intel/elf_parser.h          | 20 ++++++++++++++++--
 src/lib/binutils/intel/gen_binary_decoder.h  | 18 +++++++++++++++-
 src/lib/binutils/intel/gen_symbols_decoder.h | 18 +++++++++++++++-
 src/lib/binutils/intel/igc_binary_decoder.h  | 20 ++++++++++++++++--
 src/lib/binutils/intel/leb128.h              | 20 ++++++++++++++++--
 src/lib/binutils/intel/metric_device.h       | 20 ++++++++++++++++--
 src/lib/binutils/intel/metric_utils.h        | 21 +++++++++++++++++--
 src/lib/binutils/intel/shared_library.h      | 20 ++++++++++++++++--
 src/lib/binutils/intel/ze_tracer.h           | 22 +++++++++++++++++---
 src/lib/binutils/intel/ze_utils.h            | 20 ++++++++++++++++--
 13 files changed, 235 insertions(+), 25 deletions(-)

diff --git a/src/lib/binutils/intel/dwarf_parser.h b/src/lib/binutils/intel/dwarf_parser.h
index 08e37fbf86..071d2d92ed 100644
--- a/src/lib/binutils/intel/dwarf_parser.h
+++ b/src/lib/binutils/intel/dwarf_parser.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_DWARF_PARSER_H_
@@ -141,4 +157,4 @@ class DwarfParser {
   uint32_t size_;
 };
 
-#endif // PTI_SAMPLES_UTILS_DWARF_PARSER_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_DWARF_PARSER_H_
diff --git a/src/lib/binutils/intel/dwarf_state_machine.h b/src/lib/binutils/intel/dwarf_state_machine.h
index 644ee6bd10..bb7a69016d 100644
--- a/src/lib/binutils/intel/dwarf_state_machine.h
+++ b/src/lib/binutils/intel/dwarf_state_machine.h
@@ -1,9 +1,26 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
+
 #ifndef PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
 #define PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
 
@@ -207,4 +224,4 @@ class DwarfStateMachine {
   DwarfLineInfo line_info_;
 };
 
-#endif // PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_DWARF_STATE_MACHINE_H_
diff --git a/src/lib/binutils/intel/elf.h b/src/lib/binutils/intel/elf.h
index 7e2ce6c4f6..8104004598 100644
--- a/src/lib/binutils/intel/elf.h
+++ b/src/lib/binutils/intel/elf.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_ELF_H_
@@ -42,4 +58,4 @@ struct Elf64SectionHeader {
   uint64_t entsize;
 };
 
-#endif // PTI_SAMPLES_UTILS_ELF_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_ELF_H_
diff --git a/src/lib/binutils/intel/elf_parser.h b/src/lib/binutils/intel/elf_parser.h
index 28326941b4..7c55a6d080 100644
--- a/src/lib/binutils/intel/elf_parser.h
+++ b/src/lib/binutils/intel/elf_parser.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_ELF_PARSER_H_
@@ -126,4 +142,4 @@ class ElfParser {
   uint32_t size_ = 0;
 };
 
-#endif // PTI_SAMPLES_UTILS_ELF_PARSER_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_ELF_PARSER_H_
diff --git a/src/lib/binutils/intel/gen_binary_decoder.h b/src/lib/binutils/intel/gen_binary_decoder.h
index 4f84d1952c..349c63857b 100644
--- a/src/lib/binutils/intel/gen_binary_decoder.h
+++ b/src/lib/binutils/intel/gen_binary_decoder.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_
diff --git a/src/lib/binutils/intel/gen_symbols_decoder.h b/src/lib/binutils/intel/gen_symbols_decoder.h
index c270a8236d..601d3a2ae5 100644
--- a/src/lib/binutils/intel/gen_symbols_decoder.h
+++ b/src/lib/binutils/intel/gen_symbols_decoder.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_GEN_SYMBOLS_DECODER_H_
diff --git a/src/lib/binutils/intel/igc_binary_decoder.h b/src/lib/binutils/intel/igc_binary_decoder.h
index aa292f8c63..77f00cad0d 100644
--- a/src/lib/binutils/intel/igc_binary_decoder.h
+++ b/src/lib/binutils/intel/igc_binary_decoder.h
@@ -1,7 +1,23 @@
 //==============================================================
-// Copyright © 2020 Intel Corporation
+// Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_IGC_BINARY_DECODER_H_
diff --git a/src/lib/binutils/intel/leb128.h b/src/lib/binutils/intel/leb128.h
index a5ddb20599..f769add4f9 100644
--- a/src/lib/binutils/intel/leb128.h
+++ b/src/lib/binutils/intel/leb128.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_LEB128_H_
@@ -71,4 +87,4 @@ inline const uint8_t* Decode32(const uint8_t* ptr, int32_t& value,
 } // namespace leb128
 } // namespace utils
 
-#endif // PTI_SAMPLES_UTILS_LEB128_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_LEB128_H_
diff --git a/src/lib/binutils/intel/metric_device.h b/src/lib/binutils/intel/metric_device.h
index 48d5b7f104..0e1a55ac95 100644
--- a/src/lib/binutils/intel/metric_device.h
+++ b/src/lib/binutils/intel/metric_device.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_METRIC_DEVICE_H_
@@ -66,4 +82,4 @@ class MetricDevice {
   SharedLibrary* lib_ = nullptr;
 };
 
-#endif // PTI_SAMPLES_UTILS_METRIC_DEVICE_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_METRIC_DEVICE_H_
diff --git a/src/lib/binutils/intel/metric_utils.h b/src/lib/binutils/intel/metric_utils.h
index 8a6a44692c..2b83a7ca78 100644
--- a/src/lib/binutils/intel/metric_utils.h
+++ b/src/lib/binutils/intel/metric_utils.h
@@ -1,9 +1,26 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
+
 #ifndef PTI_SAMPLES_UTILS_METRIC_UTILS_H_
 #define PTI_SAMPLES_UTILS_METRIC_UTILS_H_
 
@@ -92,4 +109,4 @@ inline std::vector<std::string> GetMDLibraryPossiblePaths() {
 } // namespace metrics
 } // namespace utils
 
-#endif // PTI_SAMPLES_UTILS_METRIC_UTILS_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_METRIC_UTILS_H_
diff --git a/src/lib/binutils/intel/shared_library.h b/src/lib/binutils/intel/shared_library.h
index 6c75cb4e92..d22f1b9c44 100644
--- a/src/lib/binutils/intel/shared_library.h
+++ b/src/lib/binutils/intel/shared_library.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_SHARED_LIBRARY_H_
@@ -69,4 +85,4 @@ class SharedLibrary {
 #endif
 };
 
-#endif // PTI_SAMPLES_UTILS_SHARED_LIBRARY_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_SHARED_LIBRARY_H_
diff --git a/src/lib/binutils/intel/ze_tracer.h b/src/lib/binutils/intel/ze_tracer.h
index a63941d447..df1bc954b4 100644
--- a/src/lib/binutils/intel/ze_tracer.h
+++ b/src/lib/binutils/intel/ze_tracer.h
@@ -1,7 +1,23 @@
 //==============================================================
-// Copyright © 2020 Intel Corporation
+// Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_ZE_TRACER_H_
@@ -101,4 +117,4 @@ class ZeTracer {
   ze_tracing::global_data_t data_;
 };
 
-#endif // PTI_SAMPLES_UTILS_ZE_TRACER_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_ZE_TRACER_H_
diff --git a/src/lib/binutils/intel/ze_utils.h b/src/lib/binutils/intel/ze_utils.h
index 67e5c288eb..0b853295c1 100644
--- a/src/lib/binutils/intel/ze_utils.h
+++ b/src/lib/binutils/intel/ze_utils.h
@@ -1,7 +1,23 @@
 //==============================================================
 // Copyright © 2019 Intel Corporation
 //
-// SPDX-License-Identifier: MIT
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 // =============================================================
 
 #ifndef PTI_SAMPLES_UTILS_ZE_UTILS_H_
@@ -140,4 +156,4 @@ static zet_metric_group_handle_t FindMetricGroup(
 } // namespace ze
 } // namespace utils
 
-#endif // PTI_SAMPLES_UTILS_ZE_UTILS_H_
\ No newline at end of file
+#endif // PTI_SAMPLES_UTILS_ZE_UTILS_H_

From f291d22aaf4fc0cabafab09e70fae1053c4036c7 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Mon, 21 Sep 2020 04:18:20 +0000
Subject: [PATCH 033/177] Merge cfg analysis for intel and cuda

---
 src/lib/banal/Makefile.am                     |  20 +-
 src/lib/banal/Makefile.in                     | 245 +++++++-----------
 src/lib/banal/Struct.cpp                      |   4 +-
 src/lib/banal/cuda/CudaBlock.hpp              |  26 --
 src/lib/banal/cuda/CudaCFGFactory.hpp         |  32 ---
 .../CFGParser.cpp => gpu/CudaCFGParser.cpp}   |  26 +-
 .../CFGParser.hpp => gpu/CudaCFGParser.hpp}   |   8 +-
 src/lib/banal/{cuda => gpu}/DotCFG.hpp        |  25 +-
 .../{cuda/CudaBlock.cpp => gpu/GPUBlock.cpp}  |   8 +-
 src/lib/banal/gpu/GPUBlock.hpp                |  26 ++
 .../GPUCFGFactory.cpp}                        |  30 +--
 src/lib/banal/gpu/GPUCFGFactory.hpp           |  32 +++
 .../GPUCodeSource.cpp}                        |   6 +-
 .../GPUCodeSource.hpp}                        |  10 +-
 .../CudaFunction.cpp => gpu/GPUFunction.cpp}  |   4 +-
 .../CudaFunction.hpp => gpu/GPUFunction.hpp}  |  10 +-
 src/lib/banal/{cuda => gpu}/Graph.hpp         |   6 +-
 src/lib/banal/{cuda => gpu}/GraphReader.cpp   |   2 +-
 src/lib/banal/{cuda => gpu}/GraphReader.hpp   |   6 +-
 .../ReadCubinCFG.cpp => gpu/ReadCudaCFG.cpp}  |  42 +--
 .../ReadCubinCFG.hpp => gpu/ReadCudaCFG.hpp}  |   7 +-
 src/lib/banal/{intel => gpu}/ReadIntelCFG.cpp |  32 +--
 src/lib/banal/{intel => gpu}/ReadIntelCFG.hpp |   5 +-
 src/lib/banal/intel/IntelBlock.cpp            |  34 ---
 src/lib/banal/intel/IntelBlock.hpp            |  26 --
 src/lib/banal/intel/IntelCFGFactory.cpp       |  92 -------
 src/lib/banal/intel/IntelCFGFactory.hpp       |  32 ---
 src/lib/banal/intel/IntelCodeSource.cpp       |  15 --
 src/lib/banal/intel/IntelCodeSource.hpp       |  55 ----
 src/lib/banal/intel/IntelFunction.cpp         |  12 -
 src/lib/banal/intel/IntelFunction.hpp         |  25 --
 src/lib/binutils/InputFile.cpp                |   7 +-
 src/lib/binutils/Makefile.am                  |   2 +-
 src/lib/binutils/Makefile.in                  |  18 +-
 ...elGPUbinutils.cpp => IntelGPUBinutils.cpp} | 136 ++++------
 ...elGPUbinutils.hpp => IntelGPUBinutils.hpp} |   2 +-
 36 files changed, 343 insertions(+), 725 deletions(-)
 delete mode 100644 src/lib/banal/cuda/CudaBlock.hpp
 delete mode 100644 src/lib/banal/cuda/CudaCFGFactory.hpp
 rename src/lib/banal/{cuda/CFGParser.cpp => gpu/CudaCFGParser.cpp} (95%)
 rename src/lib/banal/{cuda/CFGParser.hpp => gpu/CudaCFGParser.hpp} (91%)
 rename src/lib/banal/{cuda => gpu}/DotCFG.hpp (93%)
 rename src/lib/banal/{cuda/CudaBlock.cpp => gpu/GPUBlock.cpp} (76%)
 create mode 100644 src/lib/banal/gpu/GPUBlock.hpp
 rename src/lib/banal/{cuda/CudaCFGFactory.cpp => gpu/GPUCFGFactory.cpp} (77%)
 create mode 100644 src/lib/banal/gpu/GPUCFGFactory.hpp
 rename src/lib/banal/{cuda/CudaCodeSource.cpp => gpu/GPUCodeSource.cpp} (57%)
 rename src/lib/banal/{cuda/CudaCodeSource.hpp => gpu/GPUCodeSource.hpp} (89%)
 rename src/lib/banal/{cuda/CudaFunction.cpp => gpu/GPUFunction.cpp} (57%)
 rename src/lib/banal/{cuda/CudaFunction.hpp => gpu/GPUFunction.hpp} (53%)
 rename src/lib/banal/{cuda => gpu}/Graph.hpp (92%)
 rename src/lib/banal/{cuda => gpu}/GraphReader.cpp (99%)
 rename src/lib/banal/{cuda => gpu}/GraphReader.hpp (88%)
 rename src/lib/banal/{cuda/ReadCubinCFG.cpp => gpu/ReadCudaCFG.cpp} (88%)
 rename src/lib/banal/{cuda/ReadCubinCFG.hpp => gpu/ReadCudaCFG.hpp} (77%)
 rename src/lib/banal/{intel => gpu}/ReadIntelCFG.cpp (90%)
 rename src/lib/banal/{intel => gpu}/ReadIntelCFG.hpp (97%)
 delete mode 100644 src/lib/banal/intel/IntelBlock.cpp
 delete mode 100644 src/lib/banal/intel/IntelBlock.hpp
 delete mode 100644 src/lib/banal/intel/IntelCFGFactory.cpp
 delete mode 100644 src/lib/banal/intel/IntelCFGFactory.hpp
 delete mode 100644 src/lib/banal/intel/IntelCodeSource.cpp
 delete mode 100644 src/lib/banal/intel/IntelCodeSource.hpp
 delete mode 100644 src/lib/banal/intel/IntelFunction.cpp
 delete mode 100644 src/lib/banal/intel/IntelFunction.hpp
 rename src/lib/binutils/intel/{IntelGPUbinutils.cpp => IntelGPUBinutils.cpp} (64%)
 rename src/lib/binutils/intel/{IntelGPUbinutils.hpp => IntelGPUBinutils.hpp} (99%)

diff --git a/src/lib/banal/Makefile.am b/src/lib/banal/Makefile.am
index a9248b52c4..0357cec4fd 100644
--- a/src/lib/banal/Makefile.am
+++ b/src/lib/banal/Makefile.am
@@ -71,18 +71,14 @@ MD_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
 endif
 
 MYSOURCES = \
-	cuda/CFGParser.cpp  \
-	cuda/CudaCFGFactory.cpp  \
-	cuda/CudaFunction.cpp  \
-	cuda/GraphReader.cpp \
-	cuda/CudaBlock.cpp  \
-	cuda/CudaCodeSource.cpp  \
-	cuda/ReadCubinCFG.cpp \
-	intel/IntelCFGFactory.cpp  \
-	intel/IntelFunction.cpp  \
-	intel/IntelBlock.cpp  \
-	intel/IntelCodeSource.cpp  \
-	intel/ReadIntelCFG.cpp \
+	gpu/GPUCFGFactory.cpp  \
+	gpu/GPUFunction.cpp  \
+	gpu/GPUBlock.cpp  \
+	gpu/GPUCodeSource.cpp  \
+	gpu/GraphReader.cpp \
+	gpu/CudaCFGParser.cpp  \
+	gpu/ReadCudaCFG.cpp \
+	gpu/ReadIntelCFG.cpp \
 	Struct.cpp  \
 	Struct-Inline.cpp  \
 	Struct-Output.cpp
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index d1567b5143..d912bb53f0 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -136,18 +136,14 @@ LTLIBRARIES = $(noinst_LTLIBRARIES)
 am__DEPENDENCIES_1 =
 libHPCbanal_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
 am__dirstamp = $(am__leading_dot)dirstamp
-am__objects_1 = cuda/libHPCbanal_la-CFGParser.lo \
-	cuda/libHPCbanal_la-CudaCFGFactory.lo \
-	cuda/libHPCbanal_la-CudaFunction.lo \
-	cuda/libHPCbanal_la-GraphReader.lo \
-	cuda/libHPCbanal_la-CudaBlock.lo \
-	cuda/libHPCbanal_la-CudaCodeSource.lo \
-	cuda/libHPCbanal_la-ReadCubinCFG.lo \
-	intel/libHPCbanal_la-IntelCFGFactory.lo \
-	intel/libHPCbanal_la-IntelFunction.lo \
-	intel/libHPCbanal_la-IntelBlock.lo \
-	intel/libHPCbanal_la-IntelCodeSource.lo \
-	intel/libHPCbanal_la-ReadIntelCFG.lo libHPCbanal_la-Struct.lo \
+am__objects_1 = gpu/libHPCbanal_la-GPUCFGFactory.lo \
+	gpu/libHPCbanal_la-GPUFunction.lo \
+	gpu/libHPCbanal_la-GPUBlock.lo \
+	gpu/libHPCbanal_la-GPUCodeSource.lo \
+	gpu/libHPCbanal_la-GraphReader.lo \
+	gpu/libHPCbanal_la-CudaCFGParser.lo \
+	gpu/libHPCbanal_la-ReadCudaCFG.lo \
+	gpu/libHPCbanal_la-ReadIntelCFG.lo libHPCbanal_la-Struct.lo \
 	libHPCbanal_la-Struct-Inline.lo \
 	libHPCbanal_la-Struct-Output.lo
 am_libHPCbanal_la_OBJECTS = $(am__objects_1)
@@ -533,18 +529,14 @@ HPCLIB_SupportLean = $(top_builddir)/src/lib/support-lean/libHPCsupport-lean.la
 @OPT_ENABLE_IGC_TRUE@IGC_IFLAGS = @OPT_IGC_IFLAGS@
 @OPT_ENABLE_METRICS_DISCOVERY_TRUE@MD_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
 MYSOURCES = \
-	cuda/CFGParser.cpp  \
-	cuda/CudaCFGFactory.cpp  \
-	cuda/CudaFunction.cpp  \
-	cuda/GraphReader.cpp \
-	cuda/CudaBlock.cpp  \
-	cuda/CudaCodeSource.cpp  \
-	cuda/ReadCubinCFG.cpp \
-	intel/IntelCFGFactory.cpp  \
-	intel/IntelFunction.cpp  \
-	intel/IntelBlock.cpp  \
-	intel/IntelCodeSource.cpp  \
-	intel/ReadIntelCFG.cpp \
+	gpu/GPUCFGFactory.cpp  \
+	gpu/GPUFunction.cpp  \
+	gpu/GPUBlock.cpp  \
+	gpu/GPUCodeSource.cpp  \
+	gpu/GraphReader.cpp \
+	gpu/CudaCFGParser.cpp  \
+	gpu/ReadCudaCFG.cpp \
+	gpu/ReadIntelCFG.cpp \
 	Struct.cpp  \
 	Struct-Inline.cpp  \
 	Struct-Output.cpp
@@ -626,42 +618,28 @@ clean-noinstLTLIBRARIES:
 	  echo rm -f $${locs}; \
 	  rm -f $${locs}; \
 	}
-cuda/$(am__dirstamp):
-	@$(MKDIR_P) cuda
-	@: > cuda/$(am__dirstamp)
-cuda/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) cuda/$(DEPDIR)
-	@: > cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-CFGParser.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-CudaCFGFactory.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-CudaFunction.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-GraphReader.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-CudaBlock.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-CudaCodeSource.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-cuda/libHPCbanal_la-ReadCubinCFG.lo: cuda/$(am__dirstamp) \
-	cuda/$(DEPDIR)/$(am__dirstamp)
-intel/$(am__dirstamp):
-	@$(MKDIR_P) intel
-	@: > intel/$(am__dirstamp)
-intel/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) intel/$(DEPDIR)
-	@: > intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-IntelCFGFactory.lo: intel/$(am__dirstamp) \
-	intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-IntelFunction.lo: intel/$(am__dirstamp) \
-	intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-IntelBlock.lo: intel/$(am__dirstamp) \
-	intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-IntelCodeSource.lo: intel/$(am__dirstamp) \
-	intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbanal_la-ReadIntelCFG.lo: intel/$(am__dirstamp) \
-	intel/$(DEPDIR)/$(am__dirstamp)
+gpu/$(am__dirstamp):
+	@$(MKDIR_P) gpu
+	@: > gpu/$(am__dirstamp)
+gpu/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/$(DEPDIR)
+	@: > gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-GPUCFGFactory.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-GPUFunction.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-GPUBlock.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-GPUCodeSource.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-GraphReader.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-CudaCFGParser.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-ReadCudaCFG.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libHPCbanal_la-ReadIntelCFG.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
 
 libHPCbanal.la: $(libHPCbanal_la_OBJECTS) $(libHPCbanal_la_DEPENDENCIES) $(EXTRA_libHPCbanal_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libHPCbanal_la_LINK)  $(libHPCbanal_la_OBJECTS) $(libHPCbanal_la_LIBADD) $(LIBS)
@@ -671,10 +649,8 @@ libHPCbanal_simple.la: $(libHPCbanal_simple_la_OBJECTS) $(libHPCbanal_simple_la_
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
-	-rm -f cuda/*.$(OBJEXT)
-	-rm -f cuda/*.lo
-	-rm -f intel/*.$(OBJEXT)
-	-rm -f intel/*.lo
+	-rm -f gpu/*.$(OBJEXT)
+	-rm -f gpu/*.lo
 
 distclean-compile:
 	-rm -f *.tab.c
@@ -683,18 +659,14 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbanal_la-Struct-Output.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbanal_la-Struct.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbanal_simple_la-StructSimple.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CFGParser.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CudaBlock.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CudaCFGFactory.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CudaCodeSource.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-CudaFunction.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-GraphReader.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@cuda/$(DEPDIR)/libHPCbanal_la-ReadCubinCFG.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-CudaCFGParser.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-GPUBlock.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-GPUCFGFactory.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-GPUCodeSource.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-GPUFunction.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-GraphReader.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-ReadCudaCFG.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Plo@am__quote@
 
 .cpp.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
@@ -720,89 +692,61 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
-cuda/libHPCbanal_la-CFGParser.lo: cuda/CFGParser.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-CFGParser.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-CFGParser.Tpo -c -o cuda/libHPCbanal_la-CFGParser.lo `test -f 'cuda/CFGParser.cpp' || echo '$(srcdir)/'`cuda/CFGParser.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-CFGParser.Tpo cuda/$(DEPDIR)/libHPCbanal_la-CFGParser.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/CFGParser.cpp' object='cuda/libHPCbanal_la-CFGParser.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-GPUCFGFactory.lo: gpu/GPUCFGFactory.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-GPUCFGFactory.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-GPUCFGFactory.Tpo -c -o gpu/libHPCbanal_la-GPUCFGFactory.lo `test -f 'gpu/GPUCFGFactory.cpp' || echo '$(srcdir)/'`gpu/GPUCFGFactory.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-GPUCFGFactory.Tpo gpu/$(DEPDIR)/libHPCbanal_la-GPUCFGFactory.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/GPUCFGFactory.cpp' object='gpu/libHPCbanal_la-GPUCFGFactory.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-CFGParser.lo `test -f 'cuda/CFGParser.cpp' || echo '$(srcdir)/'`cuda/CFGParser.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-GPUCFGFactory.lo `test -f 'gpu/GPUCFGFactory.cpp' || echo '$(srcdir)/'`gpu/GPUCFGFactory.cpp
 
-cuda/libHPCbanal_la-CudaCFGFactory.lo: cuda/CudaCFGFactory.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-CudaCFGFactory.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-CudaCFGFactory.Tpo -c -o cuda/libHPCbanal_la-CudaCFGFactory.lo `test -f 'cuda/CudaCFGFactory.cpp' || echo '$(srcdir)/'`cuda/CudaCFGFactory.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-CudaCFGFactory.Tpo cuda/$(DEPDIR)/libHPCbanal_la-CudaCFGFactory.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/CudaCFGFactory.cpp' object='cuda/libHPCbanal_la-CudaCFGFactory.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-GPUFunction.lo: gpu/GPUFunction.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-GPUFunction.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-GPUFunction.Tpo -c -o gpu/libHPCbanal_la-GPUFunction.lo `test -f 'gpu/GPUFunction.cpp' || echo '$(srcdir)/'`gpu/GPUFunction.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-GPUFunction.Tpo gpu/$(DEPDIR)/libHPCbanal_la-GPUFunction.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/GPUFunction.cpp' object='gpu/libHPCbanal_la-GPUFunction.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-CudaCFGFactory.lo `test -f 'cuda/CudaCFGFactory.cpp' || echo '$(srcdir)/'`cuda/CudaCFGFactory.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-GPUFunction.lo `test -f 'gpu/GPUFunction.cpp' || echo '$(srcdir)/'`gpu/GPUFunction.cpp
 
-cuda/libHPCbanal_la-CudaFunction.lo: cuda/CudaFunction.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-CudaFunction.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-CudaFunction.Tpo -c -o cuda/libHPCbanal_la-CudaFunction.lo `test -f 'cuda/CudaFunction.cpp' || echo '$(srcdir)/'`cuda/CudaFunction.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-CudaFunction.Tpo cuda/$(DEPDIR)/libHPCbanal_la-CudaFunction.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/CudaFunction.cpp' object='cuda/libHPCbanal_la-CudaFunction.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-GPUBlock.lo: gpu/GPUBlock.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-GPUBlock.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-GPUBlock.Tpo -c -o gpu/libHPCbanal_la-GPUBlock.lo `test -f 'gpu/GPUBlock.cpp' || echo '$(srcdir)/'`gpu/GPUBlock.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-GPUBlock.Tpo gpu/$(DEPDIR)/libHPCbanal_la-GPUBlock.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/GPUBlock.cpp' object='gpu/libHPCbanal_la-GPUBlock.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-CudaFunction.lo `test -f 'cuda/CudaFunction.cpp' || echo '$(srcdir)/'`cuda/CudaFunction.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-GPUBlock.lo `test -f 'gpu/GPUBlock.cpp' || echo '$(srcdir)/'`gpu/GPUBlock.cpp
 
-cuda/libHPCbanal_la-GraphReader.lo: cuda/GraphReader.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-GraphReader.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-GraphReader.Tpo -c -o cuda/libHPCbanal_la-GraphReader.lo `test -f 'cuda/GraphReader.cpp' || echo '$(srcdir)/'`cuda/GraphReader.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-GraphReader.Tpo cuda/$(DEPDIR)/libHPCbanal_la-GraphReader.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/GraphReader.cpp' object='cuda/libHPCbanal_la-GraphReader.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-GPUCodeSource.lo: gpu/GPUCodeSource.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-GPUCodeSource.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-GPUCodeSource.Tpo -c -o gpu/libHPCbanal_la-GPUCodeSource.lo `test -f 'gpu/GPUCodeSource.cpp' || echo '$(srcdir)/'`gpu/GPUCodeSource.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-GPUCodeSource.Tpo gpu/$(DEPDIR)/libHPCbanal_la-GPUCodeSource.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/GPUCodeSource.cpp' object='gpu/libHPCbanal_la-GPUCodeSource.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-GraphReader.lo `test -f 'cuda/GraphReader.cpp' || echo '$(srcdir)/'`cuda/GraphReader.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-GPUCodeSource.lo `test -f 'gpu/GPUCodeSource.cpp' || echo '$(srcdir)/'`gpu/GPUCodeSource.cpp
 
-cuda/libHPCbanal_la-CudaBlock.lo: cuda/CudaBlock.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-CudaBlock.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-CudaBlock.Tpo -c -o cuda/libHPCbanal_la-CudaBlock.lo `test -f 'cuda/CudaBlock.cpp' || echo '$(srcdir)/'`cuda/CudaBlock.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-CudaBlock.Tpo cuda/$(DEPDIR)/libHPCbanal_la-CudaBlock.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/CudaBlock.cpp' object='cuda/libHPCbanal_la-CudaBlock.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-GraphReader.lo: gpu/GraphReader.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-GraphReader.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-GraphReader.Tpo -c -o gpu/libHPCbanal_la-GraphReader.lo `test -f 'gpu/GraphReader.cpp' || echo '$(srcdir)/'`gpu/GraphReader.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-GraphReader.Tpo gpu/$(DEPDIR)/libHPCbanal_la-GraphReader.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/GraphReader.cpp' object='gpu/libHPCbanal_la-GraphReader.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-CudaBlock.lo `test -f 'cuda/CudaBlock.cpp' || echo '$(srcdir)/'`cuda/CudaBlock.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-GraphReader.lo `test -f 'gpu/GraphReader.cpp' || echo '$(srcdir)/'`gpu/GraphReader.cpp
 
-cuda/libHPCbanal_la-CudaCodeSource.lo: cuda/CudaCodeSource.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-CudaCodeSource.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-CudaCodeSource.Tpo -c -o cuda/libHPCbanal_la-CudaCodeSource.lo `test -f 'cuda/CudaCodeSource.cpp' || echo '$(srcdir)/'`cuda/CudaCodeSource.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-CudaCodeSource.Tpo cuda/$(DEPDIR)/libHPCbanal_la-CudaCodeSource.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/CudaCodeSource.cpp' object='cuda/libHPCbanal_la-CudaCodeSource.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-CudaCFGParser.lo: gpu/CudaCFGParser.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-CudaCFGParser.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-CudaCFGParser.Tpo -c -o gpu/libHPCbanal_la-CudaCFGParser.lo `test -f 'gpu/CudaCFGParser.cpp' || echo '$(srcdir)/'`gpu/CudaCFGParser.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-CudaCFGParser.Tpo gpu/$(DEPDIR)/libHPCbanal_la-CudaCFGParser.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/CudaCFGParser.cpp' object='gpu/libHPCbanal_la-CudaCFGParser.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-CudaCodeSource.lo `test -f 'cuda/CudaCodeSource.cpp' || echo '$(srcdir)/'`cuda/CudaCodeSource.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-CudaCFGParser.lo `test -f 'gpu/CudaCFGParser.cpp' || echo '$(srcdir)/'`gpu/CudaCFGParser.cpp
 
-cuda/libHPCbanal_la-ReadCubinCFG.lo: cuda/ReadCubinCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT cuda/libHPCbanal_la-ReadCubinCFG.lo -MD -MP -MF cuda/$(DEPDIR)/libHPCbanal_la-ReadCubinCFG.Tpo -c -o cuda/libHPCbanal_la-ReadCubinCFG.lo `test -f 'cuda/ReadCubinCFG.cpp' || echo '$(srcdir)/'`cuda/ReadCubinCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) cuda/$(DEPDIR)/libHPCbanal_la-ReadCubinCFG.Tpo cuda/$(DEPDIR)/libHPCbanal_la-ReadCubinCFG.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cuda/ReadCubinCFG.cpp' object='cuda/libHPCbanal_la-ReadCubinCFG.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-ReadCudaCFG.lo: gpu/ReadCudaCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-ReadCudaCFG.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-ReadCudaCFG.Tpo -c -o gpu/libHPCbanal_la-ReadCudaCFG.lo `test -f 'gpu/ReadCudaCFG.cpp' || echo '$(srcdir)/'`gpu/ReadCudaCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-ReadCudaCFG.Tpo gpu/$(DEPDIR)/libHPCbanal_la-ReadCudaCFG.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/ReadCudaCFG.cpp' object='gpu/libHPCbanal_la-ReadCudaCFG.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o cuda/libHPCbanal_la-ReadCubinCFG.lo `test -f 'cuda/ReadCubinCFG.cpp' || echo '$(srcdir)/'`cuda/ReadCubinCFG.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-ReadCudaCFG.lo `test -f 'gpu/ReadCudaCFG.cpp' || echo '$(srcdir)/'`gpu/ReadCudaCFG.cpp
 
-intel/libHPCbanal_la-IntelCFGFactory.lo: intel/IntelCFGFactory.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelCFGFactory.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Tpo -c -o intel/libHPCbanal_la-IntelCFGFactory.lo `test -f 'intel/IntelCFGFactory.cpp' || echo '$(srcdir)/'`intel/IntelCFGFactory.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelCFGFactory.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelCFGFactory.cpp' object='intel/libHPCbanal_la-IntelCFGFactory.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/libHPCbanal_la-ReadIntelCFG.lo: gpu/ReadIntelCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT gpu/libHPCbanal_la-ReadIntelCFG.lo -MD -MP -MF gpu/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Tpo -c -o gpu/libHPCbanal_la-ReadIntelCFG.lo `test -f 'gpu/ReadIntelCFG.cpp' || echo '$(srcdir)/'`gpu/ReadIntelCFG.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Tpo gpu/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gpu/ReadIntelCFG.cpp' object='gpu/libHPCbanal_la-ReadIntelCFG.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelCFGFactory.lo `test -f 'intel/IntelCFGFactory.cpp' || echo '$(srcdir)/'`intel/IntelCFGFactory.cpp
-
-intel/libHPCbanal_la-IntelFunction.lo: intel/IntelFunction.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelFunction.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Tpo -c -o intel/libHPCbanal_la-IntelFunction.lo `test -f 'intel/IntelFunction.cpp' || echo '$(srcdir)/'`intel/IntelFunction.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelFunction.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelFunction.cpp' object='intel/libHPCbanal_la-IntelFunction.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelFunction.lo `test -f 'intel/IntelFunction.cpp' || echo '$(srcdir)/'`intel/IntelFunction.cpp
-
-intel/libHPCbanal_la-IntelBlock.lo: intel/IntelBlock.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelBlock.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Tpo -c -o intel/libHPCbanal_la-IntelBlock.lo `test -f 'intel/IntelBlock.cpp' || echo '$(srcdir)/'`intel/IntelBlock.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelBlock.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelBlock.cpp' object='intel/libHPCbanal_la-IntelBlock.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelBlock.lo `test -f 'intel/IntelBlock.cpp' || echo '$(srcdir)/'`intel/IntelBlock.cpp
-
-intel/libHPCbanal_la-IntelCodeSource.lo: intel/IntelCodeSource.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-IntelCodeSource.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Tpo -c -o intel/libHPCbanal_la-IntelCodeSource.lo `test -f 'intel/IntelCodeSource.cpp' || echo '$(srcdir)/'`intel/IntelCodeSource.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Tpo intel/$(DEPDIR)/libHPCbanal_la-IntelCodeSource.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelCodeSource.cpp' object='intel/libHPCbanal_la-IntelCodeSource.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-IntelCodeSource.lo `test -f 'intel/IntelCodeSource.cpp' || echo '$(srcdir)/'`intel/IntelCodeSource.cpp
-
-intel/libHPCbanal_la-ReadIntelCFG.lo: intel/ReadIntelCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbanal_la-ReadIntelCFG.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Tpo -c -o intel/libHPCbanal_la-ReadIntelCFG.lo `test -f 'intel/ReadIntelCFG.cpp' || echo '$(srcdir)/'`intel/ReadIntelCFG.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Tpo intel/$(DEPDIR)/libHPCbanal_la-ReadIntelCFG.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/ReadIntelCFG.cpp' object='intel/libHPCbanal_la-ReadIntelCFG.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbanal_la-ReadIntelCFG.lo `test -f 'intel/ReadIntelCFG.cpp' || echo '$(srcdir)/'`intel/ReadIntelCFG.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -c -o gpu/libHPCbanal_la-ReadIntelCFG.lo `test -f 'gpu/ReadIntelCFG.cpp' || echo '$(srcdir)/'`gpu/ReadIntelCFG.cpp
 
 libHPCbanal_la-Struct.lo: Struct.cpp
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbanal_la_CXXFLAGS) $(CXXFLAGS) -MT libHPCbanal_la-Struct.lo -MD -MP -MF $(DEPDIR)/libHPCbanal_la-Struct.Tpo -c -o libHPCbanal_la-Struct.lo `test -f 'Struct.cpp' || echo '$(srcdir)/'`Struct.cpp
@@ -837,8 +781,7 @@ mostlyclean-libtool:
 
 clean-libtool:
 	-rm -rf .libs _libs
-	-rm -rf cuda/.libs cuda/_libs
-	-rm -rf intel/.libs intel/_libs
+	-rm -rf gpu/.libs gpu/_libs
 
 ID: $(am__tagged_files)
 	$(am__define_uniq_tagged_files); mkid -fID $$unique
@@ -953,10 +896,8 @@ clean-generic:
 distclean-generic:
 	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
 	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-	-rm -f cuda/$(DEPDIR)/$(am__dirstamp)
-	-rm -f cuda/$(am__dirstamp)
-	-rm -f intel/$(DEPDIR)/$(am__dirstamp)
-	-rm -f intel/$(am__dirstamp)
+	-rm -f gpu/$(DEPDIR)/$(am__dirstamp)
+	-rm -f gpu/$(am__dirstamp)
 
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
@@ -967,7 +908,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
 	mostlyclean-am
 
 distclean: distclean-am
-	-rm -rf ./$(DEPDIR) cuda/$(DEPDIR) intel/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) gpu/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
@@ -1013,7 +954,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-am
-	-rm -rf ./$(DEPDIR) cuda/$(DEPDIR) intel/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) gpu/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index bd773125a4..ec44f93423 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -115,8 +115,8 @@
 #include "Struct-Output.hpp"
 #include "Struct-Skel.hpp"
 
-#include "cuda/ReadCubinCFG.hpp"
-#include "intel/ReadIntelCFG.hpp"
+#include "gpu/ReadCudaCFG.hpp"
+#include "gpu/ReadIntelCFG.hpp"
 
 #ifdef ENABLE_OPENMP
 #include <omp.h>
diff --git a/src/lib/banal/cuda/CudaBlock.hpp b/src/lib/banal/cuda/CudaBlock.hpp
deleted file mode 100644
index 0f27628a6e..0000000000
--- a/src/lib/banal/cuda/CudaBlock.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _CUDA_BLOCK_H_
-#define _CUDA_BLOCK_H_
-
-#include <CFG.h>
-
-namespace Dyninst {
-namespace ParseAPI {
-
-class PARSER_EXPORT CudaBlock : public Block {
- public:
-  CudaBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<Offset> &offsets);
-
-  virtual ~CudaBlock() {}
-
-  virtual void getInsns(Insns &insns) const;
-
-  virtual Address last() const;
-
- private:
-  std::vector<Offset> _inst_offsets;
-};
-
-}
-}
-
-#endif
diff --git a/src/lib/banal/cuda/CudaCFGFactory.hpp b/src/lib/banal/cuda/CudaCFGFactory.hpp
deleted file mode 100644
index be6586f28c..0000000000
--- a/src/lib/banal/cuda/CudaCFGFactory.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef _CUDA_CFG_FACTORY_H_
-#define _CUDA_CFG_FACTORY_H_
-
-#include <CFGFactory.h>
-#include <unordered_map>
-
-#include "CudaBlock.hpp"
-#include "DotCFG.hpp"
-
-namespace Dyninst {
-namespace ParseAPI {
-
-class PARSER_EXPORT CudaCFGFactory : public CFGFactory {   
- public:
-  CudaCFGFactory(std::vector<CudaParse::Function *> &functions) :
-    _functions(functions) {}
-  virtual ~CudaCFGFactory() {}
-
- protected:
-  virtual Function * mkfunc(Address addr, FuncSource src, 
-    std::string name, CodeObject * obj, CodeRegion * region,
-    Dyninst::InstructionSource * isrc);
-
- private:
-  std::vector<CudaParse::Function *> &_functions;
-  std::unordered_map<size_t, CudaBlock *> _block_filter; 
-};
-
-}
-}
-
-#endif
diff --git a/src/lib/banal/cuda/CFGParser.cpp b/src/lib/banal/gpu/CudaCFGParser.cpp
similarity index 95%
rename from src/lib/banal/cuda/CFGParser.cpp
rename to src/lib/banal/gpu/CudaCFGParser.cpp
index 0b2ec4853b..f20c9d6378 100644
--- a/src/lib/banal/cuda/CFGParser.cpp
+++ b/src/lib/banal/gpu/CudaCFGParser.cpp
@@ -1,11 +1,11 @@
-#include "CFGParser.hpp"
+#include "CudaCFGParser.hpp"
 #include <cctype>
 #include <iostream>
 
 #define DEBUG_CUDA_CFGPARSER 0
 
 
-namespace CudaParse {
+namespace GPUParse {
 
 static void debug_blocks(const std::vector<Block *> &blocks) {
   for (auto *block : blocks) {
@@ -22,7 +22,7 @@ static void debug_blocks(const std::vector<Block *> &blocks) {
 }
 
 
-TargetType CFGParser::get_target_type(const Inst *inst) {
+TargetType CudaCFGParser::get_target_type(const Inst *inst) {
   TargetType type;
   if (inst->predicate.find("@") != std::string::npos) {
     type = TargetType::COND_TAKEN;
@@ -35,7 +35,7 @@ TargetType CFGParser::get_target_type(const Inst *inst) {
 }
 
 
-TargetType CFGParser::get_fallthrough_type(const Inst *inst) {
+TargetType CudaCFGParser::get_fallthrough_type(const Inst *inst) {
   TargetType type;
   if (inst->is_call) {
     type = TargetType::CALL_FT;
@@ -46,7 +46,7 @@ TargetType CFGParser::get_fallthrough_type(const Inst *inst) {
 }
 
 
-void CFGParser::parse_inst_strings(
+void CudaCFGParser::parse_inst_strings(
   const std::string &label,
   std::deque<std::string> &inst_strings) {
   std::regex e("\\\\l([|]*)");
@@ -91,7 +91,7 @@ void CFGParser::parse_inst_strings(
 }
 
 
-void CFGParser::link_dangling_blocks(
+void CudaCFGParser::link_dangling_blocks(
   std::set<Block *> &dangling_blocks,
   std::vector<Function *> &functions) {
   for (auto *function : functions) {
@@ -142,7 +142,7 @@ void CFGParser::link_dangling_blocks(
 }
 
 
-void CFGParser::parse_calls(std::vector<Function *> &functions) {
+void CudaCFGParser::parse_calls(std::vector<Function *> &functions) {
   for (auto *function : functions) {
     for (auto *block : function->blocks) {
       for (auto *inst : block->insts) {
@@ -188,7 +188,7 @@ void CFGParser::parse_calls(std::vector<Function *> &functions) {
 }
 
 
-void CFGParser::find_block_parent(const std::vector<Block *> &blocks) {
+void CudaCFGParser::find_block_parent(const std::vector<Block *> &blocks) {
   bool incoming_nodes[blocks.size()];
   std::fill(incoming_nodes, incoming_nodes + blocks.size(), false);
   for (auto *block : blocks) {
@@ -210,7 +210,7 @@ void CFGParser::find_block_parent(const std::vector<Block *> &blocks) {
 }
 
 
-void CFGParser::unite_blocks(const Block *block, bool *visited, size_t parent) {
+void CudaCFGParser::unite_blocks(const Block *block, bool *visited, size_t parent) {
   for (auto *target : block->targets) {
     if (visited[target->block->id] == false) {
       visited[target->block->id] = true;
@@ -231,7 +231,7 @@ static bool compare_target_ptr(Target *l, Target *r) {
 }
 
 
-void CFGParser::parse(const Graph &graph, std::vector<Function *> &functions) {
+void CudaCFGParser::parse(const Graph &graph, std::vector<Function *> &functions) {
   std::unordered_map<size_t, Block *> block_id_map;
   std::unordered_map<std::string, Block *> block_name_map;
   std::vector<Block *> blocks;
@@ -243,7 +243,7 @@ void CFGParser::parse(const Graph &graph, std::vector<Function *> &functions) {
     std::deque<std::string> inst_strings;
     parse_inst_strings(vertex->label, inst_strings);
     for (auto &inst_string : inst_strings) {
-      block->insts.push_back(new Inst(inst_string));
+      block->insts.push_back(new CudaInst(inst_string));
     }
 
     blocks.push_back(block);
@@ -349,7 +349,7 @@ void CFGParser::parse(const Graph &graph, std::vector<Function *> &functions) {
 }
 
 
-void CFGParser::link_fallthrough_edges(
+void CudaCFGParser::link_fallthrough_edges(
   const Graph &graph,
   const std::vector<Block *> &blocks,
   std::unordered_map<size_t, Block *> &block_id_map) {
@@ -381,7 +381,7 @@ void CFGParser::link_fallthrough_edges(
 }
 
 
-void CFGParser::split_blocks(
+void CudaCFGParser::split_blocks(
   std::vector<Block *> &blocks,
   std::unordered_map<size_t, Block *> &block_id_map) {
   size_t extra_block_id = blocks.size();
diff --git a/src/lib/banal/cuda/CFGParser.hpp b/src/lib/banal/gpu/CudaCFGParser.hpp
similarity index 91%
rename from src/lib/banal/cuda/CFGParser.hpp
rename to src/lib/banal/gpu/CudaCFGParser.hpp
index ca2f0a3901..ade22c0c42 100644
--- a/src/lib/banal/cuda/CFGParser.hpp
+++ b/src/lib/banal/gpu/CudaCFGParser.hpp
@@ -8,17 +8,17 @@
 #include "DotCFG.hpp"
 #include "Graph.hpp"
 
-namespace CudaParse {
+namespace GPUParse {
 
-class CFGParser {
+class CudaCFGParser {
  public:
-  CFGParser() : _block_parent(0) {}
+  CudaCFGParser() : _block_parent(0) {}
 
   void parse(const Graph &graph, std::vector<Function *> &functions);
 
   void parse_calls(std::vector<Function *> &functions);
 
-  ~CFGParser() {}
+  ~CudaCFGParser() {}
 
  private:
   void parse_inst_strings(const std::string &label, std::deque<std::string> &inst_strings);
diff --git a/src/lib/banal/cuda/DotCFG.hpp b/src/lib/banal/gpu/DotCFG.hpp
similarity index 93%
rename from src/lib/banal/cuda/DotCFG.hpp
rename to src/lib/banal/gpu/DotCFG.hpp
index 288229b7f7..3fac50f5a9 100644
--- a/src/lib/banal/cuda/DotCFG.hpp
+++ b/src/lib/banal/gpu/DotCFG.hpp
@@ -1,5 +1,5 @@
-#ifndef _DOT_CFG_H_
-#define _DOT_CFG_H_
+#ifndef BANAL_GPU_DOT_CFG_H
+#define BANAL_GPU_DOT_CFG_H
 
 #include <algorithm>
 #include <iostream>
@@ -11,7 +11,7 @@
 // dyninst
 #include <CFG.h>
 
-namespace CudaParse {
+namespace GPUParse {
 
 struct Inst {
   int offset;
@@ -32,10 +32,17 @@ struct Inst {
     is_call(false), is_jump(false), is_sync(false) {}
 
   explicit Inst(int offset) : Inst(offset, 0) {}
+};
+
+
+struct CudaInst : public Inst {
+  // Constructor for dummy inst
+  CudaInst(int offset, int size) : Inst(offset, size) {}
+
+  explicit CudaInst(int offset) : Inst(offset) {}
 
   // Cuda instruction constructor
-  Inst(std::string &inst_str) : offset(0), dual_first(false), dual_second(false),
-    is_call(false), is_jump(false), is_sync(false) {
+  CudaInst(std::string &inst_str) : Inst(0, 0) {
     if (inst_str.find("{") != std::string::npos) {  // Dual first
       auto pos = inst_str.find("{");
       inst_str.replace(pos, 1, " ");
@@ -53,6 +60,7 @@ struct Inst {
     std::string s;
     if (std::getline(*iss, s, ':')) {
       if (s.find("<") != std::string::npos) {
+        // Port notation in dot graph to link basic blocks
         auto pos = s.find(">");
         this->port = s.substr(1, pos - 1);
         s = s.substr(pos + 1); 
@@ -92,12 +100,13 @@ struct Inst {
                 } else if (opcode.find("SYNC") != std::string::npos) {
                   // avoid Barrier Set Convergence Synchronization Point
                   //opcode.find("SSY") != std::string::npos ||
-                  //opcode.find("BSSY") != std::string::npos) {
+                  //opcode.find("BSSY") != std::string::npos)
                   // TODO(Keren): add more sync instructions
                   this->is_sync = true;
                 }
               }
             } else {
+              // Target
               operands.push_back(s);
               if (is_jump || is_sync) {
                 auto pos = s.find(".L_");
@@ -143,11 +152,11 @@ struct Target {
 
 
 struct Block {
-  int begin_offset;
+  size_t id;
   int address;
+  int begin_offset;
   std::vector<Inst *> insts;
   std::vector<Target *> targets;
-  size_t id;
   std::string name;
 
   Block(size_t id, int address, const std::string &name) : id(id), address(address), name(name) {}
diff --git a/src/lib/banal/cuda/CudaBlock.cpp b/src/lib/banal/gpu/GPUBlock.cpp
similarity index 76%
rename from src/lib/banal/cuda/CudaBlock.cpp
rename to src/lib/banal/gpu/GPUBlock.cpp
index 6074e06c60..3fe14be511 100644
--- a/src/lib/banal/cuda/CudaBlock.cpp
+++ b/src/lib/banal/gpu/GPUBlock.cpp
@@ -1,11 +1,11 @@
-#include "CudaBlock.hpp"
+#include "GPUBlock.hpp"
 #include <Instruction.h>
 
 
 namespace Dyninst {
 namespace ParseAPI {
 
-CudaBlock::CudaBlock(CodeObject * o, CodeRegion * r,
+GPUBlock::GPUBlock(CodeObject * o, CodeRegion * r,
   Address start, std::vector<Offset> &offsets) : Block(o, r, start) {
   for (auto offset : offsets) {
     _inst_offsets.push_back(offset);
@@ -13,12 +13,12 @@ CudaBlock::CudaBlock(CodeObject * o, CodeRegion * r,
 }
 
 
-Address CudaBlock::last() const {
+Address GPUBlock::last() const {
   return this->_inst_offsets.back();
 }
 
 
-void CudaBlock::getInsns(Insns &insns) const {
+void GPUBlock::getInsns(Insns &insns) const {
   for (auto offset : _inst_offsets) {
 #ifdef DYNINST_INSTRUCTION_PTR
     insns.insert(std::pair<long unsigned int, 
diff --git a/src/lib/banal/gpu/GPUBlock.hpp b/src/lib/banal/gpu/GPUBlock.hpp
new file mode 100644
index 0000000000..cbe60956a0
--- /dev/null
+++ b/src/lib/banal/gpu/GPUBlock.hpp
@@ -0,0 +1,26 @@
+#ifndef BANAL_GPU_GPU_BLOCK_H
+#define BANAL_GPU_GPU_BLOCK_H
+
+#include <CFG.h>
+
+namespace Dyninst {
+namespace ParseAPI {
+
+class PARSER_EXPORT GPUBlock : public Block {  // ParseAPI Block that carries an explicit list of instruction offsets
+ public:
+  GPUBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<Offset> &offsets);  // copies `offsets` into _inst_offsets
+
+  virtual ~GPUBlock() {}
+  // Inserts an entry into `insns` for each stored instruction offset.
+  virtual void getInsns(Insns &insns) const;
+  // Returns _inst_offsets.back(), i.e. the last stored offset; the list must not be empty.
+  virtual Address last() const;
+
+ private:
+  std::vector<Offset> _inst_offsets;  // offsets received by the constructor, kept in the same order
+};
+
+}
+}
+
+#endif
diff --git a/src/lib/banal/cuda/CudaCFGFactory.cpp b/src/lib/banal/gpu/GPUCFGFactory.cpp
similarity index 77%
rename from src/lib/banal/cuda/CudaCFGFactory.cpp
rename to src/lib/banal/gpu/GPUCFGFactory.cpp
index aa32bc19eb..114973fcd7 100644
--- a/src/lib/banal/cuda/CudaCFGFactory.cpp
+++ b/src/lib/banal/gpu/GPUCFGFactory.cpp
@@ -1,42 +1,42 @@
-#include "CudaCFGFactory.hpp"
-#include "CudaFunction.hpp"
+#include "GPUCFGFactory.hpp"
+#include "GPUFunction.hpp"
 #include <iostream>
 
-#define DEBUG_CUDA_CFGFACTORY 0
+#define DEBUG_GPU_CFGFACTORY 0
 
 namespace Dyninst {
 namespace ParseAPI {
 
-Function *CudaCFGFactory::mkfunc(Address addr, FuncSource src, 
+Function *GPUCFGFactory::mkfunc(Address addr, FuncSource src, 
   std::string name, CodeObject * obj, CodeRegion * region, 
   Dyninst::InstructionSource * isrc) {
   // Find function by name
   for (auto *function : _functions) {
     if (function->name == name) {
-      CudaFunction *ret_func = new CudaFunction(function->address, name, obj, region, isrc);
+      GPUFunction *ret_func = new GPUFunction(function->address, name, obj, region, isrc);
 
       bool first_entry = true;
-      if (DEBUG_CUDA_CFGFACTORY) {
+      if (DEBUG_GPU_CFGFACTORY) {
         std::cout << "Function: " << function->name << " addr: 0x" <<
           std::hex << addr << std::dec << std::endl;
       }
       for (auto *block : function->blocks) {
-        CudaBlock *ret_block = NULL;
+        GPUBlock *ret_block = NULL;
         // If a block has not been created by callers, create it
         // Otherwise get the block from _block_filter
         if (_block_filter.find(block->id) == _block_filter.end()) {
-          if (DEBUG_CUDA_CFGFACTORY) {
+          if (DEBUG_GPU_CFGFACTORY) {
             std::cout << "New block: " << block->name << " id: " << block->id << std::endl;
           }
           std::vector<Offset> inst_offsets;
           for (auto *inst : block->insts) {
             inst_offsets.push_back(inst->offset);
           }
-          ret_block = new CudaBlock(obj, region, block->address, inst_offsets);
+          ret_block = new GPUBlock(obj, region, block->address, inst_offsets);
           _block_filter[block->id] = ret_block;
           blocks_.add(ret_block);
         } else {
-          if (DEBUG_CUDA_CFGFACTORY) {
+          if (DEBUG_GPU_CFGFACTORY) {
             std::cout << "Old block: " << block->name << " id: " << block->id << std::endl;
           }
           ret_block = _block_filter[block->id];
@@ -50,20 +50,20 @@ Function *CudaCFGFactory::mkfunc(Address addr, FuncSource src,
 
         // Create edges and related blocks
         for (auto *target : block->targets) {
-          CudaBlock *ret_target_block = NULL;
+          GPUBlock *ret_target_block = NULL;
           if (_block_filter.find(target->block->id) == _block_filter.end()) {
-            if (DEBUG_CUDA_CFGFACTORY) {
+            if (DEBUG_GPU_CFGFACTORY) {
               std::cout << "New block: " << target->block->name << " id: " << target->block->id << std::endl;
             }
             std::vector<Offset> inst_offsets;
             for (auto *inst : target->block->insts) {
               inst_offsets.push_back(inst->offset);
             }
-            ret_target_block = new CudaBlock(obj, region, target->block->address, inst_offsets);
+            ret_target_block = new GPUBlock(obj, region, target->block->address, inst_offsets);
             _block_filter[target->block->id] = ret_target_block;
             blocks_.add(ret_target_block);
           } else {
-            if (DEBUG_CUDA_CFGFACTORY) {
+            if (DEBUG_GPU_CFGFACTORY) {
               std::cout << "Old block: " << target->block->name << " id: " << target->block->id << std::endl;
             }
             ret_target_block = _block_filter[target->block->id];
@@ -71,7 +71,7 @@ Function *CudaCFGFactory::mkfunc(Address addr, FuncSource src,
 
           Edge *ret_edge = new Edge(ret_block, ret_target_block, target->type);
           ret_edge->ignore_index();
-          if (DEBUG_CUDA_CFGFACTORY) {
+          if (DEBUG_GPU_CFGFACTORY) {
             std::cout << "Edge: "<< " -> " << target->block->name << std::endl;
           }
           ret_edge->install();
diff --git a/src/lib/banal/gpu/GPUCFGFactory.hpp b/src/lib/banal/gpu/GPUCFGFactory.hpp
new file mode 100644
index 0000000000..f878b71437
--- /dev/null
+++ b/src/lib/banal/gpu/GPUCFGFactory.hpp
@@ -0,0 +1,32 @@
+#ifndef BANAL_GPU_GPU_CFG_FACTORY_H
+#define BANAL_GPU_GPU_CFG_FACTORY_H
+
+#include <CFGFactory.h>
+#include <unordered_map>
+
+#include "GPUBlock.hpp"
+#include "DotCFG.hpp"
+
+namespace Dyninst {
+namespace ParseAPI {
+
+class PARSER_EXPORT GPUCFGFactory : public CFGFactory {   // CFGFactory that builds ParseAPI functions from pre-parsed GPUParse functions
+ public:
+  GPUCFGFactory(std::vector<GPUParse::Function *> &functions) :  // binds the reference member; `functions` is not copied
+    _functions(functions) {}
+  virtual ~GPUCFGFactory() {}
+
+ protected:
+  virtual Function * mkfunc(Address addr, FuncSource src,  // looks for an entry of _functions whose name equals `name`,
+    std::string name, CodeObject * obj, CodeRegion * region,  // builds a GPUFunction with its GPUBlocks and Edges from it,
+    Dyninst::InstructionSource * isrc);  // and reuses blocks already recorded in _block_filter
+
+ private:
+  std::vector<GPUParse::Function *> &_functions;  // reference to the caller's vector, so it must outlive this factory
+  std::unordered_map<size_t, GPUBlock *> _block_filter; // GPUParse block id -> GPUBlock already created for that id
+};
+
+}
+}
+
+#endif
diff --git a/src/lib/banal/cuda/CudaCodeSource.cpp b/src/lib/banal/gpu/GPUCodeSource.cpp
similarity index 57%
rename from src/lib/banal/cuda/CudaCodeSource.cpp
rename to src/lib/banal/gpu/GPUCodeSource.cpp
index c0d10c219e..9ed68f451d 100644
--- a/src/lib/banal/cuda/CudaCodeSource.cpp
+++ b/src/lib/banal/gpu/GPUCodeSource.cpp
@@ -1,10 +1,10 @@
-#include "CudaCodeSource.hpp"
+#include "GPUCodeSource.hpp"
 
 namespace Dyninst {
 namespace ParseAPI {
 
-CudaCodeSource::CudaCodeSource(
-  std::vector<CudaParse::Function *> &functions, Dyninst::SymtabAPI::Symtab *s) {
+GPUCodeSource::GPUCodeSource(
+  std::vector<GPUParse::Function *> &functions, Dyninst::SymtabAPI::Symtab *s) {
   for (auto *function : functions) {
     Address address = function->address;
     _hints.push_back(Hint(address, 0, 0, function->name));
diff --git a/src/lib/banal/cuda/CudaCodeSource.hpp b/src/lib/banal/gpu/GPUCodeSource.hpp
similarity index 89%
rename from src/lib/banal/cuda/CudaCodeSource.hpp
rename to src/lib/banal/gpu/GPUCodeSource.hpp
index 0f9c6f6ab4..1dce26878d 100644
--- a/src/lib/banal/cuda/CudaCodeSource.hpp
+++ b/src/lib/banal/gpu/GPUCodeSource.hpp
@@ -1,5 +1,5 @@
-#ifndef _CUDA_CODE_SOURCE_H_
-#define _CUDA_CODE_SOURCE_H_
+#ifndef BANAL_GPU_GPU_CODE_SOURCE_H
+#define BANAL_GPU_GPU_CODE_SOURCE_H
 
 #include <dyn_regs.h>
 #include <CodeSource.h>
@@ -9,11 +9,11 @@
 
 namespace Dyninst {
 namespace ParseAPI {
-  class PARSER_EXPORT CudaCodeSource : public /*Symtab */ CodeSource {
+  class PARSER_EXPORT GPUCodeSource : public /*Symtab */ CodeSource {
  public:
-  CudaCodeSource(std::vector<CudaParse::Function *> &functions, 
+  GPUCodeSource(std::vector<GPUParse::Function *> &functions, 
 		Dyninst::SymtabAPI::Symtab *s);
-  ~CudaCodeSource() {}
+  ~GPUCodeSource() {}
 
  public:
   /** InstructionSource implementation **/
diff --git a/src/lib/banal/cuda/CudaFunction.cpp b/src/lib/banal/gpu/GPUFunction.cpp
similarity index 57%
rename from src/lib/banal/cuda/CudaFunction.cpp
rename to src/lib/banal/gpu/GPUFunction.cpp
index 4470fdcfe5..f7d248d225 100644
--- a/src/lib/banal/cuda/CudaFunction.cpp
+++ b/src/lib/banal/gpu/GPUFunction.cpp
@@ -1,9 +1,9 @@
-#include "CudaFunction.hpp"
+#include "GPUFunction.hpp"
 
 namespace Dyninst {
 namespace ParseAPI {
 
-void CudaFunction::setEntry(Block *entry) {
+void GPUFunction::setEntry(Block *entry) {
   _region = entry->region();
   _entry = entry;
 }
diff --git a/src/lib/banal/cuda/CudaFunction.hpp b/src/lib/banal/gpu/GPUFunction.hpp
similarity index 53%
rename from src/lib/banal/cuda/CudaFunction.hpp
rename to src/lib/banal/gpu/GPUFunction.hpp
index 0c9af313bc..267d434e63 100644
--- a/src/lib/banal/cuda/CudaFunction.hpp
+++ b/src/lib/banal/gpu/GPUFunction.hpp
@@ -1,20 +1,20 @@
-#ifndef _CUDA_FUNCTION_H_
-#define _CUDA_FUNCTION_H_
+#ifndef BANAL_GPU_GPU_FUNCTION_H
+#define BANAL_GPU_GPU_FUNCTION_H
 
 #include <CFG.h>
 
 namespace Dyninst {
 namespace ParseAPI {
 
-class PARSER_EXPORT CudaFunction : public ParseAPI::Function {
+class PARSER_EXPORT GPUFunction : public ParseAPI::Function {
  public:
-  CudaFunction(Address addr, std::string name, CodeObject * obj, 
+  GPUFunction(Address addr, std::string name, CodeObject * obj, 
     CodeRegion * region, InstructionSource * isource) :
     Function(addr, name, obj, region, isource) {
     _cache_valid = true;
   }
 
-  virtual ~CudaFunction() {}
+  virtual ~GPUFunction() {}
 
   void setEntry(Block *entry);
 };
diff --git a/src/lib/banal/cuda/Graph.hpp b/src/lib/banal/gpu/Graph.hpp
similarity index 92%
rename from src/lib/banal/cuda/Graph.hpp
rename to src/lib/banal/gpu/Graph.hpp
index 5300e3610c..2439f0e6d9 100644
--- a/src/lib/banal/cuda/Graph.hpp
+++ b/src/lib/banal/gpu/Graph.hpp
@@ -1,11 +1,11 @@
-#ifndef _GRAPH_H_
-#define _GRAPH_H_
+#ifndef BANAL_GPU_GRAPH_H
+#define BANAL_GPU_GRAPH_H
 
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-namespace CudaParse {
+namespace GPUParse {
 
 struct Vertex {
   size_t id;
diff --git a/src/lib/banal/cuda/GraphReader.cpp b/src/lib/banal/gpu/GraphReader.cpp
similarity index 99%
rename from src/lib/banal/cuda/GraphReader.cpp
rename to src/lib/banal/gpu/GraphReader.cpp
index 522b0e1bff..e3b29151c4 100644
--- a/src/lib/banal/cuda/GraphReader.cpp
+++ b/src/lib/banal/gpu/GraphReader.cpp
@@ -5,7 +5,7 @@
 #define FUNC_LABEL "@function"
 #define TYPE_LABEL ".type"
 
-namespace CudaParse {
+namespace GPUParse {
 
 void GraphReader::read(Graph &graph) {
   // Read dot graph
diff --git a/src/lib/banal/cuda/GraphReader.hpp b/src/lib/banal/gpu/GraphReader.hpp
similarity index 88%
rename from src/lib/banal/cuda/GraphReader.hpp
rename to src/lib/banal/gpu/GraphReader.hpp
index d6e286e306..21b783fddf 100644
--- a/src/lib/banal/cuda/GraphReader.hpp
+++ b/src/lib/banal/gpu/GraphReader.hpp
@@ -1,5 +1,5 @@
-#ifndef _GRAPH_READER_H_
-#define _GRAPH_READER_H_
+#ifndef BANAL_GPU_GRAPH_READER_H
+#define BANAL_GPU_GRAPH_READER_H
 
 #include <unordered_map>
 #include <string>
@@ -7,7 +7,7 @@
 #include <boost/graph/detail/read_graphviz_new.hpp>
 #include "Graph.hpp"
 
-namespace CudaParse {
+namespace GPUParse {
 
 class GraphReader {
  public:
diff --git a/src/lib/banal/cuda/ReadCubinCFG.cpp b/src/lib/banal/gpu/ReadCudaCFG.cpp
similarity index 88%
rename from src/lib/banal/cuda/ReadCubinCFG.cpp
rename to src/lib/banal/gpu/ReadCudaCFG.cpp
index 07aa41ba33..7added0b9b 100644
--- a/src/lib/banal/cuda/ReadCubinCFG.cpp
+++ b/src/lib/banal/gpu/ReadCudaCFG.cpp
@@ -22,13 +22,13 @@
 #include <lib/support/FileUtil.hpp>
 
 
-#include "CudaCFGFactory.hpp"
-#include "CudaFunction.hpp"
-#include "CudaBlock.hpp"
-#include "CudaCodeSource.hpp"
-#include "CFGParser.hpp"
+#include "GPUCFGFactory.hpp"
+#include "GPUFunction.hpp"
+#include "GPUBlock.hpp"
+#include "GPUCodeSource.hpp"
+#include "CudaCFGParser.hpp"
 #include "GraphReader.hpp"
-#include "ReadCubinCFG.hpp"
+#include "ReadCudaCFG.hpp"
 
 using namespace Dyninst;
 using namespace ParseAPI;
@@ -80,10 +80,10 @@ parseDotCFG
  const std::string &cubin,
  int cuda_arch,
  Dyninst::SymtabAPI::Symtab *the_symtab,
- std::vector<CudaParse::Function *> &functions
+ std::vector<GPUParse::Function *> &functions
 ) 
 {
-  CudaParse::CFGParser cfg_parser;
+  GPUParse::CudaCFGParser cfg_parser;
   // Step 1: parse all function symbols
   std::vector<Symbol *> symbols;
   the_symtab->getAllSymbols(symbols);
@@ -103,7 +103,7 @@ parseDotCFG
   // Store functions that are parsed by nvdisasm
   std::vector<Symbol *> parsed_function_symbols;
   // Remove functions that share the same names
-  std::map<std::string, CudaParse::Function *> function_map;
+  std::map<std::string, GPUParse::Function *> function_map;
   // Test valid symbols
   for (auto *symbol : symbols) {
     if (symbol->getType() == Dyninst::SymtabAPI::Symbol::ST_FUNCTION) {
@@ -113,9 +113,9 @@ parseDotCFG
       if (system(cmd.c_str()) == 0) {
         parsed_function_symbols.push_back(symbol);
         // Only parse valid symbols
-        CudaParse::GraphReader graph_reader(dot_filename);
-        CudaParse::Graph graph;
-        std::vector<CudaParse::Function *> funcs;
+        GPUParse::GraphReader graph_reader(dot_filename);
+        GPUParse::Graph graph;
+        std::vector<GPUParse::Function *> funcs;
         graph_reader.read(graph);
         cfg_parser.parse(graph, funcs);
         // Local functions inside a global function cannot be independently parsed
@@ -179,16 +179,16 @@ parseDotCFG
   // For functions that cannot be parsed
   for (auto *symbol : unparsable_function_symbols) {
     auto function_name = symbol->getMangledName();
-    auto *function = new CudaParse::Function(max_function_id++, std::move(function_name));
+    auto *function = new GPUParse::Function(max_function_id++, std::move(function_name));
     function->address = symbol->getOffset();
     auto block_name = symbol->getMangledName() + "_0";
-    auto *block = new CudaParse::Block(max_block_id++, std::move(block_name));
+    auto *block = new GPUParse::Block(max_block_id++, std::move(block_name));
     block->begin_offset = cuda_arch >= 70 ? 0 : 8;
     block->address = symbol->getOffset() + block->begin_offset;
     int len = cuda_arch >= 70 ? 16 : 8;
     // Add dummy insts
     for (size_t i = block->address; i < block->address + symbol->getSize(); i += len) {
-      block->insts.push_back(new CudaParse::Inst(i));
+      block->insts.push_back(new GPUParse::CudaInst(i));
     }
     function->blocks.push_back(block);
     functions.push_back(function);
@@ -207,18 +207,18 @@ parseDotCFG
             std::cout << function->name << " append nop instructions" << std::endl;
             std::cout << "function_size: " << function_size << " < " << "symbol_size: " << symbol_size << std::endl;
           }
-          auto *block = new CudaParse::Block(max_block_id, ".L_" + std::to_string(max_block_id));
+          auto *block = new GPUParse::Block(max_block_id, ".L_" + std::to_string(max_block_id));
           block->address = function_size + function->address;
           block->begin_offset = cuda_arch >= 70 ? 16 : 8;
           max_block_id++;
           while (function_size < symbol_size) {
-            block->insts.push_back(new CudaParse::Inst(function_size + function->address));
+            block->insts.push_back(new GPUParse::Inst(function_size + function->address));
             function_size += len;
           } 
           if (function->blocks.size() > 0) {
             auto *last_block = function->blocks.back();
             last_block->targets.push_back(
-              new CudaParse::Target(last_block->insts.back(), block, CudaParse::TargetType::DIRECT));
+              new GPUParse::Target(last_block->insts.back(), block, GPUParse::TargetType::DIRECT));
           }
           function->blocks.push_back(block);
         }
@@ -301,10 +301,10 @@ readCubinCFG
     if (!dump_cubin_success) {
       std::cout << "WARNING: unable to write a cubin to the file system to analyze its CFG" << std::endl; 
     } else {
-      std::vector<CudaParse::Function *> functions;
+      std::vector<GPUParse::Function *> functions;
       parseDotCFG(search_path, elfFile->getFileName(), dot, cubin, elfFile->getArch(), the_symtab, functions);
-      CFGFactory *cfg_fact = new CudaCFGFactory(functions);
-      *code_src = new CudaCodeSource(functions, the_symtab); 
+      CFGFactory *cfg_fact = new GPUCFGFactory(functions);
+      *code_src = new GPUCodeSource(functions, the_symtab); 
       *code_obj = new CodeObject(*code_src, cfg_fact);
       (*code_obj)->parse();
       unlink(dot.c_str());
diff --git a/src/lib/banal/cuda/ReadCubinCFG.hpp b/src/lib/banal/gpu/ReadCudaCFG.hpp
similarity index 77%
rename from src/lib/banal/cuda/ReadCubinCFG.hpp
rename to src/lib/banal/gpu/ReadCudaCFG.hpp
index 40514923c8..e259e01dc5 100644
--- a/src/lib/banal/cuda/ReadCubinCFG.hpp
+++ b/src/lib/banal/gpu/ReadCudaCFG.hpp
@@ -1,4 +1,7 @@
-#include <set>
+#ifndef BANAL_GPU_READ_CUDA_CFG_HPP
+#define BANAL_GPU_READ_CUDA_CFG_HPP
+
+#include <string>
 
 #include <lib/binutils/VMAInterval.hpp>
 #include <lib/binutils/ElfHelper.hpp>
@@ -16,3 +19,5 @@ readCubinCFG
  Dyninst::ParseAPI::CodeSource **code_src, 
  Dyninst::ParseAPI::CodeObject **code_obj
 );
+
+#endif
diff --git a/src/lib/banal/intel/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
similarity index 90%
rename from src/lib/banal/intel/ReadIntelCFG.cpp
rename to src/lib/banal/gpu/ReadIntelCFG.cpp
index c817751485..1400871ae8 100644
--- a/src/lib/banal/intel/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -70,11 +70,11 @@
 #include <lib/binutils/ElfHelper.hpp>
 #include <lib/support/diagnostics.h>
 
-#include "../cuda/DotCFG.hpp"
-#include "IntelCFGFactory.hpp"
-#include "IntelFunction.hpp"
-#include "IntelBlock.hpp"
-#include "IntelCodeSource.hpp"
+#include "DotCFG.hpp"
+#include "GPUCFGFactory.hpp"
+#include "GPUFunction.hpp"
+#include "GPUBlock.hpp"
+#include "GPUCodeSource.hpp"
 #include "ReadIntelCFG.hpp"
 
 //******************************************************************************
@@ -145,11 +145,11 @@ parseIntelCFG
 (
  char *text_section,
  int text_section_size,
- CudaParse::Function &function
+ GPUParse::Function &function
 )
 {
   KernelView kv(IGA_GEN9, text_section, text_section_size, iga::SWSB_ENCODE_MODE::SingleDistPipe);
-  std::map<int, CudaParse::Block *> block_offset_map;
+  std::map<int, GPUParse::Block *> block_offset_map;
 
   int offset = 0;
   int size = 0;
@@ -157,14 +157,14 @@ parseIntelCFG
 
   // Construct basic blocks
   while (offset < text_section_size) {
-    auto *block = new CudaParse::Block(block_id, offset, function.name + "_" + std::to_string(block_id)); 
+    auto *block = new GPUParse::Block(block_id, offset, function.name + "_" + std::to_string(block_id)); 
     block_id++;
 
     function.blocks.push_back(block);
     block_offset_map[offset] = block;
 
     size = kv.getInstSize(offset);
-    auto *inst = new CudaParse::Inst(offset, size);
+    auto *inst = new GPUParse::Inst(offset, size);
     block->insts.push_back(inst);
 
     while (!kv.isInstTarget(offset + size) && (offset + size < text_section_size)) {
@@ -175,7 +175,7 @@ parseIntelCFG
         break;
       }
 
-      inst = new CudaParse::Inst(offset, size);
+      inst = new GPUParse::Inst(offset, size);
       block->insts.push_back(inst);
     }
 
@@ -204,7 +204,7 @@ parseIntelCFG
     for (size_t j = 0; j < jump_targets_count + 1; j++) {
       auto *target_block = block_offset_map.at(jump_targets[j]);
       // TODO(Aaron): call edge
-      auto type = CudaParse::TargetType::DIRECT;
+      auto type = GPUParse::TargetType::DIRECT;
       // Jump
       bool added = false;
       for (auto *target : block->targets) {
@@ -213,7 +213,7 @@ parseIntelCFG
         }
       }
       if (!added) {
-        block->targets.push_back(new CudaParse::Target(inst, target_block, type));
+        block->targets.push_back(new GPUParse::Target(inst, target_block, type));
       }
     }
   }
@@ -252,12 +252,12 @@ readIntelCFG
     return false;
   }
 
-  CudaParse::Function function(0, function_name);
+  GPUParse::Function function(0, function_name);
   parseIntelCFG(text_section, text_section_size, function);
-  std::vector<CudaParse::Function *> functions = {&function};
+  std::vector<GPUParse::Function *> functions = {&function};
 
-  CFGFactory *cfg_fact = new IntelCFGFactory(functions);
-  *code_src = new IntelCodeSource(functions, the_symtab); 
+  CFGFactory *cfg_fact = new GPUCFGFactory(functions);
+  *code_src = new GPUCodeSource(functions, the_symtab); 
   *code_obj = new CodeObject(*code_src, cfg_fact);
   (*code_obj)->parse();
 
diff --git a/src/lib/banal/intel/ReadIntelCFG.hpp b/src/lib/banal/gpu/ReadIntelCFG.hpp
similarity index 97%
rename from src/lib/banal/intel/ReadIntelCFG.hpp
rename to src/lib/banal/gpu/ReadIntelCFG.hpp
index 3730ec98a2..0f2c78e28c 100644
--- a/src/lib/banal/intel/ReadIntelCFG.hpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.hpp
@@ -46,10 +46,9 @@
 // system includes
 //******************************************************************************
 
-#ifndef BANAL_INTEL_READ_INTEL_CFG_HPP
-#define BANAL_INTEL_READ_INTEL_CFG_HPP
+#ifndef BANAL_GPU_READ_INTEL_CFG_HPP
+#define BANAL_GPU_READ_INTEL_CFG_HPP
 
-#include <iostream>
 #include <string>
 #include <Symtab.h>
 #include <CodeSource.h>
diff --git a/src/lib/banal/intel/IntelBlock.cpp b/src/lib/banal/intel/IntelBlock.cpp
deleted file mode 100644
index 9786e6e6e9..0000000000
--- a/src/lib/banal/intel/IntelBlock.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "IntelBlock.hpp"
-#include <Instruction.h>
-
-
-namespace Dyninst {
-namespace ParseAPI {
-
-IntelBlock::IntelBlock(CodeObject * o, CodeRegion * r,
-  Address start, std::vector<Offset> &offsets) : Block(o, r, start) {
-  for (auto offset : offsets) {
-    _inst_offsets.push_back(offset);
-  }
-}
-
-
-Address IntelBlock::last() const {
-  return this->_inst_offsets.back();
-}
-
-
-void IntelBlock::getInsns(Insns &insns) const {
-  for (auto offset : _inst_offsets) {
-#ifdef DYNINST_INSTRUCTION_PTR
-    insns.insert(std::pair<long unsigned int, 
-      InstructionAPI::InstructionPtr>(offset, NULL));
-#else
-    InstructionAPI::Instruction inst;    
-    insns[offset] = inst;
-#endif
-  }
-}
-
-}
-}
diff --git a/src/lib/banal/intel/IntelBlock.hpp b/src/lib/banal/intel/IntelBlock.hpp
deleted file mode 100644
index fbdaa732c7..0000000000
--- a/src/lib/banal/intel/IntelBlock.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef BANAL_INTEL_INTEL_BLOCK_HPP
-#define BANAL_INTEL_INTEL_BLOCK_HPP
-
-#include <CFG.h>
-
-namespace Dyninst {
-namespace ParseAPI {
-
-class PARSER_EXPORT IntelBlock : public Block {
- public:
-  IntelBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<Offset> &offsets);
-
-  virtual ~IntelBlock() {}
-
-  virtual void getInsns(Insns &insns) const;
-
-  virtual Address last() const;
-
- private:
-  std::vector<Offset> _inst_offsets;
-};
-
-}
-}
-
-#endif
diff --git a/src/lib/banal/intel/IntelCFGFactory.cpp b/src/lib/banal/intel/IntelCFGFactory.cpp
deleted file mode 100644
index 092ddc99d1..0000000000
--- a/src/lib/banal/intel/IntelCFGFactory.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "IntelCFGFactory.hpp"
-#include "IntelFunction.hpp"
-#include <iostream>
-
-#define DEBUG_INTEL_CFGFACTORY 0
-
-namespace Dyninst {
-namespace ParseAPI {
-
-Function *IntelCFGFactory::mkfunc(Address addr, FuncSource src, 
-  std::string name, CodeObject * obj, CodeRegion * region, 
-  Dyninst::InstructionSource * isrc) {
-  // Find function by name
-  for (auto *function : _functions) {
-    if (function->name == name) {
-      IntelFunction *ret_func = new IntelFunction(function->address, name, obj, region, isrc);
-
-      bool first_entry = true;
-      if (DEBUG_INTEL_CFGFACTORY) {
-        std::cout << "Function: " << function->name << " addr: 0x" <<
-          std::hex << addr << std::dec << std::endl;
-      }
-      for (auto *block : function->blocks) {
-        IntelBlock *ret_block = NULL;
-        // If a block has not been created by callers, create it
-        // Otherwise get the block from _block_filter
-        if (_block_filter.find(block->id) == _block_filter.end()) {
-          if (DEBUG_INTEL_CFGFACTORY) {
-            std::cout << "New block: " << block->name << " id: " << block->id << std::endl;
-          }
-          std::vector<Offset> inst_offsets;
-          for (auto *inst : block->insts) {
-            inst_offsets.push_back(inst->offset);
-          }
-          ret_block = new IntelBlock(obj, region, block->address, inst_offsets);
-          _block_filter[block->id] = ret_block;
-          blocks_.add(ret_block);
-        } else {
-          if (DEBUG_INTEL_CFGFACTORY) {
-            std::cout << "Old block: " << block->name << " id: " << block->id << std::endl;
-          }
-          ret_block = _block_filter[block->id];
-        }
-        ret_func->add_block(ret_block);
-
-        if (first_entry) {
-          ret_func->setEntry(ret_block);
-          first_entry = false;
-        }
-
-        // Create edges and related blocks
-        for (auto *target : block->targets) {
-          IntelBlock *ret_target_block = NULL;
-          if (_block_filter.find(target->block->id) == _block_filter.end()) {
-            if (DEBUG_INTEL_CFGFACTORY) {
-              std::cout << "New block: " << target->block->name << " id: " << target->block->id << std::endl;
-            }
-            std::vector<Offset> inst_offsets;
-            for (auto *inst : target->block->insts) {
-              inst_offsets.push_back(inst->offset);
-            }
-            ret_target_block = new IntelBlock(obj, region, target->block->address, inst_offsets);
-            _block_filter[target->block->id] = ret_target_block;
-            blocks_.add(ret_target_block);
-          } else {
-            if (DEBUG_INTEL_CFGFACTORY) {
-              std::cout << "Old block: " << target->block->name << " id: " << target->block->id << std::endl;
-            }
-            ret_target_block = _block_filter[target->block->id];
-          }
-
-          Edge *ret_edge = new Edge(ret_block, ret_target_block, target->type);
-          ret_edge->ignore_index();
-          if (DEBUG_INTEL_CFGFACTORY) {
-            std::cout << "Edge: "<< " -> " << target->block->name << std::endl;
-          }
-          ret_edge->install();
-          edges_.add(ret_edge);
-        }
-      }
-      return ret_func;
-    }
-  }
-  return NULL;
-  // iterate blocks
-  // add blocks
-  // iterate targets
-  // add edges
-}
-
-}
-}
diff --git a/src/lib/banal/intel/IntelCFGFactory.hpp b/src/lib/banal/intel/IntelCFGFactory.hpp
deleted file mode 100644
index 939154b16c..0000000000
--- a/src/lib/banal/intel/IntelCFGFactory.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef BANAL_INTEL_INTEL_CFG_FACTORY_HPP
-#define BANAL_INTEL_INTEL_CFG_FACTORY_HPP
-
-#include <CFGFactory.h>
-#include <unordered_map>
-
-#include "IntelBlock.hpp"
-#include "../cuda/DotCFG.hpp"
-
-namespace Dyninst {
-namespace ParseAPI {
-
-class PARSER_EXPORT IntelCFGFactory : public CFGFactory {   
- public:
-  IntelCFGFactory(std::vector<CudaParse::Function *> &functions) :
-    _functions(functions) {}
-  virtual ~IntelCFGFactory() {}
-
- protected:
-  virtual Function * mkfunc(Address addr, FuncSource src, 
-    std::string name, CodeObject * obj, CodeRegion * region,
-    Dyninst::InstructionSource * isrc);
-
- private:
-  std::vector<CudaParse::Function *> &_functions;
-  std::unordered_map<size_t, IntelBlock *> _block_filter; 
-};
-
-}
-}
-
-#endif
diff --git a/src/lib/banal/intel/IntelCodeSource.cpp b/src/lib/banal/intel/IntelCodeSource.cpp
deleted file mode 100644
index eaf2346c8b..0000000000
--- a/src/lib/banal/intel/IntelCodeSource.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "IntelCodeSource.hpp"
-
-namespace Dyninst {
-namespace ParseAPI {
-
-IntelCodeSource::IntelCodeSource(
-  std::vector<CudaParse::Function *> &functions, Dyninst::SymtabAPI::Symtab *s) {
-  for (auto *function : functions) {
-    Address address = function->address;
-    _hints.push_back(Hint(address, 0, 0, function->name));
-  }
-}
-
-}
-}
diff --git a/src/lib/banal/intel/IntelCodeSource.hpp b/src/lib/banal/intel/IntelCodeSource.hpp
deleted file mode 100644
index ed97cbdfd2..0000000000
--- a/src/lib/banal/intel/IntelCodeSource.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef BANAL_INTEL_INTEL_CODE_SOURCE_HPP
-#define BANAL_INTEL_INTEL_CODE_SOURCE_HPP
-
-#include <dyn_regs.h>
-#include <CodeSource.h>
-#include <Symtab.h>
-
-#include "../cuda/DotCFG.hpp"
-
-namespace Dyninst {
-namespace ParseAPI {
-  class PARSER_EXPORT IntelCodeSource : public /*Symtab */ CodeSource {
- public:
-  IntelCodeSource(std::vector<CudaParse::Function *> &functions, 
-		Dyninst::SymtabAPI::Symtab *s);
-  ~IntelCodeSource() {}
-
- public:
-  /** InstructionSource implementation **/
-  virtual bool isValidAddress(const Address) const { return false; }
-  virtual void* getPtrToInstruction(const Address) const { return NULL; }
-  virtual void* getPtrToData(const Address) const { return NULL; }
-  virtual unsigned int getAddressWidth() const { return 0; }
-  virtual bool isCode(const Address) const { return false; }
-  virtual bool isData(const Address) const { return false; }
-  virtual bool isReadOnly(const Address) const { return false; }
-  virtual Address offset() const { return 0; }
-  virtual Address length() const { return 0; }
-  virtual Architecture getArch() const { return Arch_cuda; }
-
-  virtual bool nonReturning(Address /*func_entry*/) { return false; }                                                                                  
-	virtual bool nonReturningSyscall(int /*number*/) { return false; }
-
-	/* If the binary file type supplies per-function
-	 * TOC's (e.g. ppc64 Linux), override.
-	 */
-  virtual Address getTOC(Address) const { return _table_of_contents; }
-
-  // statistics accessor
-  virtual void print_stats() const { return; }                                                                                                         
-  virtual bool have_stats() const { return false; }
-
-  // manage statistics
-  virtual void incrementCounter(const std::string& /*name*/) const { return; } 
-  virtual void addCounter(const std::string& /*name*/, int /*num*/) const { return; }
-  virtual void decrementCounter(const std::string& /*name*/) const { return; }
-  virtual void startTimer(const std::string& /*name*/) const { return; } 
-  virtual void stopTimer(const std::string& /*name*/) const { return; }
-  virtual bool findCatchBlockByTryRange(Address /*given try address*/, std::set<Address> & /* catch start */)  const { return false; }
-};
-
-}
-}
-
-#endif
diff --git a/src/lib/banal/intel/IntelFunction.cpp b/src/lib/banal/intel/IntelFunction.cpp
deleted file mode 100644
index 8184f5f9ae..0000000000
--- a/src/lib/banal/intel/IntelFunction.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "IntelFunction.hpp"
-
-namespace Dyninst {
-namespace ParseAPI {
-
-void IntelFunction::setEntry(Block *entry) {
-  _region = entry->region();
-  _entry = entry;
-}
-
-}
-}
diff --git a/src/lib/banal/intel/IntelFunction.hpp b/src/lib/banal/intel/IntelFunction.hpp
deleted file mode 100644
index 72305a3af0..0000000000
--- a/src/lib/banal/intel/IntelFunction.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef BANAL_INTEL_INTEL_FUNCTION_HPP
-#define BANAL_INTEL_INTEL_FUNCTION_HPP
-
-#include <CFG.h>
-
-namespace Dyninst {
-namespace ParseAPI {
-
-class PARSER_EXPORT IntelFunction : public ParseAPI::Function {
- public:
-  IntelFunction(Address addr, std::string name, CodeObject * obj, 
-    CodeRegion * region, InstructionSource * isource) :
-    Function(addr, name, obj, region, isource) {
-    _cache_valid = true;
-  }
-
-  virtual ~IntelFunction() {}
-
-  void setEntry(Block *entry);
-};
-
-}
-}
-
-#endif
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index ac03b799c8..81a833e59d 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -75,7 +75,7 @@
 
 #include "ElfHelper.hpp"
 #include "Fatbin.hpp"
-#include "intel/IntelGPUbinutils.hpp"
+#include "intel/IntelGPUBinutils.hpp"
 #include "InputFile.hpp"
 
 
@@ -206,9 +206,8 @@ InputFile::openFile
   if (result) {
     filevector = new ElfFileVector;
     if (isIntelGPUFile(elfFile)) {
-      findIntelGPUbins(elfFile, filevector);
-    }
-    else {
+      findIntelGPUBins(elfFile, filevector);
+    } else {
       filevector->push_back(elfFile);
     }
     //findCubins(elfFile, filevector);
diff --git a/src/lib/binutils/Makefile.am b/src/lib/binutils/Makefile.am
index 1223a53e40..d82f300ac3 100644
--- a/src/lib/binutils/Makefile.am
+++ b/src/lib/binutils/Makefile.am
@@ -106,7 +106,7 @@ MYSOURCES = \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp \
-	intel/IntelGPUbinutils.cpp
+	intel/IntelGPUBinutils.cpp
 
 
 #############################################################################
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index ff79979420..56bed3dbcf 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -149,7 +149,7 @@ am__objects_1 = libHPCbinutils_la-LM.lo libHPCbinutils_la-Seg.lo \
 	libHPCbinutils_la-Fatbin.lo libHPCbinutils_la-ElfHelper.lo \
 	libHPCbinutils_la-InputFile.lo \
 	libHPCbinutils_la-RelocateCubin.lo \
-	intel/libHPCbinutils_la-IntelGPUbinutils.lo
+	intel/libHPCbinutils_la-IntelGPUBinutils.lo
 am_libHPCbinutils_la_OBJECTS = $(am__objects_1)
 libHPCbinutils_la_OBJECTS = $(am_libHPCbinutils_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
@@ -565,7 +565,7 @@ MYSOURCES = \
 	ElfHelper.cpp \
 	InputFile.cpp \
 	RelocateCubin.cpp \
-	intel/IntelGPUbinutils.cpp
+	intel/IntelGPUBinutils.cpp
 
 
 #############################################################################
@@ -652,7 +652,7 @@ intel/$(am__dirstamp):
 intel/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) intel/$(DEPDIR)
 	@: > intel/$(DEPDIR)/$(am__dirstamp)
-intel/libHPCbinutils_la-IntelGPUbinutils.lo: intel/$(am__dirstamp) \
+intel/libHPCbinutils_la-IntelGPUBinutils.lo: intel/$(am__dirstamp) \
 	intel/$(DEPDIR)/$(am__dirstamp)
 
 libHPCbinutils.la: $(libHPCbinutils_la_OBJECTS) $(libHPCbinutils_la_DEPENDENCIES) $(EXTRA_libHPCbinutils_la_DEPENDENCIES) 
@@ -681,7 +681,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-SimpleSymbols.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-SimpleSymbolsFactories.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libHPCbinutils_la-VMAInterval.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUBinutils.Plo@am__quote@
 
 .cpp.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
@@ -812,12 +812,12 @@ libHPCbinutils_la-RelocateCubin.lo: RelocateCubin.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o libHPCbinutils_la-RelocateCubin.lo `test -f 'RelocateCubin.cpp' || echo '$(srcdir)/'`RelocateCubin.cpp
 
-intel/libHPCbinutils_la-IntelGPUbinutils.lo: intel/IntelGPUbinutils.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbinutils_la-IntelGPUbinutils.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo -c -o intel/libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Tpo intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUbinutils.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelGPUbinutils.cpp' object='intel/libHPCbinutils_la-IntelGPUbinutils.lo' libtool=yes @AMDEPBACKSLASH@
+intel/libHPCbinutils_la-IntelGPUBinutils.lo: intel/IntelGPUBinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -MT intel/libHPCbinutils_la-IntelGPUBinutils.lo -MD -MP -MF intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUBinutils.Tpo -c -o intel/libHPCbinutils_la-IntelGPUBinutils.lo `test -f 'intel/IntelGPUBinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUBinutils.cpp
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUBinutils.Tpo intel/$(DEPDIR)/libHPCbinutils_la-IntelGPUBinutils.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='intel/IntelGPUBinutils.cpp' object='intel/libHPCbinutils_la-IntelGPUBinutils.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbinutils_la-IntelGPUbinutils.lo `test -f 'intel/IntelGPUbinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUbinutils.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libHPCbinutils_la_CXXFLAGS) $(CXXFLAGS) -c -o intel/libHPCbinutils_la-IntelGPUBinutils.lo `test -f 'intel/IntelGPUBinutils.cpp' || echo '$(srcdir)/'`intel/IntelGPUBinutils.cpp
 
 mostlyclean-libtool:
 	-rm -f *.lo
diff --git a/src/lib/binutils/intel/IntelGPUbinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
similarity index 64%
rename from src/lib/binutils/intel/IntelGPUbinutils.cpp
rename to src/lib/binutils/intel/IntelGPUBinutils.cpp
index 0a32fae64b..c1ac80c8a5 100644
--- a/src/lib/binutils/intel/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -69,7 +69,7 @@
 #include <lib/binutils/ElfHelper.hpp>
 #include <lib/support/diagnostics.h>
 #include <lib/support/RealPathMgr.cpp>
-#include "IntelGPUbinutils.hpp"
+#include "IntelGPUBinutils.hpp"
 
 
 //******************************************************************************
@@ -123,7 +123,7 @@ read_all(int fd, void *buf, size_t count)
 
 
 static const char*
-openclElfSectionType
+opencl_elf_section_type
 (
   Elf64_Word sh_type
 )
@@ -164,97 +164,63 @@ openclElfSectionType
 static bool
 extract_kernelelfs
 (
-  std::vector<uint8_t> &symbols,
-  ElfFileVector *filevector
+ char *section_data,
+ size_t section_size,
+ ElfFileVector *filevector
 )
 {
-  bool extractSuccess = true;
-  const uint8_t* ptr = symbols.data();
+  const char *ptr = section_data;
   const SProgramDebugDataHeaderIGC* header =
     reinterpret_cast<const SProgramDebugDataHeaderIGC*>(ptr);
   ptr += sizeof(SProgramDebugDataHeaderIGC);
 
   if (header->NumberOfKernels == 0) {
-    extractSuccess = false;
-    return extractSuccess;
+    return false;
   }
   
   for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
-    const SKernelDebugDataHeaderIGC* kernel_header =
+    const SKernelDebugDataHeaderIGC *kernel_header =
       reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
     ptr += sizeof(SKernelDebugDataHeaderIGC);
 
-    const char* kernel_name = reinterpret_cast<const char*>(ptr);
+    const char *kernel_name = reinterpret_cast<const char*>(ptr);
     std::string file_name = std::string(kernel_name) + ".gpubin";
+    std::cout << "intel " << file_name << std::endl;
 
     unsigned kernel_name_size_aligned = sizeof(uint32_t) *
       (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
     ptr += kernel_name_size_aligned;
 
     if (kernel_header->SizeVisaDbgInBytes > 0) {
-      /*FILE *f_ptr = fopen(kernel_name, "wb");
-      fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, f_ptr);
-      fclose(f_ptr);*/
-      std::ifstream in(kernel_name);
-      std::string file_contents((std::istreambuf_iterator<char>(in)), 
-          std::istreambuf_iterator<char>());
-
-      ElfFile *elfFile = new ElfFile;
+      ElfFile *elf_file = new ElfFile;
       int file_fd = open(file_name.c_str(), O_RDONLY);
       size_t f_size = file_size(file_fd);
       char *file_buffer = (char *)malloc(f_size);
       size_t bytes = read_all(file_fd, file_buffer, f_size);
 
-      if (elfFile->open(file_buffer, f_size, file_name)) {
-        extractSuccess = true;
-        filevector->push_back(elfFile);
+      if (elf_file->open(file_buffer, f_size, file_name)) {
+        filevector->push_back(elf_file);
       } else {
-        extractSuccess = false;
-        break;
-      }
-      
-      /*
-      // start cfg generation
-      Elf *elf = elfFile->getElf();
-      file_buffer = elfFile->getMemory();
-      ElfSectionVector *sections = elfGetSectionVector(elf);
-      GElf_Ehdr ehdr_v;
-      GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
-
-      if (ehdr) {
-        for (auto si = sections->begin(); si != sections->end(); si++) {
-          Elf_Scn *scn = *si;
-          GElf_Shdr shdr_v;
-          GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
-          if (!shdr) continue;
-          char *sectionData = elfSectionGetData(file_buffer, shdr);
-          const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-          if (strcmp(section_name, ".text") == 0) {
-            std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
-                reinterpret_cast<uint8_t*>(sectionData) + kernel_header->SizeVisaDbgInBytes);
-            printCFGInDotGraph(kernel_name, intelRawGenBinary);
-          }
-        }
+        // Cannot handle a kernel
+        return false;
       }
-      */
-      //end cfg generation
     } else {
-      extractSuccess = false;
-      break;
+      // Kernel does not have debug info
+      return false;
     }
   }
 
-  return extractSuccess;
+  return true;
 }
 
 
 static bool
-isCustomOpenCLBinary
+is_custom_opencl_binary
 (
-  const char *section_name
+ const std::string &section_name
 )
 {
-  return (strcmp(section_name, ".SHT_OPENCL_DEV_DEBUG") == 0);
+  return section_name == ".SHT_OPENCL_DEV_DEBUG";
 }
 
 //******************************************************************************
@@ -262,14 +228,14 @@ isCustomOpenCLBinary
 //******************************************************************************
 
 bool
-findIntelGPUbins
+findIntelGPUBins
 (
-  ElfFile *elfFile,
-  ElfFileVector *filevector
+ ElfFile *elfFile,
+ ElfFileVector *filevector
 )
 {
-  bool fileHasDebugSection = false;
-  bool extractSuccess = false;
+  bool has_debug_section = false;
+  bool extract_file = false;
 
   Elf *elf = elfFile->getElf();
   char *file_buffer = elfFile->getMemory();
@@ -283,37 +249,31 @@ findIntelGPUbins
       GElf_Shdr shdr_v;
       GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
       if (!shdr) continue;
-      char *sectionData = elfSectionGetData(file_buffer, shdr);
-      const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-      //std::cerr << "section name: " << section_name << ". section type: " << openclElfSectionType(shdr->sh_type) << std::endl;
+      char *section_data = elfSectionGetData(file_buffer, shdr);
+      std::string section_name = std::string(elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name));
+      std::cout << "section name: " << section_name << ". section type: " << opencl_elf_section_type(shdr->sh_type) << std::endl;
 
       // extract debug section
-      if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && strcmp(section_name, INTEL_GPU_DEBUG_SECTION_NAME) == 0)
-          || isCustomOpenCLBinary(section_name)) {
-        fileHasDebugSection = true;
-        std::vector<uint8_t> debug_info(reinterpret_cast<uint8_t*>(sectionData), reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
-        extractSuccess = extract_kernelelfs(debug_info, filevector);
+      if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && section_name == INTEL_GPU_DEBUG_SECTION_NAME)
+        || is_custom_opencl_binary(section_name)) {
+        has_debug_section = true;
+        extract_file = extract_kernelelfs(section_data, shdr->sh_size, filevector);
         break;
-      } /*else if (strcmp(section_name, ".text") == 0) {
-        FILE *bin_ptr;
-        bin_ptr = fopen("switch.text", "wb");
-        fwrite(sectionData, shdr->sh_size, 1, bin_ptr);
-        fclose(bin_ptr);
-      }*/
+      }
     }
   }
-  // TODO(Aaron): why put this section here?
-  FILE *fptr;
-  if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
-    fileHasDebugSection = true;
-    fseek(fptr, 0L, SEEK_END);
-    size_t debug_info_size = ftell(fptr);
-    printf("debug_info_size: %zu\n", debug_info_size);
-    rewind(fptr);
-    std::vector<uint8_t> debug_info(debug_info_size);
-    fread(debug_info.data(), debug_info_size, 1, fptr);
-    extractSuccess = extract_kernelelfs(debug_info, filevector);
-  }
-  bool success = fileHasDebugSection && extractSuccess;
-  return success; 
+  //// TODO(Aaron): why put this section here?
+  //FILE *fptr;
+  //if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
+  //  fileHasDebugSection = true;
+  //  fseek(fptr, 0L, SEEK_END);
+  //  size_t debug_info_size = ftell(fptr);
+  //  printf("debug_info_size: %zu\n", debug_info_size);
+  //  rewind(fptr);
+  //  std::vector<uint8_t> debug_info(debug_info_size);
+  //  fread(debug_info.data(), debug_info_size, 1, fptr);
+  //  extractSuccess = extract_kernelelfs(debug_info, filevector);
+  //}
+  //bool success = fileHasDebugSection && extractSuccess;
+  return extract_file && has_debug_section; 
 }
diff --git a/src/lib/binutils/intel/IntelGPUbinutils.hpp b/src/lib/binutils/intel/IntelGPUBinutils.hpp
similarity index 99%
rename from src/lib/binutils/intel/IntelGPUbinutils.hpp
rename to src/lib/binutils/intel/IntelGPUBinutils.hpp
index 4f64553502..56564f934a 100644
--- a/src/lib/binutils/intel/IntelGPUbinutils.hpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.hpp
@@ -73,7 +73,7 @@ enum SHT_OPENCL : uint32_t {
 //******************************************************************************
 
 bool
-findIntelGPUbins
+findIntelGPUBins
 (
 	ElfFile *elfFile,
 	ElfFileVector *filevector

From aeacef0eb1d56f4c7e947142bbf733e2ad8a7c59 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris19.ftm.alcf.anl.gov>
Date: Tue, 22 Sep 2020 15:24:56 +0000
Subject: [PATCH 034/177] code for adding block and instruction offsets in
 struct files for intel GPU

---
 src/lib/banal/Struct-Output.cpp             | 41 +++++++++++
 src/lib/banal/Struct-Output.hpp             | 37 ++++++++++
 src/lib/banal/Struct.cpp                    | 39 +++++++---
 src/lib/binutils/InputFile.cpp              |  8 ++-
 src/lib/binutils/InputFile.hpp              |  2 +-
 src/lib/binutils/LM.cpp                     |  2 +-
 src/lib/binutils/intel/CreateCFG.cpp        | 79 +++++++++++++++++++--
 src/lib/binutils/intel/CreateCFG.hpp        | 13 +++-
 src/lib/binutils/intel/IntelGPUbinutils.cpp |  3 +-
 9 files changed, 202 insertions(+), 22 deletions(-)

diff --git a/src/lib/banal/Struct-Output.cpp b/src/lib/banal/Struct-Output.cpp
index c4e5ddb851..092c424801 100644
--- a/src/lib/banal/Struct-Output.cpp
+++ b/src/lib/banal/Struct-Output.cpp
@@ -72,8 +72,11 @@
 #include <map>
 #include <ostream>
 #include <string>
+#include <fcntl.h>
 
 #include <lib/binutils/VMAInterval.hpp>
+#include <lib/binutils/ElfHelper.hpp>
+#include <lib/binutils/intel/CreateCFG.hpp>
 #include <lib/support/FileUtil.hpp>
 #include <lib/support/StringTable.hpp>
 #include <lib/support/dictionary.h>
@@ -179,6 +182,7 @@ StmtLessThan(StmtInfo * s1, StmtInfo * s2)
   return s1->vma < s2->vma;
 }
 
+
 //----------------------------------------------------------------------
 
 // DOCTYPE header and <HPCToolkitStructure> tag.
@@ -249,6 +253,43 @@ printLoadModuleEnd(ostream * os)
   *os << "</LM>\n";
 }
 
+//----------------------------------------------------------------------
+void
+printBlockAndInstructionOffset(ostream * os, string file_name)
+{
+	// Writes the block/instruction offset tags of every ".text" section of ELF file file_name to *os.
+	int file_fd = open(file_name.c_str(), O_RDONLY);
+	if (file_fd < 0) return; // cannot open the load module: emit nothing
+	size_t f_size = file_size(file_fd);
+	char  *file_buffer = (char *) malloc(f_size);
+	size_t bytes = file_buffer ? read_all(file_fd, file_buffer, f_size) : 0;
+	close(file_fd); // the image now lives in file_buffer; do not leak the descriptor
+	if (f_size == 0 || bytes != f_size) { free(file_buffer); return; } // empty file or short read
+	ElfFile *elfFile = new ElfFile; // NOTE(review): never released here -- confirm ElfFile/buffer ownership
+	if (!elfFile->open(file_buffer, f_size, file_name)) return; // not a usable ELF image
+	Elf *elf = elfFile->getElf();
+	file_buffer = elfFile->getMemory();
+	ElfSectionVector *sections = elfGetSectionVector(elf);
+	GElf_Ehdr ehdr_v;
+	GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
+	if (ehdr) {
+		for (auto si = sections->begin(); si != sections->end(); si++) {
+			Elf_Scn *scn = *si;
+			GElf_Shdr shdr_v;
+			GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
+			if (!shdr) continue;
+			char *sectionData = elfSectionGetData(file_buffer, shdr);
+			const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+			if (section_name && strcmp(section_name, ".text") == 0) { // elf_strptr may return NULL
+				std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
+						reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
+				*os << getBlockAndInstructionOffsets(intelRawGenBinary);
+			}
+		}
+	}
+}
+
+
 //----------------------------------------------------------------------
 
 // Begin <F> file tag.
diff --git a/src/lib/banal/Struct-Output.hpp b/src/lib/banal/Struct-Output.hpp
index 2f354d7628..c6602e66cc 100644
--- a/src/lib/banal/Struct-Output.hpp
+++ b/src/lib/banal/Struct-Output.hpp
@@ -78,6 +78,43 @@ void printFileEnd(ostream *, FileInfo *);
 void printProc(ostream *, ostream *, string, FileInfo *, GroupInfo *,
 	       ProcInfo *, HPC::StringTable & strTab);
 
+void printBlockAndInstructionOffset(ostream * os, string file_name);
+
+static size_t
+file_size(int fd)
+{
+  // Size in bytes of the regular file behind fd; 0 if fstat fails or fd is not a regular file.
+  struct stat st;
+  if (fstat(fd, &st) != 0 || !S_ISREG(st.st_mode)) {
+    return 0;
+  }
+  return st.st_size;
+}
+
+
+// Automatically restart short reads.
+// This protects against EINTR.
+//
+static size_t
+read_all(int fd, void *buf, size_t count)
+{
+  // Returns the byte count actually read: short only on end-of-file or a non-EINTR error.
+  char *dst = (char *) buf;
+  size_t total = 0;
+
+  while (total < count) {
+    ssize_t got = read(fd, dst + total, count - total);
+    if (got > 0) {
+      total += got;
+      continue;
+    }
+    if (got == 0 || errno != EINTR) {
+      break;
+    }
+  }
+  return total;
+}
+
 }  // namespace Output
 }  // namespace BAnal
 
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index bfce07ecd4..3b8691ab01 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -567,6 +567,20 @@ getFileNameFromAbsolutePath(string str)
 }
 
 
+static bool
+isIntelGPUFile
+(
+	std::string inputFileType
+)
+{
+	// True only when the file-type tag is exactly "IntelGPU".
+	static const char intelGPUType[] = "IntelGPU";
+
+	const bool matches = (inputFileType == intelGPUType);
+	return matches;
+}
+
+
 //
 // makeStructure -- the main entry point for hpcstruct realmain().
 //
@@ -595,10 +609,10 @@ makeStructure(string filename,
 #endif
 
   InputFile inputFile;
+	std::string inputFileType;
 
   // failure throws an error up the call chain
-  inputFile.openFile(filename, InputFileError_Error);
-	
+  inputFile.openFile(filename, InputFileError_Error, &inputFileType);
   ElfFileVector * elfFileVector = inputFile.fileVector();
   string & sfilename = inputFile.fileName();
   const char * cfilename = inputFile.CfileName();
@@ -661,10 +675,9 @@ makeStructure(string filename,
     omp_set_num_threads(opts.jobs_parse);
 #endif
 
-		bool isIntelArch = true;
+		bool isIntelArch = isIntelGPUFile(inputFileType);
 		bool cfgNotPresent = true;
 		if (isIntelArch && cfgNotPresent) {
-			//std::cerr << "executing intel-gen9 specific code." << std::endl;
 			add_custom_function_object(symtab, getFileNameFromAbsolutePath(elfFile->getFileName())); //adds a dummy function object
 			code_src = new SymtabCodeSource(symtab);
 		  code_obj = new CodeObject(code_src, NULL, NULL, false, true); //last param is bool ignoreParse
@@ -709,11 +722,14 @@ makeStructure(string filename,
     mutex output_mtx;
 
     makeWorkList(fileMap, wlPrint, wlLaunch);
-		
-		std::cerr << elfFile->getArch() << std::endl;
-		char *elfFileRealPath = realpath(elfFile->getFileName().c_str(), NULL);
-		std::cerr << elfFileRealPath << std::endl;
-    Output::printLoadModuleBegin(outFile, elfFileRealPath);
+	
+		char *elfFileRealPath;
+		if (isIntelArch) {
+			elfFileRealPath = realpath(elfFile->getFileName().c_str(), NULL);
+			Output::printLoadModuleBegin(outFile, elfFileRealPath);
+		} else {
+			Output::printLoadModuleBegin(outFile, elfFile->getFileName());
+		}
 
 #pragma omp parallel  default(none)				\
     shared(wlPrint, wlLaunch, num_done, output_mtx)		\
@@ -734,6 +750,11 @@ makeStructure(string filename,
     // with try_lock(), there are interleavings where not all items
     // have been printed.
     printWorkList(wlPrint, num_done, outFile, gapsFile, gaps_filenm);
+	
+		// custom code for intel GPU elfs
+		if (isIntelArch) {
+    	Output::printBlockAndInstructionOffset(outFile, elfFileRealPath);
+		}
 
     Output::printLoadModuleEnd(outFile);
 
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index 9a25e84662..311ccd716d 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -130,8 +130,6 @@ isIntelGPUFile
   GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
 
 	int intelGPUType = 0xff04;
-	std::cerr << "ehdr->e_type: " << ehdr->e_type << std::endl;
-	std::cerr << "ehdr->e_type == intelGPUType: " << (ehdr->e_type == intelGPUType) << std::endl;
   if (ehdr && ehdr->e_type == intelGPUType) {
 		return true;
 	}
@@ -148,7 +146,8 @@ bool
 InputFile::openFile
 (
  std::string &filename,
- InputFileErrorType_t errType
+ InputFileErrorType_t errType,
+ std::string *fileType
 )
 {
   const char *tag = 
@@ -206,6 +205,9 @@ InputFile::openFile
   if (result) {
     filevector = new ElfFileVector;
 		if (isIntelGPUFile(elfFile)) {
+			if (fileType != NULL) {
+				*fileType = "IntelGPU";
+			}
 			findIntelGPUbins(elfFile, filevector);
 		}
 		else {
diff --git a/src/lib/binutils/InputFile.hpp b/src/lib/binutils/InputFile.hpp
index cc3cc9d8f9..4814553820 100644
--- a/src/lib/binutils/InputFile.hpp
+++ b/src/lib/binutils/InputFile.hpp
@@ -89,7 +89,7 @@ class InputFile {
 public:
   InputFile() { filevector = 0; }
   ~InputFile();
-  bool openFile(std::string &filename, InputFileErrorType_t errType);
+  bool openFile(std::string &filename, InputFileErrorType_t errType, std::string *fileType);
 
   std::string &fileName() { return filename; }
   const char *CfileName() { return filename.c_str(); }
diff --git a/src/lib/binutils/LM.cpp b/src/lib/binutils/LM.cpp
index bb200771d9..402851f1b2 100644
--- a/src/lib/binutils/LM.cpp
+++ b/src/lib/binutils/LM.cpp
@@ -415,7 +415,7 @@ BinUtil::LM::open(const char* filenm)
 
   std::string file_name = std::string(filenm);
 
-  if (input_file.openFile(file_name, InputFileError_WarningNothrow)) {
+  if (input_file.openFile(file_name, InputFileError_WarningNothrow, NULL)) {
     // We only relocate individual cubins, with filevector size 1
     ElfFile *elf_file = (*input_file.fileVector())[0];
     if (isCubin(elf_file->getElf())) {
diff --git a/src/lib/binutils/intel/CreateCFG.cpp b/src/lib/binutils/intel/CreateCFG.cpp
index c4f6ffb97d..810408c119 100644
--- a/src/lib/binutils/intel/CreateCFG.cpp
+++ b/src/lib/binutils/intel/CreateCFG.cpp
@@ -71,8 +71,8 @@
 //******************************************************************************
 
 #define MAX_STR_SIZE 1024
+#define INDENT  "  "
 
-std::vector<int32_t> block_offsets;
 std::map<int32_t, bool> visitedBlockOffsets;
 
 
@@ -84,14 +84,15 @@ std::map<int32_t, bool> visitedBlockOffsets;
 static std::set<Edge>
 get_cfg_edges
 (
-	std::vector<uint8_t> binary,
-	size_t binary_size
+	std::vector<uint8_t> binary
 )
 {
 	KernelView kv(IGA_GEN9, binary.data(), binary.size(),
 			iga::SWSB_ENCODE_MODE::SingleDistPipe);
+	size_t binary_size = binary.size();
 	std::set<Edge> cfg_edges;
 
+	std::vector<int32_t> block_offsets;
 	int32_t offset = 0;
 	int32_t size;
 	while (offset < binary_size) {
@@ -198,6 +199,39 @@ printBasicBlocks
 }
 
 
+static std::vector<int32_t>
+getBlockOffsets
+(
+	std::vector<uint8_t> binary
+)
+{
+	// Collects every instruction offset for which kv.isInstTarget() is true (used as block starts).
+	std::vector<int32_t> block_offsets;
+	const int32_t binary_size = static_cast<int32_t>(binary.size());
+	KernelView kv(IGA_GEN9, binary.data(), binary.size(),
+			iga::SWSB_ENCODE_MODE::SingleDistPipe);
+	int32_t offset = 0;
+	while (offset < binary_size) {
+		if (kv.isInstTarget(offset)) {
+			block_offsets.push_back(offset);
+		}
+		int32_t size = kv.getInstSize(offset);
+		if (size <= 0) break; // no forward progress possible: stop instead of looping forever
+		offset += size;
+	}
+	return block_offsets;
+}
+
+
+static void
+doIndent(std::stringstream *ss, int depth)
+{
+  // Appends depth copies of INDENT to *ss (nothing when depth <= 0).
+  int remaining = depth;
+  while (remaining-- > 0) *ss << INDENT;
+}
+
+
 
 //******************************************************************************
 // interface operations
@@ -206,10 +240,45 @@ printBasicBlocks
 // pass Intel kernel's raw gen binary
 // kernel's text region is a raw gen binary
 // you  can find kernel nested in [debug section of GPU binary/separate debug section dump]
-void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary) {
+void
+printCFGInDotGraph
+(
+	std::vector<uint8_t> intelRawGenBinary
+)
+{
 	std::cout << "digraph GEMM_iga {" << std::endl;
-	std::set<Edge> edges = get_cfg_edges(intelRawGenBinary, intelRawGenBinary.size());
+	std::set<Edge> edges = get_cfg_edges(intelRawGenBinary);
 	printBasicBlocks(intelRawGenBinary, edges);
 	printCFGEdges(edges);
 	std::cout << "}" << std::endl;
 }
+
+
+std::string
+getBlockAndInstructionOffsets
+(
+ std::vector<uint8_t> intelRawGenBinary
+)
+{
+	// Renders a <B> element (with its <I> instruction offsets) for every block except the last one.
+	std::stringstream ss;
+	std::vector<int32_t> block_offsets = getBlockOffsets(intelRawGenBinary);
+	KernelView kv(IGA_GEN9, intelRawGenBinary.data(), intelRawGenBinary.size(),
+			iga::SWSB_ENCODE_MODE::SingleDistPipe);
+	for (size_t i = 0; i + 1 < block_offsets.size(); i++) { // "size()-1" wrapped around when empty
+		int32_t offset = block_offsets[i];
+		doIndent(&ss, 1);
+		ss << "<B o=\"0x" << std::hex << offset << "\">\n"; 
+		doIndent(&ss, 2);
+		while (offset < block_offsets[i+1]) {
+			ss << "<I o=\"0x" << std::hex << offset << "\"/>";
+			int32_t size = kv.getInstSize(offset);
+			if (size <= 0) break; // no forward progress possible: avoid an endless loop
+			offset += size;
+		}
+		ss << "\n"; 
+		doIndent(&ss, 1);
+		ss << "</B>\n"; 
+	}
+	return ss.str();
+}
diff --git a/src/lib/binutils/intel/CreateCFG.hpp b/src/lib/binutils/intel/CreateCFG.hpp
index 4c478a672b..90820920d0 100644
--- a/src/lib/binutils/intel/CreateCFG.hpp
+++ b/src/lib/binutils/intel/CreateCFG.hpp
@@ -89,6 +89,17 @@ class Edge {
 // interface operations
 //******************************************************************************
 
-void printCFGInDotGraph(std::vector<uint8_t> intelRawGenBinary);
+void
+printCFGInDotGraph
+(
+	std::vector<uint8_t> intelRawGenBinary
+);
+
+
+std::string
+getBlockAndInstructionOffsets
+(
+ std::vector<uint8_t> intelRawGenBinary
+);
 
 #endif
diff --git a/src/lib/binutils/intel/IntelGPUbinutils.cpp b/src/lib/binutils/intel/IntelGPUbinutils.cpp
index d947475661..229f3c2a9e 100644
--- a/src/lib/binutils/intel/IntelGPUbinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUbinutils.cpp
@@ -230,7 +230,7 @@ extract_kernelelfs
 					if (strcmp(section_name, ".text") == 0) {
 						std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
 								reinterpret_cast<uint8_t*>(sectionData) + kernel_header->SizeVisaDbgInBytes);
-						printCFGInDotGraph(intelRawGenBinary);
+						//printCFGInDotGraph(intelRawGenBinary);
 					}
 				}
 			}
@@ -304,7 +304,6 @@ findIntelGPUbins
 		fileHasDebugSection = true;
 		fseek(fptr, 0L, SEEK_END);
 		size_t debug_info_size = ftell(fptr);
-		printf("debug_info_size: %zu\n", debug_info_size);
 		rewind(fptr);
 		std::vector<uint8_t> debug_info(debug_info_size);
 		fread(debug_info.data(), debug_info_size, 1, fptr);

From 74b4b838a04fd352e5fabeddd5d7a025d82acb12 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris08.ftm.alcf.anl.gov>
Date: Wed, 23 Sep 2020 05:03:14 +0000
Subject: [PATCH 035/177] replacing gotcha intercepts with function overrides
 for opencl functions

---
 .../instrumentation/opencl-instrumentation.c  |   2 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 466 +++++++++++++++++-
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |   7 +
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 444 +----------------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |   7 -
 5 files changed, 468 insertions(+), 458 deletions(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index c4c32a309d..0b136c56f2 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -68,7 +68,7 @@
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
 #include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/utilities/hpcrun-nanotime.h>
-#include <hpcrun/gpu/opencl/opencl-intercept.h>
+#include <hpcrun/gpu/opencl/opencl-api.h>
 #include "opencl-instrumentation.h"
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 68980eac33..f6a243c2d2 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -153,6 +153,12 @@
 #define opencl_path() "libOpenCL.so"
 
 #define FORALL_OPENCL_ROUTINES(macro)					\
+  macro(clBuildProgram)					\
+  macro(clCreateProgramWithSource)					\
+  macro(clCreateCommandQueue)					\
+  macro(clEnqueueNDRangeKernel)					\
+  macro(clEnqueueReadBuffer)					\
+  macro(clEnqueueWriteBuffer)					\
   macro(clGetEventProfilingInfo)					\
   macro(clReleaseEvent)							\
   macro(clSetEventCallback)
@@ -162,15 +168,18 @@
 #define OPENCL_FN(fn, args)			\
   static cl_int (*OPENCL_FN_NAME(fn)) args
 
-#define HPCRUN_OPENCL_CALL(fn, args)					\
-  {									\
+#define HPCRUN_OPENCL_CALL(fn, args) (OPENCL_FN_NAME(fn) args)
+
+/*
+#define HPCRUN_OPENCL_CALL(fn, args)								\
+  {																									\
     cl_int status = OPENCL_FN_NAME(fn) args;				\
-    if (status != CL_SUCCESS) {						\
+    if (status != CL_SUCCESS) {											\
       ETMSG(OPENCL, "opencl call failed: %s",				\
-	    opencl_error_report(status));				\
-    }									\
+	    opencl_error_report(status));									\
+    }																								\
   }
-
+*/
 
 
 //******************************************************************************
@@ -181,6 +190,96 @@
 // opencl function pointers for late binding
 //----------------------------------------------------------
 
+OPENCL_FN
+(
+  clBuildProgram, 
+  (
+	 cl_program program,
+	 cl_uint num_devices,
+	 const cl_device_id* device_list,
+	 const char* options,
+	 void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+	 void* user_data
+  )
+);
+
+
+OPENCL_FN
+(
+  clCreateProgramWithSource, 
+  (
+	 cl_context context,
+	 cl_uint count,
+	 const char** strings,
+	 const size_t* lengths,
+	 cl_int* errcode_ret
+  )
+);
+
+
+OPENCL_FN
+(
+  clCreateCommandQueue, 
+	(
+	 cl_context,
+	 cl_device_id,
+	 cl_command_queue_properties,
+	 cl_int*
+	)
+);
+
+
+OPENCL_FN
+(
+  clEnqueueNDRangeKernel, 
+  (
+	 cl_command_queue,
+	 cl_kernel,
+	 cl_uint,
+	 const size_t *, 
+	 const size_t *,
+	 const size_t *,
+	 cl_uint,
+	 const cl_event *,
+	 cl_event *
+  )
+);
+
+
+OPENCL_FN
+(
+  clEnqueueReadBuffer, 
+  (
+	 cl_command_queue,
+	 cl_mem,
+	 cl_bool,
+	 size_t,
+	 size_t,
+	 void *,
+	 cl_uint,
+	 const cl_event *,
+	 cl_event *
+  )
+);
+
+
+OPENCL_FN
+(
+  clEnqueueWriteBuffer, 
+  (
+	 cl_command_queue,
+	 cl_mem,
+	 cl_bool,
+	 size_t,
+	 size_t,
+	 const void *,
+	 cl_uint,
+	 const cl_event *,
+	 cl_event *
+  )
+);
+
+
 OPENCL_FN
 (
   clGetEventProfilingInfo,
@@ -216,8 +315,13 @@ OPENCL_FN
 );
 
 
-
 static atomic_ullong opencl_pending_operations;
+static char *debugInfoFullFileName;
+static atomic_long correlation_id;
+
+
+#define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
+#define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
 
 
@@ -225,6 +329,133 @@ static atomic_ullong opencl_pending_operations;
 // private operations
 //******************************************************************************
 
+static uint64_t
+getCorrelationId(void)
+{
+  // Returns the current value of the file-scope counter and
+  // increments it in the same atomic step.
+  uint64_t id = atomic_fetch_add(&correlation_id, 1);
+  return id;
+}
+
+
+static void
+setDebugInfoFullFileName
+(
+	char *fileName
+)
+{
+	// First caller wins: once a name has been recorded, later calls are ignored.
+	if (debugInfoFullFileName != NULL) return;
+	debugInfoFullFileName = fileName;	
+}
+
+
+static void
+initializeKernelCallBackInfo
+(
+  cl_kernel_callback_t *kernel_cb,
+  uint64_t correlation_id
+)
+{
+  kernel_cb->type = kernel; // tag the record as a kernel activity
+  kernel_cb->correlation_id = correlation_id; // id used to match this record later
+}
+
+
+static void
+initializeMemoryCallBackInfo
+(
+  cl_memory_callback_t *mem_transfer_cb,
+  uint64_t correlation_id,
+  size_t size,
+  bool fromHostToDevice
+)
+{
+  mem_transfer_cb->fromHostToDevice = fromHostToDevice; // direction flag ...
+  mem_transfer_cb->fromDeviceToHost = !fromHostToDevice; // ... and its mirror image
+  mem_transfer_cb->type = (fromHostToDevice) ? memcpy_H2D: memcpy_D2H; // same direction as a type tag
+  mem_transfer_cb->size = size; // size of the transfer as given by the caller
+  mem_transfer_cb->correlation_id = correlation_id;
+}
+
+static char*
+getKernelNameFromSourceCode
+(
+	const char *kernelSourceCode
+)
+{
+	// Returns the space-delimited token that follows the first "void" token, or NULL if there is none.
+	// strtok() writes into its input, so tokenize a copy sized for the whole string plus its NUL.
+	char *kernelCode_copy = (char*)hpcrun_malloc(strlen(kernelSourceCode) + 1);
+	strcpy(kernelCode_copy, kernelSourceCode);
+	for (char *token = strtok(kernelCode_copy, " "); token != NULL; token = strtok(NULL, " ")) {
+		if (strcmp(token, "void") == 0) { // not searching for kernel because "supported\n#endif\nkernel"
+			token = strtok(NULL, " "); // NULL when "void" was the last token
+			if (token != NULL) printf("kernel name: %s", token);
+			return token;
+		}
+	}
+	return NULL;
+}
+
+
+// we are dumping the debuginfo temporarily since the binary does not have debugsection
+// poorly written code: FIXME
+static char*
+dumpIntelGPUBinary(cl_program program) {
+	// Dumps the first device's program binary and Intel debug info into files in the
+	// current directory; returns realpath() of the debug-info file, or NULL on failure.
+	int device_count = 1;
+	cl_int status = CL_SUCCESS;
+	size_t *binary_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
+	status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t), binary_size, NULL);
+	assert(status == CL_SUCCESS);
+	uint8_t **binary = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
+	for (size_t i = 0; i < device_count; ++i) {
+		binary[i] = (uint8_t*)hpcrun_malloc(binary_size[i] * sizeof(uint8_t));
+	}
+	status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(uint8_t*), binary, NULL);
+	assert(status == CL_SUCCESS);
+
+	FILE *bin_ptr = fopen("opencl_main.gpubin", "wb");
+	if (bin_ptr != NULL) {
+		fwrite(binary[0], binary_size[0], 1, bin_ptr);
+		fclose(bin_ptr); // this stream used to be overwritten without ever being closed
+	}
+
+  // SECOND
+	size_t *debug_info_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
+	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL,	sizeof(size_t), debug_info_size, NULL);
+	assert(status == CL_SUCCESS);
+	uint8_t **debug_info = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
+	for (size_t i = 0; i < device_count; ++i) {
+		debug_info[i] = (uint8_t*)hpcrun_malloc(debug_info_size[i] * sizeof(uint8_t));
+	}
+	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(uint8_t*), debug_info, NULL);
+	assert(status == CL_SUCCESS);
+	const char *debuginfoFileName = "opencl_main.debuginfo";
+	bin_ptr = fopen(debuginfoFileName, "wb");
+	if (bin_ptr == NULL) return NULL; // the dump file could not be created
+	fwrite(debug_info[0], debug_info_size[0], 1, bin_ptr);
+	fclose(bin_ptr);
+  ETMSG(OPENCL, "Intel GPU files dumped successfully");
+	return realpath(debuginfoFileName, NULL);
+}
+
+
+static void
+clBuildProgramCallback
+(
+	cl_program program,
+	void* user_data
+)
+{
+	char *dumpedPath = dumpIntelGPUBinary(program); // local renamed: it used to shadow the file-scope variable
+	setDebugInfoFullFileName(dumpedPath); // recorded only if no name was stored before
+}
+
+
 static void
 opencl_pending_operations_adjust
 (
@@ -302,6 +533,16 @@ opencl_error_report
 // interface operations
 //******************************************************************************
 
+char*
+getDebugInfoFullFileName(void)
+{
+	// Returns the file-scope debug-info path as it currently stands;
+	// the pointer is handed out as-is, no copy is made.
+	char *recorded = debugInfoFullFileName;
+	return recorded;
+}
+
+
 void
 opencl_subscriber_callback
 (
@@ -431,6 +672,7 @@ opencl_api_initialize
 )
 {
   opencl_intercept_setup();
+  atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);
 }
 
@@ -461,6 +703,216 @@ opencl_bind
 }
 
 
+cl_program
+clCreateProgramWithSource
+(
+ cl_context context,
+ cl_uint count,
+ const char** strings,
+ const size_t* lengths,
+ cl_int* errcode_ret
+)
+{
+	ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
+	// Dump every source string to a file named by its 1-based index ("1", "2", ...), then forward the call.
+	for (cl_uint i = 0; i < count; i++) {
+		if (strings == NULL || strings[i] == NULL) continue; // invalid input: let the real call report it
+		char filename[16];
+		snprintf(filename, sizeof filename, "%u", (unsigned)(i + 1));
+		FILE *f_ptr = fopen(filename, "w");
+		if (f_ptr == NULL) continue; // best effort: a failed dump must not block program creation
+		// a NULL lengths array or a zero entry means strings[i] is NUL-terminated (OpenCL spec)
+		size_t len = (lengths != NULL && lengths[i] != 0) ? lengths[i] : strlen(strings[i]);
+		fwrite(strings[i], len, 1, f_ptr);
+		fclose(f_ptr); // close each file; previously only the last one was closed
+	}
+	// the bound slot is declared as returning cl_int; call through the real signature so the handle is not truncated
+	typedef cl_program (*create_program_fn_t)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
+	return ((create_program_fn_t) OPENCL_FN_NAME(clCreateProgramWithSource))(context, count, strings, lengths, errcode_ret);
+}
+
+
+// one downside of this approach is that we override any callback provided by the user
+cl_int
+clBuildProgram
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id* device_list,
+ const char* options,
+ void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+ void* user_data
+)
+{
+  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
+	// Prepend the debug flag in a buffer sized for flag + caller options (strcat overflowed the fixed array).
+	static const char debugFlag[] = " -gline-tables-only ";
+	size_t len = sizeof(debugFlag) + (options != NULL ? strlen(options) : 0);
+	char *optionsWithDebugFlag = (char*)hpcrun_malloc(len);
+	snprintf(optionsWithDebugFlag, len, "%s%s", debugFlag, options != NULL ? options : "");
+  return HPCRUN_OPENCL_CALL(clBuildProgram, (program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data));
+}
+
+
+cl_command_queue
+clCreateCommandQueue
+(
+  cl_context context,
+  cl_device_id device,
+  cl_command_queue_properties properties,
+  cl_int *errcode_ret
+)
+{
+  // enabling profiling
+  properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE; 
+  // the bound slot is declared as returning cl_int; call through the real signature so the handle is not truncated
+  typedef cl_command_queue (*create_queue_fn_t)(cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
+  return ((create_queue_fn_t) OPENCL_FN_NAME(clCreateCommandQueue))(context, device, properties, errcode_ret);
+}
+
+
+cl_int
+clEnqueueNDRangeKernel
+(
+  cl_command_queue command_queue,
+  cl_kernel ocl_kernel,
+  cl_uint work_dim,
+  const size_t *global_work_offset, 
+  const size_t *global_work_size,
+  const size_t *local_work_size,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event
+)
+{
+  // Forwards the launch to the bound OpenCL routine, then registers
+  // opencl_activity_completion_callback (CL_COMPLETE) on the launch's event.
+  uint64_t correlation_id = getCorrelationId();
+  opencl_object_t *kernel_info = opencl_malloc();
+  kernel_info->kind = OPENCL_KERNEL_CALLBACK;
+  cl_kernel_callback_t *kernel_cb = &(kernel_info->details.ker_cb);
+  initializeKernelCallBackInfo(kernel_cb, correlation_id);
+  // when the caller supplies no event, use a private one to hang the callback on
+  cl_event my_event;
+  cl_event *eventp = event ? event : &my_event;
+  kernel_info->isInternalClEvent = (event == NULL);
+  cl_int return_status = 
+    HPCRUN_OPENCL_CALL(clEnqueueNDRangeKernel, (command_queue, ocl_kernel, work_dim, 
+				   global_work_offset, global_work_size, 
+				   local_work_size, num_events_in_wait_list, 
+				   event_wait_list, eventp));
+  if (return_status != CL_SUCCESS) {
+    // nothing was enqueued: *eventp holds no valid event, so there is nothing to register on
+    // NOTE(review): kernel_info is not released on this path -- confirm the matching free routine
+    return return_status;
+  }
+  ETMSG(OPENCL, "registering callback for type: kernel. " 
+	"Correlation id: %"PRIu64 "", correlation_id);
+  opencl_subscriber_callback(kernel_cb->type, kernel_cb->correlation_id);
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
+			     &opencl_activity_completion_callback, kernel_info);
+  return return_status;
+}
+
+
+cl_int
+clEnqueueReadBuffer
+(
+  cl_command_queue command_queue,
+  cl_mem buffer,
+  cl_bool blocking_read,
+  size_t offset,
+  size_t cb,
+  void *ptr,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event
+)
+{
+  // Forwards the device-to-host read to the bound OpenCL routine, then registers
+  // opencl_activity_completion_callback (CL_COMPLETE) on the read's event.
+  uint64_t correlation_id = getCorrelationId();
+  opencl_object_t *mem_info = opencl_malloc();
+  mem_info->kind = OPENCL_MEMORY_CALLBACK;
+  cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
+  initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, false);
+
+  // when the caller supplies no event, use a private one to hang the callback on
+  cl_event my_event;
+  cl_event *eventp = event ? event : &my_event;
+  mem_info->isInternalClEvent = (event == NULL);
+  cl_int return_status = 
+    HPCRUN_OPENCL_CALL(clEnqueueReadBuffer, (command_queue, buffer, blocking_read, offset, 
+				cb, ptr, num_events_in_wait_list, 
+				event_wait_list, eventp));
+  if (return_status != CL_SUCCESS) {
+    // nothing was enqueued: *eventp holds no valid event, so there is nothing to register on
+    return return_status;
+  }
+
+  ETMSG(OPENCL, "registering callback for type: D2H. " 
+	"Correlation id: %"PRIu64 "", correlation_id);
+  ETMSG(OPENCL, "%ld(bytes) of data being transferred from device to host", 
+	(long)cb);
+
+  opencl_subscriber_callback(mem_transfer_cb->type, 
+			     mem_transfer_cb->correlation_id);
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
+			     &opencl_activity_completion_callback, mem_info);
+  return return_status;
+}
+
+
+cl_int
+clEnqueueWriteBuffer
+(
+  cl_command_queue command_queue,
+  cl_mem buffer,
+  cl_bool blocking_write,
+  size_t offset,
+  size_t cb,
+  const void *ptr,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event
+)
+{
+  // Forwards the host-to-device write to the bound OpenCL routine, then registers
+  // opencl_activity_completion_callback (CL_COMPLETE) on the write's event.
+  uint64_t correlation_id = getCorrelationId();
+  opencl_object_t *mem_info = opencl_malloc();
+  mem_info->kind = OPENCL_MEMORY_CALLBACK;
+  cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
+  initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, true);
+
+  // when the caller supplies no event, use a private one to hang the callback on
+  cl_event my_event;
+  cl_event *eventp = event ? event : &my_event;
+  mem_info->isInternalClEvent = (event == NULL);
+  cl_int return_status = 
+    HPCRUN_OPENCL_CALL(clEnqueueWriteBuffer, (command_queue, buffer, blocking_write, offset,
+				 cb, ptr, num_events_in_wait_list, 
+				 event_wait_list, eventp));
+  if (return_status != CL_SUCCESS) {
+    // nothing was enqueued: *eventp holds no valid event, so there is nothing to register on
+    return return_status;
+  }
+
+  ETMSG(OPENCL, "registering callback for type: H2D. " 
+	"Correlation id: %"PRIu64 "", correlation_id);
+  ETMSG(OPENCL, "%ld(bytes) of data being transferred from host to device", 
+	(long)cb);
+
+  opencl_subscriber_callback(mem_transfer_cb->type, 
+			     mem_transfer_cb->correlation_id);
+
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
+			     &opencl_activity_completion_callback, 
+			     (void*) mem_info);
+  return return_status;
+}
+
+
 void
 opencl_api_finalize
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index d4ef7b8127..fa2d373f12 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -61,6 +61,13 @@
 // interface operations
 //******************************************************************************
 
+char*
+getDebugInfoFullFileName
+(
+	void
+);
+
+
 void
 opencl_subscriber_callback
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 44901c4302..4fcb7a1888 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -57,6 +57,7 @@
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/messages/messages.h>
+#include <hpcrun/sample-sources/libdl.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
 #include <lib/prof-lean/stdatomic.h>
@@ -67,447 +68,10 @@
 
 
 
-//******************************************************************************
-// local data
-//******************************************************************************
-
-#ifndef HPCRUN_STATIC_LINK
-static gotcha_wrappee_handle_t clBuildProgram_handle;
-static gotcha_wrappee_handle_t clCreateProgramWithSource_handle;
-static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
-static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
-static gotcha_wrappee_handle_t clEnqueueReadBuffer_handle;
-static gotcha_wrappee_handle_t clEnqueueWriteBuffer_handle;
-static atomic_long correlation_id;
-static char *debugInfoFullFileName;
-
-
-#define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
-#define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static void
-opencl_intercept_initialize
-(
-  void
-)
-{
-  atomic_store(&correlation_id, 0);
-}
-
-
-static uint64_t
-getCorrelationId
-(
-  void
-)
-{
-  return atomic_fetch_add(&correlation_id, 1);
-}
-
-
-static void
-setDebugInfoFullFileName
-(
-	char *fileName
-)
-{
-	if (debugInfoFullFileName == NULL) {
-		debugInfoFullFileName = fileName;	
-	}
-}
-
-
-static void
-initializeKernelCallBackInfo
-(
-  cl_kernel_callback_t *kernel_cb,
-  uint64_t correlation_id
-)
-{
-  kernel_cb->correlation_id = correlation_id;
-  kernel_cb->type = kernel; 
-}
-
-
-static void
-initializeMemoryCallBackInfo
-(
-  cl_memory_callback_t *mem_transfer_cb,
-  uint64_t correlation_id,
-  size_t size,
-  bool fromHostToDevice
-)
-{
-  mem_transfer_cb->correlation_id = correlation_id;
-  mem_transfer_cb->type = (fromHostToDevice) ? memcpy_H2D: memcpy_D2H; 
-  mem_transfer_cb->size = size;
-  mem_transfer_cb->fromHostToDevice = fromHostToDevice;
-  mem_transfer_cb->fromDeviceToHost = !fromHostToDevice;
-}
-
-static char*
-getKernelNameFromSourceCode
-(
-	const char *kernelSourceCode
-)
-{
-	char *kernelCode_copy = (char*)hpcrun_malloc(sizeof(kernelSourceCode));
-	strcpy(kernelCode_copy, kernelSourceCode);
-	char *token = strtok(kernelCode_copy, " ");
-	while (token != NULL) {
-		if (strcmp(token, "void") == 0) { // not searching for kernel because "supported\n#endif\nkernel"
-			token = strtok(NULL, " ");
-			printf("kernel name: %s", token);
-			return token;
-		}
-		token = strtok(NULL, " ");
-	}
-	return NULL;
-}
-
-
-static cl_program
-clCreateProgramWithSource_wrapper
-(
- cl_context context,
- cl_uint count,
- const char** strings,
- const size_t* lengths,
- cl_int* errcode_ret
-)
-{
-	ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
-
-	FILE *f_ptr;
-	for (int i = 0; i < (int)count; i++) {
-		// what if a single file has multiple kernels?
-		// we need to add logic to get filenames by reading the strings contents
-		char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
-		// using malloc instead of hpcrun_malloc gives extra garbage characters in file name
-		char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
-		*filename = fileno + '\0';
-		f_ptr = fopen(filename, "w");
-		fwrite(strings[i], lengths[i], 1, f_ptr);
-	}
-	fclose(f_ptr);
-	
-	clcreateprogramwithsource_t clCreateProgramWithSource_wrappee =
-		GOTCHA_GET_TYPED_WRAPPEE(clCreateProgramWithSource_handle, clcreateprogramwithsource_t);
-	return clCreateProgramWithSource_wrappee(context, count, strings, lengths, errcode_ret);
-}
-
-
-// we are dumping the debuginfo temporarily since the binary does not have debugsection
-// poorly written code: FIXME
-static char*
-dumpIntelGPUBinary(cl_program program) {
-	int device_count = 1;
-	cl_int status = CL_SUCCESS;
-	size_t *binary_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
-
-	status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t), binary_size, NULL);
-	assert(status == CL_SUCCESS);
-	uint8_t **binary = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
-	for (size_t i = 0; i < device_count; ++i) {
-		binary[i] = (uint8_t*)hpcrun_malloc(binary_size[i] * sizeof(uint8_t));
-	}
-
-	status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(uint8_t*), binary, NULL);
-	assert(status == CL_SUCCESS);
-
-	FILE *bin_ptr;
-	bin_ptr = fopen("opencl_main.gpubin", "wb");
-	fwrite(binary[0], binary_size[0], 1, bin_ptr);
-
-  // SECOND
-	size_t *debug_info_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
-
-	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL,	sizeof(size_t), debug_info_size, NULL);
-	assert(status == CL_SUCCESS);
-	uint8_t **debug_info = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
-	for (size_t i = 0; i < device_count; ++i) {
-		debug_info[i] = (uint8_t*)hpcrun_malloc(debug_info_size[i] * sizeof(uint8_t));
-	}
-
-	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(uint8_t*), debug_info, NULL);
-	assert(status == CL_SUCCESS);
-
-	char *debuginfoFileName = "opencl_main.debuginfo";
-	bin_ptr = fopen(debuginfoFileName, "wb");
-	fwrite(debug_info[0], debug_info_size[0], 1, bin_ptr);
-	fclose(bin_ptr);
-  ETMSG(OPENCL, "Intel GPU files dumped successfully");
-	return realpath(debuginfoFileName, NULL);
-}
-
-
-static void
-clBuildProgramCallback
-(
-	cl_program program,
-	void* user_data
-)
-{
-	char* debugInfoFullFileName = dumpIntelGPUBinary(program);
-	setDebugInfoFullFileName(debugInfoFullFileName);
-}
-
-
-// one downside of this appproach is that we may override the callback provided by user
-static cl_int
-clBuildProgram_wrapper
-(
- cl_program program,
- cl_uint num_devices,
- const cl_device_id* device_list,
- const char* options,
- void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
- void* user_data
-)
-{
-  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
-  clbuildprogram_t clBuildProgram_wrappee = 
-    GOTCHA_GET_TYPED_WRAPPEE(clBuildProgram_handle, clbuildprogram_t);
-
-	char optionsWithDebugFlag[] = " -gline-tables-only ";
-	if (options != NULL) {
-		strcat(optionsWithDebugFlag, options);
-	}
-  return clBuildProgram_wrappee(program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data);
-}
-
-
-static cl_command_queue
-clCreateCommandQueue_wrapper
-(
-  cl_context context,
-  cl_device_id device,
-  cl_command_queue_properties properties,
-  cl_int *errcode_ret
-)
-{
-  // enabling profiling
-  properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE; 
-
-  clqueue_t clCreateCommandQueue_wrappee = 
-    GOTCHA_GET_TYPED_WRAPPEE(clCreateCommandQueue_handle, clqueue_t);
-
-  return clCreateCommandQueue_wrappee(context, device, properties, errcode_ret);
-}
-
-
-static cl_int
-clEnqueueNDRangeKernel_wrapper
-(
-  cl_command_queue command_queue,
-  cl_kernel ocl_kernel,
-  cl_uint work_dim,
-  const size_t *global_work_offset, 
-  const size_t *global_work_size,
-  const size_t *local_work_size,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
-)
-{
-  uint64_t correlation_id = getCorrelationId();
-  opencl_object_t *kernel_info = opencl_malloc();
-  kernel_info->kind = OPENCL_KERNEL_CALLBACK;
-  cl_kernel_callback_t *kernel_cb = &(kernel_info->details.ker_cb);
-  initializeKernelCallBackInfo(kernel_cb, correlation_id);
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    kernel_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    kernel_info->isInternalClEvent = false;
-  }
-  clkernel_t clEnqueueNDRangeKernel_wrappee = 
-    GOTCHA_GET_TYPED_WRAPPEE(clEnqueueNDRangeKernel_handle, clkernel_t);
-  cl_int return_status = 
-    clEnqueueNDRangeKernel_wrappee(command_queue, ocl_kernel, work_dim, 
-				   global_work_offset, global_work_size, 
-				   local_work_size, num_events_in_wait_list, 
-				   event_wait_list, eventp);
-
-  ETMSG(OPENCL, "registering callback for type: kernel. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
-
-  opencl_subscriber_callback(kernel_cb->type, kernel_cb->correlation_id);
-  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, kernel_info);
-  return return_status;
-}
-
-
-static cl_int
-clEnqueueReadBuffer_wrapper
-(
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_read,
-  size_t offset,
-  size_t cb,
-  void *ptr,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
-)
-{
-  uint64_t correlation_id = getCorrelationId();
-  opencl_object_t *mem_info = opencl_malloc();
-  mem_info->kind = OPENCL_MEMORY_CALLBACK;
-  cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
-  initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, false);
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    mem_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    mem_info->isInternalClEvent = false;
-  }
-  clreadbuffer_t clEnqueueReadBuffer_wrappee = 
-    GOTCHA_GET_TYPED_WRAPPEE(clEnqueueReadBuffer_handle, clreadbuffer_t);
-  cl_int return_status = 
-    clEnqueueReadBuffer_wrappee(command_queue, buffer, blocking_read, offset, 
-				cb, ptr, num_events_in_wait_list, 
-				event_wait_list, eventp);
-
-  ETMSG(OPENCL, "registering callback for type: D2H. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
-  ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", 
-	(long)cb);
-
-  opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
-
-  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, mem_info);
-
-  return return_status;
-}
-
-
-static cl_int
-clEnqueueWriteBuffer_wrapper
-(
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_write,
-  size_t offset,
-  size_t cb,
-  const void *ptr,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
-)
-{
-  uint64_t correlation_id = getCorrelationId();
-  opencl_object_t *mem_info = opencl_malloc();
-  mem_info->kind = OPENCL_MEMORY_CALLBACK;
-  cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
-  initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, true);
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    mem_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    mem_info->isInternalClEvent = false;
-  }
-  clwritebuffer_t clEnqueueWriteBuffer_wrappee = 
-    GOTCHA_GET_TYPED_WRAPPEE(clEnqueueWriteBuffer_handle, clwritebuffer_t);
-  cl_int return_status = 
-    clEnqueueWriteBuffer_wrappee(command_queue, buffer, blocking_write, offset,
-				 cb, ptr, num_events_in_wait_list, 
-				 event_wait_list, eventp);
-
-  ETMSG(OPENCL, "registering callback for type: H2D. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
-
-  ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", 
-	(long)cb);
-
-  opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
-
-  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, 
-			     (void*) mem_info);
-
-  return return_status;
-}
-
-#endif
-
-
-
-//******************************************************************************
-// gotcha variables
-//******************************************************************************
-
-#ifndef HPCRUN_STATIC_LINK
-static gotcha_binding_t opencl_bindings[] = {
-  {
-    "clBuildProgram",
-    (void*) clBuildProgram_wrapper,
-    &clBuildProgram_handle
-  },
-  {
-    "clCreateProgramWithSource",
-    (void*) clCreateProgramWithSource_wrapper,
-    &clCreateProgramWithSource_handle
-  },
-  {
-    "clCreateCommandQueue",
-    (void*) clCreateCommandQueue_wrapper,
-    &clCreateCommandQueue_handle
-  },
-  {
-    "clEnqueueNDRangeKernel",
-    (void*)clEnqueueNDRangeKernel_wrapper,
-    &clEnqueueNDRangeKernel_handle
-  },
-  {
-    "clEnqueueReadBuffer",
-    (void*) clEnqueueReadBuffer_wrapper,
-    &clEnqueueReadBuffer_handle
-  },
-  {
-    "clEnqueueWriteBuffer",
-    (void*) clEnqueueWriteBuffer_wrapper,
-    &clEnqueueWriteBuffer_handle
-  }
-};
-#endif
-
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
-char*
-getDebugInfoFullFileName
-(
-	void
-)
-{
-	return debugInfoFullFileName;
-}
-
-
 void
 opencl_intercept_setup
 (
@@ -518,8 +82,6 @@ opencl_intercept_setup
   ETMSG(OPENCL, "setting up opencl intercepts");
 	gpu_metrics_KER_BLKINFO_enable();
   enableProfiling();
-  gotcha_wrap(opencl_bindings, 4, "opencl_bindings");
-  opencl_intercept_initialize();
 #endif
 }
 
@@ -530,8 +92,4 @@ opencl_intercept_teardown
   void
 )
 {
-#ifndef HPCRUN_STATIC_LINK
-  // not sure if this works
-  gotcha_set_priority("opencl_bindings", -1);
-#endif
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 30c6ec6d19..5bb8be1106 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -167,13 +167,6 @@ typedef struct cl_memory_callback_t {
 // interface operations
 //******************************************************************************
 
-char*
-getDebugInfoFullFileName
-(
-	void
-);
-
-
 void
 opencl_intercept_setup
 (

From f069a23258ce9464bf09721b55c8b4659ad03f29 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Wed, 23 Sep 2020 17:10:48 +0000
Subject: [PATCH 036/177] Use another dyninst instruction interface to
 construct gpu instructions which has instruction size information

---
 src/lib/banal/Struct.cpp            | 55 +++++++++++++++--------------
 src/lib/banal/gpu/DotCFG.hpp        | 13 ++++---
 src/lib/banal/gpu/GPUBlock.cpp      | 26 ++++++++------
 src/lib/banal/gpu/GPUBlock.hpp      |  5 +--
 src/lib/banal/gpu/GPUCFGFactory.cpp |  8 ++---
 src/lib/banal/gpu/ReadCudaCFG.cpp   |  7 ++--
 src/lib/banal/gpu/ReadCudaCFG.hpp   |  2 +-
 src/lib/banal/gpu/ReadIntelCFG.cpp  |  7 ----
 src/lib/banal/gpu/ReadIntelCFG.hpp  |  1 -
 9 files changed, 65 insertions(+), 59 deletions(-)

diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index ec44f93423..4686d4c3c8 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -170,11 +170,11 @@ static const string & unknown_link = UNKNOWN_LINK;
 // FIXME: temporary until the line map problems are resolved
 static Symtab * the_symtab = NULL;
 static int cuda_arch = 0;
+static int intel_gpu_arch = 0;
+// We relocate the symbols and line maps of cubins to 'original_offset+cubin_size'
+// to handle cases in which the relocated offsets conflict with the original information
 static size_t cubin_size = 0;
 
-// FIXME: temporary until instruction size problem is fixed
-static int intel_arch = 0;
-static std::map<int, int> inst_size;
 
 static BAnal::Struct::Options opts;
 
@@ -650,27 +650,26 @@ makeStructure(string filename,
     omp_set_num_threads(opts.jobs_parse);
 #endif
 
-    // TODO(Aaron): determine these variables
-		bool isIntelArch = true;
-		bool cfgNotPresent = true;
-		if (isIntelArch && cfgNotPresent) {
-			//std::cerr << "executing intel-gen9 specific code." << std::endl;
-      // TODO(Aaron): does instruction size change with different generations?
-      inst_size.clear();
-      intel_arch = 1;
-      parsable = readIntelCFG(search_path, elfFile, the_symtab, inst_size,
+    // TODO(Aaron): determine the variables
+		bool intel_file = true;
+
+    if (cuda_file) { // don't run parseapi on cuda binary
+      cuda_arch = elfFile->getArch();
+      cubin_size = elfFile->getLength();
+      parsable = readCudaCFG(search_path, elfFile, the_symtab, 
+			      structOpts.compute_gpu_cfg, &code_src, &code_obj);
+    } else if (intel_file) { // don't run parseapi on intel binary
+      // TODO(Aaron): determine which generation of intel gpu it is
+      intel_gpu_arch = 1;
+      parsable = readIntelCFG(search_path, elfFile, the_symtab,
         structOpts.compute_gpu_cfg, &code_src, &code_obj);
-		} else if (! cuda_file) { // don't run parseapi on cuda binary
+    } else {
       code_src = new SymtabCodeSource(symtab);
       code_obj = new CodeObject(code_src);
       code_obj->parse();
+      intel_gpu_arch = 0;
       cuda_arch = 0;
       cubin_size = 0;
-    } else {
-      cuda_arch = elfFile->getArch();
-      cubin_size = elfFile->getLength();
-      parsable = readCubinCFG(search_path, elfFile, the_symtab, 
-			      structOpts.compute_gpu_cfg, &code_src, &code_obj);
     }
 
     if (opts.show_time) {
@@ -997,6 +996,8 @@ getProcLineMap(StatementVector & svec, Offset vma, Offset end,
   svec.clear();
 
   if (cuda_arch > 0) {
+    // TODO(Keren): Use the same method below and remove magic numbers for instruction length
+    // mod->getSourceLines(svec, next + cubin_size);
     int len = (cuda_arch >= 70) ? 16 : 8;
 
     StatementVector tmp;
@@ -1008,7 +1009,7 @@ getProcLineMap(StatementVector & svec, Offset vma, Offset end,
         if (svec.empty()) {
           svec.push_back(tmp[0]);
         } else if (tmp[0]->getFile() == svec[0]->getFile() &&
-		   tmp[0]->getLine() < svec[0]->getLine()) {
+          tmp[0]->getLine() < svec[0]->getLine()) {
           svec[0] = tmp[0];
         }
       }
@@ -1803,11 +1804,14 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
   LineMapCache lmcache (ginfo->sym_func, env.realPath);
 
   // iterate through the instructions in this block
+#if 0
+// this path is no longer supported
 #ifdef DYNINST_INSTRUCTION_PTR
   map <Offset, Instruction::Ptr> imap;
 #else
-  map <Offset, Instruction> imap;
 #endif
+#endif
+  map <Offset, Instruction> imap;
   block->getInsns(imap);
 
   int len = 0;
@@ -1815,8 +1819,7 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
 
   if (cuda_arch > 0) {
     device = "NVIDIA sm_" + std::to_string(cuda_arch);
-    len = (cuda_arch >= 70) ? 16 : 8;
-  } else if (intel_arch > 0) {
+  } else if (intel_gpu_arch > 0) {
     device = "INTEL GPU";
   }
   
@@ -1826,15 +1829,13 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
     string filenm = "";
     uint line = 0;
 
-    if (intel_arch > 0) {
-      len = inst_size.at(vma);
-    } else if (cuda_arch == 0) {
+#if 0
 #ifdef DYNINST_INSTRUCTION_PTR
       len = iit->second->size();
 #else
-      len = iit->second.size();
 #endif
-    } 
+#endif
+    len = iit->second.size();
 
     lmcache.getLineInfo(vma, filenm, line);
 
diff --git a/src/lib/banal/gpu/DotCFG.hpp b/src/lib/banal/gpu/DotCFG.hpp
index 3fac50f5a9..5b323da1b4 100644
--- a/src/lib/banal/gpu/DotCFG.hpp
+++ b/src/lib/banal/gpu/DotCFG.hpp
@@ -13,6 +13,8 @@
 
 namespace GPUParse {
 
+typedef Dyninst::Architecture Arch;
+
 struct Inst {
   int offset;
   int size;
@@ -26,10 +28,13 @@ struct Inst {
   std::string port;
   std::string target;
   std::vector<std::string> operands;
+  Arch arch;
 
   // Constructor for dummy inst
-  Inst(int offset, int size) : offset(offset), size(size), dual_first(false), dual_second(false),
-    is_call(false), is_jump(false), is_sync(false) {}
+  Inst(int offset, int size, Arch arch) : offset(offset), size(size), dual_first(false), dual_second(false),
+    is_call(false), is_jump(false), is_sync(false), arch(arch) {}
+
+  Inst(int offset, int size) : Inst(offset, size, Dyninst::Arch_none) {}
 
   explicit Inst(int offset) : Inst(offset, 0) {}
 };
@@ -37,9 +42,9 @@ struct Inst {
 
 struct CudaInst : public Inst {
   // Constructor for dummy inst
-  CudaInst(int offset, int size) : Inst(offset, size) {}
+  CudaInst(int offset, int size) : Inst(offset, size, Dyninst::Arch_cuda) {}
 
-  explicit CudaInst(int offset) : Inst(offset) {}
+  explicit CudaInst(int offset) : Inst(offset, 0, Dyninst::Arch_cuda) {}
 
   // Cuda instruction constructor
   CudaInst(std::string &inst_str) : Inst(0, 0) {
diff --git a/src/lib/banal/gpu/GPUBlock.cpp b/src/lib/banal/gpu/GPUBlock.cpp
index 3fe14be511..95f0ad7514 100644
--- a/src/lib/banal/gpu/GPUBlock.cpp
+++ b/src/lib/banal/gpu/GPUBlock.cpp
@@ -6,27 +6,33 @@ namespace Dyninst {
 namespace ParseAPI {
 
 GPUBlock::GPUBlock(CodeObject * o, CodeRegion * r,
-  Address start, std::vector<Offset> &offsets) : Block(o, r, start) {
-  for (auto offset : offsets) {
-    _inst_offsets.push_back(offset);
-  }
-}
+  Address start, std::vector<std::pair<Offset, size_t>> &offsets) :
+  Block(o, r, start), _inst_offsets(offsets) {}
 
 
 Address GPUBlock::last() const {
-  return this->_inst_offsets.back();
+  return this->_inst_offsets.back().first;
 }
 
 
 void GPUBlock::getInsns(Insns &insns) const {
-  for (auto offset : _inst_offsets) {
+  for (auto &inst_offset : _inst_offsets) {
+    entryID entry_id = intel_gpu_op_general;
+    InstructionAPI::Operation op(entry_id, "", Arch_intelGen9);
+
+    auto offset = inst_offset.first;
+    auto size = inst_offset.second;
+
+#if 0 
+// This path is no longer supported
 #ifdef DYNINST_INSTRUCTION_PTR
     insns.insert(std::pair<long unsigned int, 
       InstructionAPI::InstructionPtr>(offset, NULL));
-#else
-    InstructionAPI::Instruction inst;    
-    insns[offset] = inst;
 #endif
+#endif
+
+    InstructionAPI::Instruction inst(op, size, NULL, Arch_intelGen9);
+    insns.emplace(offset, std::move(inst));
   }
 }
 
diff --git a/src/lib/banal/gpu/GPUBlock.hpp b/src/lib/banal/gpu/GPUBlock.hpp
index cbe60956a0..658f89f0fa 100644
--- a/src/lib/banal/gpu/GPUBlock.hpp
+++ b/src/lib/banal/gpu/GPUBlock.hpp
@@ -8,7 +8,7 @@ namespace ParseAPI {
 
 class PARSER_EXPORT GPUBlock : public Block {
  public:
-  GPUBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<Offset> &offsets);
+  GPUBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<std::pair<Offset, size_t>> &offsets);
 
   virtual ~GPUBlock() {}
 
@@ -17,7 +17,8 @@ class PARSER_EXPORT GPUBlock : public Block {
   virtual Address last() const;
 
  private:
-  std::vector<Offset> _inst_offsets;
+  // <offset, size> pair
+  std::vector<std::pair<Offset, size_t>> _inst_offsets;
 };
 
 }
diff --git a/src/lib/banal/gpu/GPUCFGFactory.cpp b/src/lib/banal/gpu/GPUCFGFactory.cpp
index 114973fcd7..070f2e056f 100644
--- a/src/lib/banal/gpu/GPUCFGFactory.cpp
+++ b/src/lib/banal/gpu/GPUCFGFactory.cpp
@@ -28,9 +28,9 @@ Function *GPUCFGFactory::mkfunc(Address addr, FuncSource src,
           if (DEBUG_GPU_CFGFACTORY) {
             std::cout << "New block: " << block->name << " id: " << block->id << std::endl;
           }
-          std::vector<Offset> inst_offsets;
+          std::vector<std::pair<Offset, size_t>> inst_offsets;
           for (auto *inst : block->insts) {
-            inst_offsets.push_back(inst->offset);
+            inst_offsets.emplace_back(std::make_pair(inst->offset, inst->size));
           }
           ret_block = new GPUBlock(obj, region, block->address, inst_offsets);
           _block_filter[block->id] = ret_block;
@@ -55,9 +55,9 @@ Function *GPUCFGFactory::mkfunc(Address addr, FuncSource src,
             if (DEBUG_GPU_CFGFACTORY) {
               std::cout << "New block: " << target->block->name << " id: " << target->block->id << std::endl;
             }
-            std::vector<Offset> inst_offsets;
+            std::vector<std::pair<Offset, size_t>> inst_offsets;
             for (auto *inst : target->block->insts) {
-              inst_offsets.push_back(inst->offset);
+              inst_offsets.push_back(std::make_pair(inst->offset, inst->size));
             }
             ret_target_block = new GPUBlock(obj, region, target->block->address, inst_offsets);
             _block_filter[target->block->id] = ret_target_block;
diff --git a/src/lib/banal/gpu/ReadCudaCFG.cpp b/src/lib/banal/gpu/ReadCudaCFG.cpp
index 7added0b9b..647d8e8b03 100644
--- a/src/lib/banal/gpu/ReadCudaCFG.cpp
+++ b/src/lib/banal/gpu/ReadCudaCFG.cpp
@@ -144,6 +144,7 @@ parseDotCFG
         for (auto *block : function->blocks) {
           for (auto *inst : block->insts) {
             inst->offset = (inst->offset - begin_offset) + symbol->getOffset();
+            inst->size = cuda_arch >= 70 ? 16 : 8;
           }
           block->address = block->insts[0]->offset;
         }
@@ -188,7 +189,7 @@ parseDotCFG
     int len = cuda_arch >= 70 ? 16 : 8;
     // Add dummy insts
     for (size_t i = block->address; i < block->address + symbol->getSize(); i += len) {
-      block->insts.push_back(new GPUParse::CudaInst(i));
+      block->insts.push_back(new GPUParse::CudaInst(i, len));
     }
     function->blocks.push_back(block);
     functions.push_back(function);
@@ -212,7 +213,7 @@ parseDotCFG
           block->begin_offset = cuda_arch >= 70 ? 16 : 8;
           max_block_id++;
           while (function_size < symbol_size) {
-            block->insts.push_back(new GPUParse::Inst(function_size + function->address));
+            block->insts.push_back(new GPUParse::Inst(function_size + function->address, len));
             function_size += len;
           } 
           if (function->blocks.size() > 0) {
@@ -279,7 +280,7 @@ getFilename
 
 
 bool
-readCubinCFG
+readCudaCFG
 (
  const std::string &search_path,
  ElfFile *elfFile,
diff --git a/src/lib/banal/gpu/ReadCudaCFG.hpp b/src/lib/banal/gpu/ReadCudaCFG.hpp
index e259e01dc5..fcf1562235 100644
--- a/src/lib/banal/gpu/ReadCudaCFG.hpp
+++ b/src/lib/banal/gpu/ReadCudaCFG.hpp
@@ -10,7 +10,7 @@
 #include <CodeObject.h>
 
 bool
-readCubinCFG
+readCudaCFG
 (
  const std::string &search_path,
  ElfFile *elfFile,
diff --git a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
index 1400871ae8..59ff9e21dd 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -237,7 +237,6 @@ readIntelCFG
  const std::string &search_path,
  ElfFile *elfFile,
  Dyninst::SymtabAPI::Symtab *the_symtab, 
- std::map<int, int> &inst_size,
  bool cfg_wanted,
  Dyninst::ParseAPI::CodeSource **code_src, 
  Dyninst::ParseAPI::CodeObject **code_obj
@@ -261,11 +260,5 @@ readIntelCFG
   *code_obj = new CodeObject(*code_src, cfg_fact);
   (*code_obj)->parse();
 
-  for (auto *block : function.blocks) {
-    for (auto *inst : block->insts) {
-      inst_size[inst->offset] = inst->size;
-    }
-  }
-
   return true;
 }
diff --git a/src/lib/banal/gpu/ReadIntelCFG.hpp b/src/lib/banal/gpu/ReadIntelCFG.hpp
index 0f2c78e28c..6c80c01cb7 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.hpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.hpp
@@ -86,7 +86,6 @@ readIntelCFG
  const std::string &search_path,
  ElfFile *elfFile,
  Dyninst::SymtabAPI::Symtab *the_symtab, 
- std::map<int, int> &inst_size,
  bool cfg_wanted,
  Dyninst::ParseAPI::CodeSource **code_src, 
  Dyninst::ParseAPI::CodeObject **code_obj

From 7e6fcca39c8a5e8c4f7b6bc1ba5aa92a4ec0779f Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 23 Sep 2020 16:32:30 -0500
Subject: [PATCH 037/177] opencl_multiplexer: compilable version

---
 src/tool/hpcrun/Makefile.am                   |   5 +
 src/tool/hpcrun/Makefile.in                   | 163 +++++++++++++-
 .../hpcrun/gpu/gpu-activity-multiplexer.c     | 207 ++++++++++++++++++
 .../hpcrun/gpu/gpu-activity-multiplexer.h     | 105 +++++++++
 src/tool/hpcrun/gpu/gpu-activity.h            |   2 +
 .../hpcrun/gpu/gpu-operation-channel-set.c    | 199 +++++++++++++++++
 .../hpcrun/gpu/gpu-operation-channel-set.h    |  99 +++++++++
 src/tool/hpcrun/gpu/gpu-operation-channel.c   | 182 +++++++++++++++
 src/tool/hpcrun/gpu/gpu-operation-channel.h   |  92 ++++++++
 .../hpcrun/gpu/gpu-operation-item-process.c   | 204 +++++++++++++++++
 .../hpcrun/gpu/gpu-operation-item-process.h   |  69 ++++++
 src/tool/hpcrun/gpu/gpu-operation-item.c      | 120 ++++++++++
 src/tool/hpcrun/gpu/gpu-operation-item.h      | 131 +++++++++++
 src/tool/hpcrun/gpu/gpu-operation.c           | 120 ++++++++++
 src/tool/hpcrun/gpu/gpu-operation.h           | 131 +++++++++++
 .../gpu/opencl/opencl-activity-translate.c    |  12 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  46 ++--
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c |  17 +-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |   3 +
 .../hpcrun/gpu/opencl/opencl-memory-manager.h |   2 +
 20 files changed, 1872 insertions(+), 37 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-channel-set.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-channel-set.h
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-channel.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-channel.h
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-item-process.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-item-process.h
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-item.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation-item.h
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-operation.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 4f5925293d..4aba76313d 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -357,6 +357,7 @@ MY_BASE_FILES =				\
 	hpcrun-placeholders.c 		\
 	gpu/gpu-activity.c 		\
 	gpu/gpu-activity-channel.c 	\
+	gpu/gpu-activity-multiplexer.c 	\
 	gpu/gpu-activity-process.c 	\
 	gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c \
@@ -373,6 +374,10 @@ MY_BASE_FILES =				\
 	gpu/gpu-monitoring.c 		\
 	gpu/gpu-monitoring-thread-api.c \
 	gpu/gpu-op-placeholders.c 	\
+	gpu/gpu-operation-item.c			\
+	gpu/gpu-operation-item-process.c   \
+	gpu/gpu-operation-channel.c \
+	gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c	\
 	gpu/gpu-stream-id-map.c		\
 	gpu/gpu-trace.c			\
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 942ff12e92..fb18716199 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -481,8 +481,8 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	memory/mem.c memory/mmap.c messages/debug-flag.c \
 	messages/messages-sync.c messages/messages-async.c \
 	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
-	gpu/gpu-application-thread-api.c \
+	gpu/gpu-activity-channel.c gpu/gpu-activity-multiplexer.c \
+	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -490,6 +490,8 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-function-id-map.c gpu/gpu-host-correlation-map.c \
 	gpu/gpu-metrics.c gpu/gpu-monitoring.c \
 	gpu/gpu-monitoring-thread-api.c gpu/gpu-op-placeholders.c \
+	gpu/gpu-operation-item.c gpu/gpu-operation-item-process.c \
+	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
 	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c ompt/ompt-callstack.c \
@@ -644,6 +646,7 @@ am__objects_15 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-hpcrun-placeholders.lo \
 	gpu/libhpcrun_la-gpu-activity.lo \
 	gpu/libhpcrun_la-gpu-activity-channel.lo \
+	gpu/libhpcrun_la-gpu-activity-multiplexer.lo \
 	gpu/libhpcrun_la-gpu-activity-process.lo \
 	gpu/libhpcrun_la-gpu-application-thread-api.lo \
 	gpu/libhpcrun_la-gpu-channel-item-allocator.lo \
@@ -660,6 +663,10 @@ am__objects_15 = utilities/libhpcrun_la-first_func.lo \
 	gpu/libhpcrun_la-gpu-monitoring.lo \
 	gpu/libhpcrun_la-gpu-monitoring-thread-api.lo \
 	gpu/libhpcrun_la-gpu-op-placeholders.lo \
+	gpu/libhpcrun_la-gpu-operation-item.lo \
+	gpu/libhpcrun_la-gpu-operation-item-process.lo \
+	gpu/libhpcrun_la-gpu-operation-channel.lo \
+	gpu/libhpcrun_la-gpu-operation-channel-set.lo \
 	gpu/libhpcrun_la-gpu-splay-allocator.lo \
 	gpu/libhpcrun_la-gpu-stream-id-map.lo \
 	gpu/libhpcrun_la-gpu-trace.lo \
@@ -894,8 +901,8 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	memory/mem.c memory/mmap.c messages/debug-flag.c \
 	messages/messages-sync.c messages/messages-async.c \
 	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
-	gpu/gpu-application-thread-api.c \
+	gpu/gpu-activity-channel.c gpu/gpu-activity-multiplexer.c \
+	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -903,6 +910,8 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-function-id-map.c gpu/gpu-host-correlation-map.c \
 	gpu/gpu-metrics.c gpu/gpu-monitoring.c \
 	gpu/gpu-monitoring-thread-api.c gpu/gpu-op-placeholders.c \
+	gpu/gpu-operation-item.c gpu/gpu-operation-item-process.c \
+	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
 	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c ompt/ompt-callstack.c \
@@ -1059,6 +1068,7 @@ am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-hpcrun-placeholders.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-activity.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-activity-channel.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-activity-multiplexer.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-activity-process.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-application-thread-api.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-channel-item-allocator.$(OBJEXT) \
@@ -1075,6 +1085,10 @@ am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-monitoring.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-monitoring-thread-api.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-op-placeholders.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-operation-item.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-operation-item-process.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-operation-channel.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-operation-channel-set.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-splay-allocator.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-stream-id-map.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace.$(OBJEXT) \
@@ -1794,8 +1808,8 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	memory/mem.c memory/mmap.c messages/debug-flag.c \
 	messages/messages-sync.c messages/messages-async.c \
 	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
-	gpu/gpu-application-thread-api.c \
+	gpu/gpu-activity-channel.c gpu/gpu-activity-multiplexer.c \
+	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -1803,6 +1817,8 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	gpu/gpu-function-id-map.c gpu/gpu-host-correlation-map.c \
 	gpu/gpu-metrics.c gpu/gpu-monitoring.c \
 	gpu/gpu-monitoring-thread-api.c gpu/gpu-op-placeholders.c \
+	gpu/gpu-operation-item.c gpu/gpu-operation-item-process.c \
+	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
 	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c ompt/ompt-callstack.c \
@@ -2472,6 +2488,8 @@ gpu/libhpcrun_la-gpu-activity.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-activity-channel.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-activity-multiplexer.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-activity-process.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-application-thread-api.lo: gpu/$(am__dirstamp) \
@@ -2504,6 +2522,14 @@ gpu/libhpcrun_la-gpu-monitoring-thread-api.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-op-placeholders.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-operation-item.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-operation-item-process.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-operation-channel.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-operation-channel-set.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-splay-allocator.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-stream-id-map.lo: gpu/$(am__dirstamp) \
@@ -3124,6 +3150,8 @@ gpu/libhpcrun_o-gpu-activity.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-activity-channel.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-activity-multiplexer.$(OBJEXT):  \
+	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-activity-process.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-application-thread-api.$(OBJEXT):  \
@@ -3156,6 +3184,14 @@ gpu/libhpcrun_o-gpu-monitoring-thread-api.$(OBJEXT):  \
 	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-op-placeholders.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-operation-item.$(OBJEXT): gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-operation-item-process.$(OBJEXT):  \
+	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-operation-channel.$(OBJEXT): gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-operation-channel-set.$(OBJEXT):  \
+	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-splay-allocator.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-stream-id-map.$(OBJEXT): gpu/$(am__dirstamp) \
@@ -3671,6 +3707,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_common.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-channel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-application-thread-api.Plo@am__quote@
@@ -3688,6 +3725,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-monitoring-thread-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-monitoring.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-op-placeholders.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel-set.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item-process.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-stream-id-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-channel-set.Plo@am__quote@
@@ -3695,6 +3736,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-item.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-channel.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-application-thread-api.Po@am__quote@
@@ -3712,6 +3754,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-monitoring-thread-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-monitoring.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-op-placeholders.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-stream-id-map.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-channel-set.Po@am__quote@
@@ -4712,6 +4758,13 @@ gpu/libhpcrun_la-gpu-activity-channel.lo: gpu/gpu-activity-channel.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-activity-channel.lo `test -f 'gpu/gpu-activity-channel.c' || echo '$(srcdir)/'`gpu/gpu-activity-channel.c
 
+gpu/libhpcrun_la-gpu-activity-multiplexer.lo: gpu/gpu-activity-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-activity-multiplexer.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Tpo -c -o gpu/libhpcrun_la-gpu-activity-multiplexer.lo `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-activity-multiplexer.c' object='gpu/libhpcrun_la-gpu-activity-multiplexer.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-activity-multiplexer.lo `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
+
 gpu/libhpcrun_la-gpu-activity-process.lo: gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-activity-process.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Tpo -c -o gpu/libhpcrun_la-gpu-activity-process.lo `test -f 'gpu/gpu-activity-process.c' || echo '$(srcdir)/'`gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Plo
@@ -4824,6 +4877,34 @@ gpu/libhpcrun_la-gpu-op-placeholders.lo: gpu/gpu-op-placeholders.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-op-placeholders.lo `test -f 'gpu/gpu-op-placeholders.c' || echo '$(srcdir)/'`gpu/gpu-op-placeholders.c
 
+gpu/libhpcrun_la-gpu-operation-item.lo: gpu/gpu-operation-item.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-operation-item.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item.Tpo -c -o gpu/libhpcrun_la-gpu-operation-item.lo `test -f 'gpu/gpu-operation-item.c' || echo '$(srcdir)/'`gpu/gpu-operation-item.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-item.c' object='gpu/libhpcrun_la-gpu-operation-item.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-operation-item.lo `test -f 'gpu/gpu-operation-item.c' || echo '$(srcdir)/'`gpu/gpu-operation-item.c
+
+gpu/libhpcrun_la-gpu-operation-item-process.lo: gpu/gpu-operation-item-process.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-operation-item-process.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item-process.Tpo -c -o gpu/libhpcrun_la-gpu-operation-item-process.lo `test -f 'gpu/gpu-operation-item-process.c' || echo '$(srcdir)/'`gpu/gpu-operation-item-process.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item-process.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item-process.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-item-process.c' object='gpu/libhpcrun_la-gpu-operation-item-process.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-operation-item-process.lo `test -f 'gpu/gpu-operation-item-process.c' || echo '$(srcdir)/'`gpu/gpu-operation-item-process.c
+
+gpu/libhpcrun_la-gpu-operation-channel.lo: gpu/gpu-operation-channel.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-operation-channel.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel.Tpo -c -o gpu/libhpcrun_la-gpu-operation-channel.lo `test -f 'gpu/gpu-operation-channel.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-channel.c' object='gpu/libhpcrun_la-gpu-operation-channel.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-operation-channel.lo `test -f 'gpu/gpu-operation-channel.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel.c
+
+gpu/libhpcrun_la-gpu-operation-channel-set.lo: gpu/gpu-operation-channel-set.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-operation-channel-set.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel-set.Tpo -c -o gpu/libhpcrun_la-gpu-operation-channel-set.lo `test -f 'gpu/gpu-operation-channel-set.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel-set.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel-set.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel-set.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-channel-set.c' object='gpu/libhpcrun_la-gpu-operation-channel-set.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-operation-channel-set.lo `test -f 'gpu/gpu-operation-channel-set.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel-set.c
+
 gpu/libhpcrun_la-gpu-splay-allocator.lo: gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-splay-allocator.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Tpo -c -o gpu/libhpcrun_la-gpu-splay-allocator.lo `test -f 'gpu/gpu-splay-allocator.c' || echo '$(srcdir)/'`gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Plo
@@ -6700,6 +6781,20 @@ gpu/libhpcrun_o-gpu-activity-channel.obj: gpu/gpu-activity-channel.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-activity-channel.obj `if test -f 'gpu/gpu-activity-channel.c'; then $(CYGPATH_W) 'gpu/gpu-activity-channel.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-activity-channel.c'; fi`
 
+gpu/libhpcrun_o-gpu-activity-multiplexer.o: gpu/gpu-activity-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-activity-multiplexer.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.o `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-activity-multiplexer.c' object='gpu/libhpcrun_o-gpu-activity-multiplexer.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.o `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
+
+gpu/libhpcrun_o-gpu-activity-multiplexer.obj: gpu/gpu-activity-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-activity-multiplexer.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.obj `if test -f 'gpu/gpu-activity-multiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-activity-multiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-activity-multiplexer.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-activity-multiplexer.c' object='gpu/libhpcrun_o-gpu-activity-multiplexer.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.obj `if test -f 'gpu/gpu-activity-multiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-activity-multiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-activity-multiplexer.c'; fi`
+
 gpu/libhpcrun_o-gpu-activity-process.o: gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-activity-process.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Tpo -c -o gpu/libhpcrun_o-gpu-activity-process.o `test -f 'gpu/gpu-activity-process.c' || echo '$(srcdir)/'`gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Po
@@ -6924,6 +7019,62 @@ gpu/libhpcrun_o-gpu-op-placeholders.obj: gpu/gpu-op-placeholders.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-op-placeholders.obj `if test -f 'gpu/gpu-op-placeholders.c'; then $(CYGPATH_W) 'gpu/gpu-op-placeholders.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-op-placeholders.c'; fi`
 
+gpu/libhpcrun_o-gpu-operation-item.o: gpu/gpu-operation-item.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-item.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Tpo -c -o gpu/libhpcrun_o-gpu-operation-item.o `test -f 'gpu/gpu-operation-item.c' || echo '$(srcdir)/'`gpu/gpu-operation-item.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-item.c' object='gpu/libhpcrun_o-gpu-operation-item.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-item.o `test -f 'gpu/gpu-operation-item.c' || echo '$(srcdir)/'`gpu/gpu-operation-item.c
+
+gpu/libhpcrun_o-gpu-operation-item.obj: gpu/gpu-operation-item.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-item.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Tpo -c -o gpu/libhpcrun_o-gpu-operation-item.obj `if test -f 'gpu/gpu-operation-item.c'; then $(CYGPATH_W) 'gpu/gpu-operation-item.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-item.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-item.c' object='gpu/libhpcrun_o-gpu-operation-item.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-item.obj `if test -f 'gpu/gpu-operation-item.c'; then $(CYGPATH_W) 'gpu/gpu-operation-item.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-item.c'; fi`
+
+gpu/libhpcrun_o-gpu-operation-item-process.o: gpu/gpu-operation-item-process.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-item-process.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Tpo -c -o gpu/libhpcrun_o-gpu-operation-item-process.o `test -f 'gpu/gpu-operation-item-process.c' || echo '$(srcdir)/'`gpu/gpu-operation-item-process.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-item-process.c' object='gpu/libhpcrun_o-gpu-operation-item-process.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-item-process.o `test -f 'gpu/gpu-operation-item-process.c' || echo '$(srcdir)/'`gpu/gpu-operation-item-process.c
+
+gpu/libhpcrun_o-gpu-operation-item-process.obj: gpu/gpu-operation-item-process.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-item-process.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Tpo -c -o gpu/libhpcrun_o-gpu-operation-item-process.obj `if test -f 'gpu/gpu-operation-item-process.c'; then $(CYGPATH_W) 'gpu/gpu-operation-item-process.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-item-process.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-item-process.c' object='gpu/libhpcrun_o-gpu-operation-item-process.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-item-process.obj `if test -f 'gpu/gpu-operation-item-process.c'; then $(CYGPATH_W) 'gpu/gpu-operation-item-process.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-item-process.c'; fi`
+
+gpu/libhpcrun_o-gpu-operation-channel.o: gpu/gpu-operation-channel.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-channel.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Tpo -c -o gpu/libhpcrun_o-gpu-operation-channel.o `test -f 'gpu/gpu-operation-channel.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-channel.c' object='gpu/libhpcrun_o-gpu-operation-channel.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-channel.o `test -f 'gpu/gpu-operation-channel.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel.c
+
+gpu/libhpcrun_o-gpu-operation-channel.obj: gpu/gpu-operation-channel.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-channel.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Tpo -c -o gpu/libhpcrun_o-gpu-operation-channel.obj `if test -f 'gpu/gpu-operation-channel.c'; then $(CYGPATH_W) 'gpu/gpu-operation-channel.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-channel.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-channel.c' object='gpu/libhpcrun_o-gpu-operation-channel.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-channel.obj `if test -f 'gpu/gpu-operation-channel.c'; then $(CYGPATH_W) 'gpu/gpu-operation-channel.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-channel.c'; fi`
+
+gpu/libhpcrun_o-gpu-operation-channel-set.o: gpu/gpu-operation-channel-set.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-channel-set.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Tpo -c -o gpu/libhpcrun_o-gpu-operation-channel-set.o `test -f 'gpu/gpu-operation-channel-set.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel-set.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-channel-set.c' object='gpu/libhpcrun_o-gpu-operation-channel-set.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-channel-set.o `test -f 'gpu/gpu-operation-channel-set.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel-set.c
+
+gpu/libhpcrun_o-gpu-operation-channel-set.obj: gpu/gpu-operation-channel-set.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-channel-set.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Tpo -c -o gpu/libhpcrun_o-gpu-operation-channel-set.obj `if test -f 'gpu/gpu-operation-channel-set.c'; then $(CYGPATH_W) 'gpu/gpu-operation-channel-set.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-channel-set.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel-set.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-channel-set.c' object='gpu/libhpcrun_o-gpu-operation-channel-set.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-channel-set.obj `if test -f 'gpu/gpu-operation-channel-set.c'; then $(CYGPATH_W) 'gpu/gpu-operation-channel-set.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-channel-set.c'; fi`
+
 gpu/libhpcrun_o-gpu-splay-allocator.o: gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-splay-allocator.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Tpo -c -o gpu/libhpcrun_o-gpu-splay-allocator.o `test -f 'gpu/gpu-splay-allocator.c' || echo '$(srcdir)/'`gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Po
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
new file mode 100644
index 0000000000..8d28a965ca
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -0,0 +1,207 @@
+
+// * BeginRiceCopyright *****************************************************
+// -*-Mode: C++;-*- // technically C99
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#include <pthread.h>
+
+#include <lib/prof-lean/stdatomic.h>
+
+
+#include "gpu-activity.h"
+#include "gpu-activity-channel.h"
+#include "gpu-operation-channel-set.h"
+#include "gpu-activity-multiplexer.h"
+#include "gpu-monitoring-thread-api.h"
+#include "gpu-activity-process.h"
+#include "gpu-print.h"
+
+//TODO: Figure out how to get max number of application threads
+#define max_threads_consumers 1000
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef void *(*pthread_start_routine_t)(void *);
+
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+static _Atomic(bool) stop_activity_flag;
+
+static atomic_uint operation_stream_counter;
+static atomic_uint operation_set_id;
+static __thread uint32_t my_operation_set_id = -1;
+static __thread gpu_operation_channel_t *gpu_operation_channel = NULL;
+static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+
+// Monitor-thread body: until stop_activity_flag is set, keep consuming every
+// operation-channel set with an index below operation_set_id. The unused
+// argument gives it the start-routine type pthread_create() calls it through.
+static void *
+gpu_activity_record
+(
+ void *arg
+)
+{
+  (void) arg;
+
+  while (!atomic_load(&stop_activity_flag)) {
+    // operation_set_id is re-read on every iteration so that sets registered
+    // during this pass are still visited; the index is unsigned to match it
+    for (unsigned int set_index = 0;
+         set_index < atomic_load(&operation_set_id); ++set_index) {
+      gpu_operation_channel_set_consume((int) set_index);
+    }
+    // NOTE(review): the loop polls without blocking; a timed wait may be wanted
+  }
+
+  // NOTE(review): no per-set release runs at shutdown; confirm that is intended
+  return NULL;
+}
+
+// Subtract one from operation_stream_counter; `channel` itself is not used.
+void
+gpu_operation_release
+(
+ gpu_operation_channel_t *channel
+)
+{
+  atomic_fetch_sub(&operation_stream_counter, 1);
+}
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Run once via pthread_once: reset shared state and start the monitor thread.
+static void
+gpu_activity_multiplexer_init
+(
+ void
+)
+{
+  pthread_t thread;
+  atomic_store(&stop_activity_flag, false);
+  atomic_store(&operation_set_id, 0);
+  gpu_operation_channel_stack_alloc(max_threads_consumers);
+  // nothing ever joins the monitor thread, so detach it once it has started
+  if (pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_activity_record, NULL) == 0) {
+    pthread_detach(thread);
+  }
+}
+
+// Request shutdown of the monitor thread by setting stop_activity_flag.
+void
+gpu_activity_multiplexer_fini
+(
+void
+)
+{
+  PRINT("gpu_activity_multiplexer_fini called\n");
+
+  atomic_store(&stop_activity_flag, true);
+
+  // NOTE(review): this only raises the flag; it does not wait for the monitor
+  // thread to exit or for operation_stream_counter to reach zero (a wait loop
+  // on that counter used to be sketched here) -- confirm this is intended
+}
+// Queue an activity for the monitor thread; dropped if no channel-set slot is left.
+void
+gpu_activity_multiplexer_push
+(
+ gpu_activity_channel_t *gpu_channel,
+ gpu_activity_t *gpu_activity
+)
+{
+  pthread_once(&is_initialized, gpu_activity_multiplexer_init);
+
+  if (my_operation_set_id == (uint32_t) -1) {
+    // never let operation_set_id pass the max_threads_consumers slots allocated
+    unsigned int id = atomic_load(&operation_set_id);
+    do {
+      if (id >= max_threads_consumers) { return; }
+    } while (!atomic_compare_exchange_strong(&operation_set_id, &id, id + 1));
+    my_operation_set_id = id;
+    gpu_operation_channel = gpu_operation_channel_get();
+    gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_set_id);
+  }
+  gpu_operation_item_t item = (gpu_operation_item_t){.channel=gpu_channel, .activity=gpu_activity};
+  gpu_operation_channel_produce(gpu_operation_channel, &item);
+}
+// Subtract one from operation_stream_counter.
+void
+gpu_activity_multiplexer_release
+(
+ void
+)
+{
+  atomic_fetch_sub(&operation_stream_counter, 1);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
new file mode 100644
index 0000000000..97c1a0ccd4
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
@@ -0,0 +1,105 @@
+
+// * BeginRiceCopyright *****************************************************
+// -*-Mode: C++;-*- // technically C99
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+#ifndef gpu_channel_multiplexer_h
+#define gpu_channel_multiplexer_h
+
+#include <hpcrun/thread_data.h>
+#include "gpu-operation-channel.h"
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+typedef struct gpu_activity_channel_t gpu_activity_channel_t;
+
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+// Ask the multiplexer's monitor thread to stop (sets its stop flag).
+void
+gpu_activity_multiplexer_fini(void);
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Queue gpu_activity, paired with gpu_channel, for the monitor thread to consume.
+void
+gpu_activity_multiplexer_push
+(
+gpu_activity_channel_t *gpu_channel,
+gpu_activity_t *gpu_activity
+);
+
+// Decrement the multiplexer's operation-stream counter by one.
+void
+gpu_activity_multiplexer_release(void);
+
+// Decrement the same counter; the current implementation does not use `channel`.
+void
+gpu_operation_release(gpu_operation_channel_t *channel);
+
+
+
+
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 0b672c5ae8..c1a1d005fb 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -207,6 +207,7 @@ typedef struct gpu_mem_t {
 
 // gpu_mem_t is a prefix 
 typedef struct gpu_memcpy_t {
+  uint64_t submit_time;
   uint64_t start;
   uint64_t end;
   uint64_t bytes;
@@ -241,6 +242,7 @@ typedef struct gpu_memset_t {
 
 // gpu kernel execution
 typedef struct gpu_kernel_t {
+  uint64_t submit_time;
   uint64_t start;
   uint64_t end;
   uint32_t correlation_id;
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
new file mode 100644
index 0000000000..82fbbe21ed
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -0,0 +1,199 @@
+
+// * BeginRiceCopyright *****************************************************
+// -*-Mode: C++;-*- // technically C99
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/prof-lean/stacks.h>
+
+#include <hpcrun/memory/hpcrun-malloc.h>
+#include <hpcrun/thread_data.h>
+
+
+#include "gpu-activity-process.h"
+#include "gpu-activity-multiplexer.h"
+#include "gpu-operation-item.h"
+#include "gpu-operation-channel.h"
+#include "gpu-operation-channel-set.h"
+#include "gpu-print.h"
+
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define channel_stack_push  \
+  typed_stack_push(gpu_operation_channel_ptr_t, cstack)
+
+#define channel_stack_forall \
+  typed_stack_forall(gpu_operation_channel_ptr_t, cstack)
+
+#define channel_stack_elem_t \
+  typed_stack_elem(gpu_operation_channel_ptr_t)
+
+#define channel_stack_elem_ptr_set \
+  typed_stack_elem_ptr_set(gpu_operation_channel_ptr_t, cstack)
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+//----------------------------------------------------------
+// support for a stack of operation channels
+//----------------------------------------------------------
+
+typedef gpu_operation_channel_t* gpu_operation_channel_ptr_t;
+
+
+typedef struct {
+  s_element_ptr_t next;
+  gpu_operation_channel_ptr_t channel;
+} typed_stack_elem(gpu_operation_channel_ptr_t);
+
+
+typed_stack_declare_type(gpu_operation_channel_ptr_t);
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static
+typed_stack_elem_ptr(gpu_operation_channel_ptr_t) *gpu_operation_channel_stack;
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+// implement stack of operation channels
+typed_stack_impl(gpu_operation_channel_ptr_t, cstack);
+
+// Stack-traversal callback: `arg` carries a per-channel function, which is
+// invoked on the channel held by stack element `se`.
+static void
+channel_forone
+(
+ channel_stack_elem_t *se,
+ void *arg
+)
+{
+  // use the channel-function type from gpu-operation-channel-set.h, whose
+  // parameter is the gpu_operation_channel_t * that is actually passed
+  gpu_operation_channel_fn_t channel_fn = (gpu_operation_channel_fn_t) arg;
+  channel_fn(se->channel);
+}
+
+// Apply channel_fn to each channel on the stack for set `set_index`.
+static void
+gpu_operation_channel_set_forall
+(
+ gpu_operation_channel_fn_t channel_fn,
+ int set_index
+)
+{
+  channel_stack_forall(&gpu_operation_channel_stack[set_index], channel_forone,
+                       channel_fn);
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Allocate the table of `size` channel-stack heads, later indexed by set_index.
+void gpu_operation_channel_stack_alloc(int size) {
+  gpu_operation_channel_stack = hpcrun_malloc_safe(size * sizeof *gpu_operation_channel_stack);
+}
+
+// Add `channel` to the stack of channels for set `set_index`.
+void
+gpu_operation_channel_set_insert
+(
+ gpu_operation_channel_t *channel,
+ int set_index
+)
+{
+  // new stack node that will hold the channel
+  channel_stack_elem_t *node = hpcrun_malloc_safe(sizeof *node);
+  // NOTE(review): the allocation result is used without a NULL check, and
+  // set_index is not range-checked against the allocated table size
+
+  // record the channel, then reset the node's link before it is pushed
+  node->channel = channel;
+  channel_stack_elem_ptr_set(node, 0);
+
+  // push the node onto this set's stack
+  channel_stack_push(&gpu_operation_channel_stack[set_index], node);
+}
+
+// Run gpu_operation_channel_consume on every channel in set `set_index`.
+void
+gpu_operation_channel_set_consume
+(
+ int set_index
+)
+{
+  gpu_operation_channel_set_forall(gpu_operation_channel_consume, set_index);
+}
+
+
+// Apply gpu_operation_release to every channel in set `set_index`. This is the
+// name declared in gpu-operation-channel-set.h.
+void gpu_operation_channel_set_release(int set_index) {
+  gpu_operation_channel_set_forall(gpu_operation_release, set_index);
+}
+
+// Original name for the same operation, kept so existing callers keep working.
+void gpu_operation_stream_release(int set_index) {
+  gpu_operation_channel_set_release(set_index);
+}
+
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
new file mode 100644
index 0000000000..054718c299
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
@@ -0,0 +1,99 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+#ifndef gpu_operation_channel_set_h
+#define gpu_operation_channel_set_h
+
+#include "gpu-operation-item.h"
+
+//******************************************************************************
+// forward type declarations
+//******************************************************************************
+
+typedef struct gpu_operation_channel_t gpu_operation_channel_t;
+
+//typedef struct gpu_operation_fn_t gpu_operation_fn_t;
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef void (*gpu_operation_channel_fn_t)
+(
+ gpu_operation_channel_t *channel
+);
+
+//typedef void (*gpu_operation_fn_t)
+//(
+// gpu_operation_t *a
+//);
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Allocate the table that holds `size` channel sets.
+void
+gpu_operation_channel_stack_alloc(int size);
+
+// Add `channel` to the set at index `set_index`.
+void
+gpu_operation_channel_set_insert
+(
+ gpu_operation_channel_t *channel,
+ int set_index
+);
+
+// Consume pending operation items from every channel in set `set_index`.
+void
+gpu_operation_channel_set_consume
+(
+ int set_index
+);
+// Presumably releases each channel in set `set_index` -- TODO confirm.
+void gpu_operation_channel_set_release(int set_index);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
new file mode 100644
index 0000000000..3513428424
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -0,0 +1,182 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/memory/hpcrun-malloc.h>
+
+#include "gpu-channel-item-allocator.h"
+#include "gpu-operation-channel.h"
+#include "gpu-operation-item.h"
+#include "gpu-operation-item-process.h"
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#undef typed_bichannel
+#undef typed_stack_elem
+
+#define typed_bichannel(x) gpu_operation_channel_t
+#define typed_stack_elem(x) gpu_operation_item_t
+
+// define macros that simplify use of operation channel API
+#define channel_init  \
+  typed_bichannel_init(gpu_operation_item_t)
+
+#define channel_pop   \
+  typed_bichannel_pop(gpu_operation_item_t)
+
+#define channel_push  \
+  typed_bichannel_push(gpu_operation_item_t)
+
+#define channel_reverse \
+  typed_bichannel_reverse(gpu_operation_item_t)
+
+#define channel_steal \
+  typed_bichannel_steal(gpu_operation_item_t)
+
+#define gpu_operation_item_alloc(channel)		\
+  channel_item_alloc(channel, gpu_operation_item_t)
+
+#define gpu_operation_item_free(channel, item)	\
+  channel_item_free(channel, item)
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+// An operation channel: two bistacks (presumably one per direction -- TODO confirm).
+typedef struct gpu_operation_channel_t {
+  bistack_t bistacks[2];
+} gpu_operation_channel_t;
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static __thread gpu_operation_channel_t *gpu_operation_channel = NULL;
+
+
+//******************************************************************************
+// private functions
+//******************************************************************************
+
+// implement bidirectional channels for activities
+typed_bichannel_impl(gpu_operation_item_t)
+
+// Allocate a new operation channel and run channel_init on it.
+static gpu_operation_channel_t *
+gpu_operation_channel_alloc
+(
+ void
+)
+{
+  gpu_operation_channel_t *channel = hpcrun_malloc_safe(sizeof *channel);
+
+  // NOTE(review): the allocation result is passed on without a NULL check
+  channel_init(channel);
+  return channel;
+}
+
+
+
+//******************************************************************************
+// interface operations 
+//******************************************************************************
+
+// Return the calling thread's operation channel, creating it on first use.
+gpu_operation_channel_t *
+gpu_operation_channel_get
+(
+ void
+)
+{
+  if (gpu_operation_channel != NULL) {
+    return gpu_operation_channel;
+  }
+  gpu_operation_channel = gpu_operation_channel_alloc();
+  return gpu_operation_channel;
+}
+
+// Copy *it into an item allocated for `channel` and push the copy (forward).
+void
+gpu_operation_channel_produce
+(
+ gpu_operation_channel_t *channel,
+ gpu_operation_item_t *it
+)
+{
+  gpu_operation_item_t *copy = gpu_operation_item_alloc(channel);
+
+  *copy = *it;
+  channel_push(channel, bichannel_direction_forward, copy);
+}
+
+// Drain `channel`: take what the producer has pushed so far, put it in FIFO
+// order, then process and free each item.
+void
+gpu_operation_channel_consume
+(
+ gpu_operation_channel_t *channel
+)
+{
+  gpu_operation_item_t *item;
+
+  // grab everything the producer has pushed up to now
+  channel_steal(channel, bichannel_direction_forward);
+
+  // reverse the stolen items so that they are handled in FIFO order
+  channel_reverse(channel, bichannel_direction_forward);
+
+  // handle the stolen items one by one until none remain
+  while ((item = channel_pop(channel, bichannel_direction_forward)) != NULL) {
+    gpu_operation_item_consume(gpu_operation_item_process, item);
+    gpu_operation_item_free(channel, item);
+  }
+}
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.h b/src/tool/hpcrun/gpu/gpu-operation-channel.h
new file mode 100644
index 0000000000..f31ed020f8
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.h
@@ -0,0 +1,92 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gpu_operation_channel_h
+#define gpu_operation_channel_h
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/prof-lean/bichannel.h>
+
+//#include "gpu-operation-item.h"
+
+
+//******************************************************************************
+// forward type declarations
+//******************************************************************************
+
+typedef struct gpu_operation_item_t gpu_operation_item_t;
+
+typedef struct gpu_operation_channel_t gpu_operation_channel_t;
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Return the calling thread's operation channel, allocating it on first use.
+gpu_operation_channel_t *
+gpu_operation_channel_get
+(
+ void
+);
+
+// Enqueue a copy of `*it` on `channel`.
+void
+gpu_operation_channel_produce
+(
+ gpu_operation_channel_t *channel,
+ gpu_operation_item_t *it
+);
+
+// Process and free the items currently pending on `channel`.
+void
+gpu_operation_channel_consume
+(
+ gpu_operation_channel_t *channel
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
new file mode 100644
index 0000000000..3290eba6fc
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -0,0 +1,204 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <assert.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/cct/cct.h>
+#include <hpcrun/gpu/gpu-activity.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-trace-item.h>
+#include <hpcrun/gpu/gpu-context-id-map.h>
+
+#include "gpu-operation-item-process.h"
+#include "gpu-operation-item.h"
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define UNIT_TEST 0
+
+#define DEBUG 0
+
+#include "gpu-print.h"
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+
+
+// Forward trace item ti, together with gpu_trace_produce, to
+// gpu_context_id_map_stream_process for the given context and stream ids.
+static void
+gpu_context_stream_trace(uint32_t context_id, uint32_t stream_id,
+                         gpu_trace_item_t *ti)
+{
+  gpu_context_id_map_stream_process
+    (context_id, stream_id,
+     gpu_trace_produce, ti);
+}
+
+
+// Pass trace item ti, the three timestamps, and the cct node recorded in
+// activity ga to gpu_trace_item_produce.
+static void
+trace_item_set(gpu_trace_item_t *ti, uint64_t submit_time,
+               uint64_t start_time, uint64_t end_time,
+               gpu_activity_t *ga)
+{
+  // only the activity's cct node is consulted here
+  gpu_trace_item_produce
+    (ti, submit_time, start_time, end_time,
+     ga->cct_node);
+}
+
+
+
+//******************************************************************************
+// gpu operations process
+//******************************************************************************
+
+// Process one memcpy operation item: emit a trace item for the activity's
+// context/stream ids, then produce the activity on the item's activity channel.
+static void
+gpu_memcpy_process(gpu_operation_item_t *it)
+{
+  gpu_activity_t *activity = it->activity;
+  gpu_activity_channel_t *target = it->channel;
+
+  // memcpy activities must arrive with a cct node attached
+  assert(activity->cct_node != NULL);
+
+  const uint64_t submitted = activity->details.memcpy.submit_time;
+  const uint64_t began = activity->details.memcpy.start;
+  const uint64_t finished = activity->details.memcpy.end;
+
+  gpu_trace_item_t trace;
+  trace_item_set(&trace, submitted, began, finished, activity);
+  gpu_context_stream_trace(activity->details.memcpy.context_id,
+                           activity->details.memcpy.stream_id, &trace);
+  gpu_activity_channel_produce(target, activity);
+
+  PRINT("Memcpy copy cct_node %p\n", activity->cct_node);
+  PRINT("Memcpy copy kind %u\n", activity->details.memcpy.copyKind);
+  PRINT("Memcpy copy bytes %lu\n", activity->details.memcpy.bytes);
+}
+
+
+// Process one kernel operation item: emit a trace item for the activity's
+// context/stream ids, then produce the activity on the item's activity channel.
+static void
+gpu_kernel_process(gpu_operation_item_t *it)
+{
+  gpu_activity_t *activity = it->activity;
+  gpu_activity_channel_t *target = it->channel;
+
+  const uint64_t submitted = activity->details.kernel.submit_time;
+  const uint64_t began = activity->details.kernel.start;
+  const uint64_t finished = activity->details.kernel.end;
+
+  gpu_trace_item_t trace;
+  trace_item_set(&trace, submitted, began, finished, activity);
+  gpu_context_stream_trace(activity->details.kernel.context_id,
+                           activity->details.kernel.stream_id, &trace);
+
+  gpu_activity_channel_produce(target, activity);
+
+  PRINT("Kernel execution cct_node %p\n", activity->cct_node);
+  PRINT("Kernel execution deviceId %u\n", activity->details.kernel.device_id);
+}
+
+
+// Fallback for activity kinds without a dedicated handler: the only action
+// is a PRINT message naming the kind.
+static void
+gpu_unknown_process(gpu_operation_item_t *it)
+{
+  PRINT("Unknown activity kind %d\n",
+        it->activity->kind);
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Dispatch an operation item to the handler matching its activity kind:
+// memcpy and kernel activities have dedicated handlers, every other kind
+// goes to gpu_unknown_process.
+void
+gpu_operation_item_process(gpu_operation_item_t *it)
+{
+  gpu_activity_t *activity = it->activity;
+
+  // memory copies
+  if (activity->kind == GPU_ACTIVITY_MEMCPY) {
+    gpu_memcpy_process(it);
+    return;
+  }
+
+  // kernel executions
+  if (activity->kind == GPU_ACTIVITY_KERNEL) {
+    gpu_kernel_process(it);
+    return;
+  }
+
+  gpu_unknown_process(it);
+}
+
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.h b/src/tool/hpcrun/gpu/gpu-operation-item-process.h
new file mode 100644
index 0000000000..cd66efe3d2
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.h
@@ -0,0 +1,69 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gpu_operation_item_process_h
+#define gpu_operation_item_process_h
+
+
+
+//******************************************************************************
+// forward type declarations
+//******************************************************************************
+
+typedef struct gpu_operation_item_t gpu_operation_item_t;
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+gpu_operation_item_process
+(
+ gpu_operation_item_t *it
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.c b/src/tool/hpcrun/gpu/gpu-operation-item.c
new file mode 100644
index 0000000000..c26bc879e9
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.c
@@ -0,0 +1,120 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <assert.h>
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define UNIT_TEST 0
+
+#define DEBUG 0
+
+#include "gpu-print.h"
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "gpu-activity.h"
+#include "gpu-channel-item-allocator.h"
+#include "gpu-operation-item.h"
+#include "gpu-operation-channel.h"
+
+
+
+//******************************************************************************
+// interface functions
+//******************************************************************************
+
+// Debug aid: PRINT the activity channel pointer and the activity pointer
+// held by an operation item.
+void
+gpu_operation_item_dump(gpu_operation_item_t *it)
+{
+  PRINT("gpu operation item: activity_channel %p, activity = %p\n",
+        it->channel, it->activity);
+}
+
+
+// Consume one operation item: dump its activity under the "CONSUME" label
+// via gpu_context_activity_dump, then invoke the caller-supplied ap_fn on it.
+void
+gpu_operation_item_consume(gpu_operation_item_fn_t ap_fn,
+                           gpu_operation_item_t *it)
+{
+  // the dump happens before the handler runs
+  gpu_context_activity_dump(it->activity, "CONSUME");
+  (*ap_fn)(it);
+}
+
+
+// Obtain a gpu_operation_item_t for the given operation channel through
+// channel_item_alloc and return the resulting pointer.
+gpu_operation_item_t *
+gpu_operation_item_alloc(gpu_operation_channel_t *channel)
+{
+  gpu_operation_item_t *item = channel_item_alloc(channel, gpu_operation_item_t);
+  return item;
+}
+
+
+// Release operation item `it` with respect to `channel`; all of the work is
+// done by channel_item_free.
+void
+gpu_operation_item_free(gpu_operation_channel_t *channel,
+                        gpu_operation_item_t *it)
+{
+  // nothing is done here beyond the delegated call
+  channel_item_free(channel, it);
+}
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.h b/src/tool/hpcrun/gpu/gpu-operation-item.h
new file mode 100644
index 0000000000..284d437067
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.h
@@ -0,0 +1,131 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gpu_operation_item_h
+#define gpu_operation_item_h
+
+
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <stdint.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/prof-lean/stacks.h>
+#include <hpcrun/utilities/ip-normalized.h>
+
+#include "gpu-activity.h"
+
+
+
+//******************************************************************************
+// forward declarations
+//******************************************************************************
+
+typedef struct gpu_operation_channel_t gpu_operation_channel_t;
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef struct gpu_operation_item_t{  // pairs an activity channel with an activity
+  gpu_activity_channel_t *channel;  // activity channel stored alongside the activity
+  gpu_activity_t *activity;         // activity held by pointer (not copied into the item)
+}gpu_operation_item_t;
+
+// function-pointer type: takes an operation channel, returns nothing
+typedef void (*gpu_operation_fn_t)
+(
+ gpu_operation_channel_t *channel
+);
+
+typedef void (*gpu_operation_item_fn_t)  // takes one operation item, returns nothing
+(
+ gpu_operation_item_t *it
+);
+
+//******************************************************************************
+// interface functions
+//******************************************************************************
+
+void
+gpu_operation_item_consume
+(
+ gpu_operation_item_fn_t ap_fn,
+ gpu_operation_item_t *it
+);
+
+
+gpu_operation_item_t *
+gpu_operation_item_alloc
+(
+ gpu_operation_channel_t *channel
+);
+
+
+void
+gpu_operation_item_free
+(
+ gpu_operation_channel_t *channel,
+ gpu_operation_item_t *a
+);
+
+
+void
+gpu_operation_item_dump
+(
+ gpu_operation_item_t *it
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation.c b/src/tool/hpcrun/gpu/gpu-operation.c
new file mode 100644
index 0000000000..cb5ba633b8
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation.c
@@ -0,0 +1,120 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <assert.h>
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define UNIT_TEST 0
+
+#define DEBUG 0
+
+#include "gpu-print.h"
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "gpu-activity.h"
+#include "gpu-channel-item-allocator.h"
+#include "gpu-operation.h"
+#include "gpu-operation-channel.h"
+
+
+
+//******************************************************************************
+// interface functions
+//******************************************************************************
+
+// Debug aid: PRINT the activity channel pointer and the address of the
+// activity held by an operation item. The activity is stored by value in
+// this item type, so its address is what gets formatted with %p.
+void
+gpu_operation_dump(gpu_operation_item_t *it)
+{
+  PRINT("gpu operation item: activity_channel %p, activity = %p\n", it->channel, &it->activity);
+}
+
+
+// Consume one operation item: dump the activity embedded in it under the
+// "CONSUME" label, then invoke the caller-supplied ap_fn on the item.
+void
+gpu_operation_item_consume(gpu_operation_item_fn_t ap_fn,
+                           gpu_operation_item_t *it)
+{
+  // the activity lives inside the item, hence the address-of
+  gpu_context_activity_dump(&it->activity, "CONSUME");
+  ap_fn(it);
+}
+
+
+// Obtain a gpu_operation_item_t for the given operation channel through
+// channel_item_alloc and return the resulting pointer.
+gpu_operation_item_t *
+gpu_operation_alloc(gpu_operation_channel_t *channel)
+{
+  gpu_operation_item_t *item = channel_item_alloc(channel, gpu_operation_item_t);
+  return item;
+}
+
+
+// Release operation item `it` with respect to `channel`; all of the work is
+// done by channel_item_free.
+void
+gpu_operation_free(gpu_operation_channel_t *channel,
+                   gpu_operation_item_t *it)
+{
+  // nothing is done here beyond the delegated call
+  channel_item_free(channel, it);
+}
diff --git a/src/tool/hpcrun/gpu/gpu-operation.h b/src/tool/hpcrun/gpu/gpu-operation.h
new file mode 100644
index 0000000000..4a3b7e8fe1
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-operation.h
@@ -0,0 +1,131 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gpu_operation_h
+#define gpu_operation_h
+
+
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <stdint.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <lib/prof-lean/stacks.h>
+#include <hpcrun/utilities/ip-normalized.h>
+
+#include "gpu-activity.h"
+
+
+
+//******************************************************************************
+// forward declarations
+//******************************************************************************
+
+typedef struct gpu_operation_channel_t gpu_operation_channel_t;
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef struct gpu_operation_item_t{  // pairs an activity channel with an activity
+  gpu_activity_channel_t *channel;  // activity channel stored alongside the activity
+  gpu_activity_t activity;          // activity stored by value inside the item
+}gpu_operation_item_t;
+
+// function-pointer type: takes an operation channel, returns nothing
+typedef void (*gpu_operation_fn_t)
+(
+ gpu_operation_channel_t *channel
+);
+
+typedef void (*gpu_operation_item_fn_t)  // takes one operation item, returns nothing
+(
+ gpu_operation_item_t *it
+);
+
+//******************************************************************************
+// interface functions
+//******************************************************************************
+
+void
+gpu_operation_item_consume
+(
+ gpu_operation_item_fn_t ap_fn,
+ gpu_operation_item_t *it
+);
+
+
+gpu_operation_item_t *
+gpu_operation_alloc
+(
+ gpu_operation_channel_t *channel
+);
+
+
+void
+gpu_operation_free
+(
+ gpu_operation_channel_t *channel,
+ gpu_operation_item_t *a
+);
+
+
+void
+gpu_operation_dump
+(
+ gpu_operation_item_t *it
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index d6cc192fba..72d58dd4af 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -74,6 +74,7 @@ getMemoryProfileInfo
 )
 {
   memcpy->correlation_id = cb_data->correlation_id;
+  memcpy->submit_time = cb_data->submit_time;
   memcpy->bytes = cb_data->size;
   memcpy->copyKind = (gpu_memcpy_type_t) 
     (cb_data->fromHostToDevice)? GPU_MEMCPY_H2D: 
@@ -94,6 +95,7 @@ convert_kernel_launch
   getTimingInfoFromClEvent(&ga->details.interval, event);
   ga->kind = GPU_ACTIVITY_KERNEL;
   ga->details.kernel.correlation_id = kernel_cb_data->correlation_id;
+  ga->details.kernel.submit_time = kernel_cb_data->submit_time;
 }
 
 
@@ -124,20 +126,20 @@ opencl_activity_translate
 (
   gpu_activity_t *ga,
   cl_event event,
-  void *user_data
+  void *act_data
 )
 {
-  cl_generic_callback_t *cb_data = (cl_generic_callback_t*)user_data;
+  cl_generic_callback_t *cb_data = (cl_generic_callback_t*)act_data;
   opencl_call_t type = cb_data->type;
   switch (type) {
     case kernel:
-      convert_kernel_launch(ga, user_data, event);
+      convert_kernel_launch(ga, act_data, event);
       break;
     case memcpy_H2D:
-      convert_memcpy(ga, user_data, event, GPU_MEMCPY_H2D);
+      convert_memcpy(ga, act_data, event, GPU_MEMCPY_H2D);
       break;
     case memcpy_D2H:
-      convert_memcpy(ga, user_data, event, GPU_MEMCPY_D2H);
+      convert_memcpy(ga, act_data, event, GPU_MEMCPY_D2H);
       break;
     default:
       assert(0);
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 68980eac33..e54ee6010c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -58,6 +58,7 @@
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-activity-process.h>
+#include <hpcrun/gpu/gpu-activity-multiplexer.h>
 #include <hpcrun/gpu/gpu-correlation-channel.h>
 #include <hpcrun/gpu/gpu-correlation-id-map.h>
 #include <hpcrun/gpu/gpu-application-thread-api.h>
@@ -235,26 +236,19 @@ opencl_pending_operations_adjust
 }
 
 
-static void
-opencl_activity_completion_notify
-(
-  void
-)
-{
-  gpu_monitoring_thread_activities_ready();
-}
-
-
 static void
 opencl_activity_process
 (
   cl_event event,
-  void *user_data
+  cl_generic_callback_t *act_data,
+  gpu_activity_channel_t *initiator_channel
 )
 {
   gpu_activity_t gpu_activity;
-  opencl_activity_translate(&gpu_activity, event, user_data);
-  gpu_activity_process(&gpu_activity);
+  opencl_activity_translate(&gpu_activity, event, act_data);
+
+  gpu_activity_multiplexer_push(initiator_channel, &gpu_activity);
+//  gpu_activity_process(&gpu_activity);
 }
 
 
@@ -341,9 +335,10 @@ opencl_subscriber_callback
   hpcrun_safe_exit();
 
   gpu_activity_channel_consume(gpu_metrics_attribute);	
-  uint64_t cpu_submit_time = CPU_NANOTIME();
-  gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, 
-				  cpu_submit_time);
+//  uint64_t cpu_submit_time = CPU_NANOTIME();
+//
+//  gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
+//				  cpu_submit_time);
 }
 
 
@@ -356,12 +351,13 @@ opencl_activity_completion_callback
 )
 {
   cl_int complete_flag = CL_COMPLETE;
-  opencl_object_t *o = (opencl_object_t*)user_data;
+  opencl_object_t *cl_obj = (opencl_object_t*)user_data;
   cl_generic_callback_t *act_data;
-  if (o->kind == OPENCL_KERNEL_CALLBACK) {
-    act_data = (cl_generic_callback_t*) &(o->details.ker_cb);
-  } else if (o->kind == OPENCL_MEMORY_CALLBACK) {
-    act_data = (cl_generic_callback_t*) &(o->details.mem_cb);
+
+  if (cl_obj->kind == OPENCL_KERNEL_CALLBACK) {
+    act_data = (cl_generic_callback_t*) &(cl_obj->details.ker_cb);
+  } else if (cl_obj->kind == OPENCL_MEMORY_CALLBACK) {
+    act_data = (cl_generic_callback_t*) &(cl_obj->details.mem_cb);
   }
   uint64_t correlation_id = act_data->correlation_id;
   opencl_call_t type = act_data->type;
@@ -376,13 +372,13 @@ opencl_activity_completion_callback
     }
     ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", 
 	  opencl_call_to_string(type), correlation_id);
-    opencl_activity_completion_notify();
-    opencl_activity_process(event, act_data);
+
+    opencl_activity_process(event, act_data, cl_obj->details.initiator_channel);
   }
-  if (o->isInternalClEvent) {
+  if (cl_obj->isInternalClEvent) {
     HPCRUN_OPENCL_CALL(clReleaseEvent, (event));
   }
-  opencl_free(o);
+  opencl_free(cl_obj);
   opencl_pending_operations_adjust(-1);
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index ed8e1f3f20..3fa1598d2d 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -54,10 +54,12 @@
 //******************************************************************************
 
 #include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/messages/messages.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
 #include <lib/prof-lean/stdatomic.h>
+#include <lib/prof-lean/usec_time.h>
 
 #include "opencl-api.h"
 #include "opencl-intercept.h"
@@ -69,6 +71,8 @@
 // local data
 //******************************************************************************
 
+#define CPU_NANOTIME() (usec_time() * 1000)
+
 #ifndef HPCRUN_STATIC_LINK
 static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
 static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
@@ -110,7 +114,8 @@ initializeKernelCallBackInfo
 )
 {
   kernel_cb->correlation_id = correlation_id;
-  kernel_cb->type = kernel; 
+  kernel_cb->type = kernel;
+  kernel_cb->submit_time = CPU_NANOTIME();
 }
 
 
@@ -128,6 +133,7 @@ initializeMemoryCallBackInfo
   mem_transfer_cb->size = size;
   mem_transfer_cb->fromHostToDevice = fromHostToDevice;
   mem_transfer_cb->fromDeviceToHost = !fromHostToDevice;
+  mem_transfer_cb->submit_time = CPU_NANOTIME();
 }
 
 
@@ -167,6 +173,8 @@ clEnqueueNDRangeKernel_wrapper
   uint64_t correlation_id = getCorrelationId();
   opencl_object_t *kernel_info = opencl_malloc();
   kernel_info->kind = OPENCL_KERNEL_CALLBACK;
+  kernel_info->details.initiator_channel = gpu_activity_channel_get();
+
   cl_kernel_callback_t *kernel_cb = &(kernel_info->details.ker_cb);
   initializeKernelCallBackInfo(kernel_cb, correlation_id);
   cl_event my_event;
@@ -213,6 +221,8 @@ clEnqueueReadBuffer_wrapper
   uint64_t correlation_id = getCorrelationId();
   opencl_object_t *mem_info = opencl_malloc();
   mem_info->kind = OPENCL_MEMORY_CALLBACK;
+  mem_info->details.initiator_channel = gpu_activity_channel_get();
+
   cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
   initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, false);
   cl_event my_event;
@@ -224,6 +234,7 @@ clEnqueueReadBuffer_wrapper
     eventp = event;
     mem_info->isInternalClEvent = false;
   }
+
   clreadbuffer_t clEnqueueReadBuffer_wrappee = 
     GOTCHA_GET_TYPED_WRAPPEE(clEnqueueReadBuffer_handle, clreadbuffer_t);
   cl_int return_status = 
@@ -263,6 +274,8 @@ clEnqueueWriteBuffer_wrapper
   uint64_t correlation_id = getCorrelationId();
   opencl_object_t *mem_info = opencl_malloc();
   mem_info->kind = OPENCL_MEMORY_CALLBACK;
+  mem_info->details.initiator_channel = gpu_activity_channel_get();
+
   cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
   initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, true);
   cl_event my_event;
@@ -274,6 +287,8 @@ clEnqueueWriteBuffer_wrapper
     eventp = event;
     mem_info->isInternalClEvent = false;
   }
+
+
   clwritebuffer_t clEnqueueWriteBuffer_wrappee = 
     GOTCHA_GET_TYPED_WRAPPEE(clEnqueueWriteBuffer_handle, clwritebuffer_t);
   cl_int return_status = 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 89e4cd3584..5b9389843e 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -123,12 +123,14 @@ typedef enum {
 typedef struct cl_generic_callback_t {
   uint64_t correlation_id;
   opencl_call_t type;
+  uint64_t submit_time;
 } cl_generic_callback_t;
 
 
 typedef struct cl_kernel_callback_t {
   uint64_t correlation_id;
   opencl_call_t type;
+  uint64_t submit_time;
 } cl_kernel_callback_t;
 
 
@@ -138,6 +140,7 @@ typedef struct cl_memory_callback_t {
   bool fromHostToDevice;
   bool fromDeviceToHost;
   size_t size;
+  uint64_t submit_time;
 } cl_memory_callback_t;
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index 33b994af78..d9608b97b8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -68,12 +68,14 @@ typedef enum {
 
 typedef struct opencl_object_channel_t opencl_object_channel_t;
 
+typedef struct gpu_activity_channel_t gpu_activity_channel_t;
 
 typedef struct opencl_object_details_t {
   union {
     cl_kernel_callback_t ker_cb;
     cl_memory_callback_t mem_cb;
   };
+  gpu_activity_channel_t *initiator_channel;
 } opencl_object_details_t;
 
 

From 715c4dd8cca1a0dd7578dcbdeba988ffe61f895f Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 23 Sep 2020 18:56:29 -0500
Subject: [PATCH 038/177] refactoring opencl-api interface

---
 .../hpcrun/gpu/gpu-activity-multiplexer.c     |  4 +-
 .../hpcrun/gpu/gpu-activity-multiplexer.h     |  2 +-
 .../hpcrun/gpu/gpu-operation-channel-set.c    |  4 +-
 .../hpcrun/gpu/gpu-operation-channel-set.h    |  6 +-
 src/tool/hpcrun/gpu/gpu-operation-channel.c   |  5 -
 src/tool/hpcrun/gpu/gpu-operation-channel.h   |  2 +-
 .../hpcrun/gpu/gpu-operation-item-process.c   |  5 +-
 src/tool/hpcrun/gpu/gpu-operation-item.h      |  2 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 61 +++++++++----
 src/tool/hpcrun/gpu/opencl/opencl-api.h       | 15 ++-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 91 ++++++-------------
 .../hpcrun/gpu/opencl/opencl-memory-manager.c |  2 +-
 .../hpcrun/gpu/opencl/opencl-memory-manager.h | 10 ++
 13 files changed, 107 insertions(+), 102 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 8d28a965ca..3d8cf747c2 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -154,7 +154,7 @@ void
 void
 gpu_activity_multiplexer_push
 (
-gpu_activity_channel_t *gpu_channel,
+gpu_activity_channel_t *initiator_channel,
 gpu_activity_t *gpu_activity
 )
 {
@@ -167,7 +167,7 @@ gpu_activity_t *gpu_activity
     gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_set_id);
   }
 
-  gpu_operation_item_t item = (gpu_operation_item_t){.channel=gpu_channel, .activity=gpu_activity};
+  gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=gpu_activity};
   gpu_operation_channel_produce(gpu_operation_channel, &item);
 
 //  atomic_fetch_add(&operation_stream_counter, +1);
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
index 97c1a0ccd4..1498a96a3d 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
@@ -73,7 +73,7 @@ gpu_activity_multiplexer_fini(void);
 void
 gpu_activity_multiplexer_push
 (
-gpu_activity_channel_t *gpu_channel,
+gpu_activity_channel_t *initiator_channel,
 gpu_activity_t *gpu_activity
 );
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index 82fbbe21ed..4697eb0d0a 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -127,7 +127,7 @@ channel_forone
 {
   gpu_operation_channel_t *channel = se->channel;
 
-  gpu_operation_fn_t channel_fn = (gpu_operation_fn_t) arg;
+  gpu_operation_channel_fn_t channel_fn = (gpu_operation_channel_fn_t) arg;
 
   channel_fn(channel);
 }
@@ -136,7 +136,7 @@ channel_forone
 static void
 gpu_operation_channel_set_forall
 (
- gpu_operation_fn_t channel_fn,
+ gpu_operation_channel_fn_t channel_fn,
  int set_index
 )
 {
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
index 054718c299..8917496573 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
@@ -53,7 +53,7 @@
 
 typedef struct gpu_operation_channel_t gpu_operation_channel_t;
 
-//typedef struct gpu_operation_fn_t gpu_operation_fn_t;
+//typedef struct gpu_operation_channel_fn_t gpu_operation_channel_fn_t;
 
 //******************************************************************************
 // type declarations
@@ -64,10 +64,6 @@ typedef void (*gpu_operation_channel_fn_t)
  gpu_operation_channel_t *channel
 );
 
-//typedef void (*gpu_operation_fn_t)
-//(
-// gpu_operation_t *a
-//);
 
 
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index 3513428424..6a2a95d645 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -79,11 +79,6 @@
 #define channel_steal \
   typed_bichannel_steal(gpu_operation_item_t)
 
-#define gpu_operation_item_alloc(channel)		\
-  channel_item_alloc(channel, gpu_operation_item_t)
-
-#define gpu_operation_item_free(channel, item)	\
-  channel_item_free(channel, item)
 
 
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.h b/src/tool/hpcrun/gpu/gpu-operation-channel.h
index f31ed020f8..d197f49dca 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.h
@@ -50,7 +50,6 @@
 
 #include <lib/prof-lean/bichannel.h>
 
-//#include "gpu-operation-item.h"
 
 
 //******************************************************************************
@@ -62,6 +61,7 @@ typedef struct gpu_operation_item_t gpu_operation_item_t;
 typedef struct gpu_operation_channel_t gpu_operation_channel_t;
 
 
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index 3290eba6fc..4f0a6f1e54 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -59,9 +59,8 @@
 #include <hpcrun/gpu/gpu-trace-item.h>
 #include <hpcrun/gpu/gpu-context-id-map.h>
 
-#include "gpu-operation-item-process.h"
 #include "gpu-operation-item.h"
-
+#include "gpu-operation-item-process.h"
 
 
 //******************************************************************************
@@ -80,8 +79,6 @@
 // private operations
 //******************************************************************************
 
-
-
 static void
 gpu_context_stream_trace
 (
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.h b/src/tool/hpcrun/gpu/gpu-operation-item.h
index 284d437067..291688dd79 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.h
@@ -83,7 +83,7 @@ typedef struct gpu_operation_item_t{
 }gpu_operation_item_t;
 
 
-typedef void (*gpu_operation_fn_t)
+typedef void (*gpu_operation_channel_fn_t)
 (
  gpu_operation_channel_t *channel
 );
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index e54ee6010c..4807a43e3a 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -178,6 +178,8 @@
 // local data
 //******************************************************************************
 
+static atomic_long correlation_id_counter;
+
 //----------------------------------------------------------
 // opencl function pointers for late binding
 //----------------------------------------------------------
@@ -226,6 +228,16 @@ static atomic_ullong opencl_pending_operations;
 // private operations
 //******************************************************************************
 
+static uint64_t
+getCorrelationId
+(
+void
+)
+{
+  return atomic_fetch_add(&correlation_id_counter, 1);
+}
+
+
 static void
 opencl_pending_operations_adjust
 (
@@ -296,13 +308,24 @@ opencl_error_report
 // interface operations
 //******************************************************************************
 
+void
+opencl_initialize_correlation_id
+(
+ void
+)
+{
+  atomic_store(&correlation_id_counter, 0);
+}
+
 void
 opencl_subscriber_callback
 (
-  opencl_call_t type,
-  uint64_t correlation_id
+  opencl_object_t *cb_info
 )
 {
+
+  uint64_t correlation_id = getCorrelationId();
+
   opencl_pending_operations_adjust(1);
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
   gpu_op_ccts_t gpu_op_ccts;
@@ -310,16 +333,21 @@ opencl_subscriber_callback
   cct_node_t *api_node = 
     gpu_application_thread_correlation_callback(correlation_id);
 
-  switch (type) {
-    case memcpy_H2D:
-      gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_copyin);
-      break;
-    case memcpy_D2H:
-      gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_copyout);
+
+  switch (cb_info->kind) {
+
+    case OPENCL_MEMORY_CALLBACK:
+      if (cb_info->details.mem_cb.type == memcpy_H2D){
+        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
+                                       gpu_placeholder_type_copyin);
+
+      }else if (cb_info->details.mem_cb.type == memcpy_D2H){
+        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
+                                       gpu_placeholder_type_copyout);
+      }
       break;
-    case kernel:
+
+    case OPENCL_KERNEL_CALLBACK:
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
 				   gpu_placeholder_type_kernel);
 
@@ -334,11 +362,12 @@ opencl_subscriber_callback
   gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
   hpcrun_safe_exit();
 
-  gpu_activity_channel_consume(gpu_metrics_attribute);	
-//  uint64_t cpu_submit_time = CPU_NANOTIME();
-//
-//  gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
-//				  cpu_submit_time);
+  gpu_activity_channel_consume(gpu_metrics_attribute);
+
+  cb_info->details.cct_node = api_node;
+  cb_info->details.initiator_channel = gpu_activity_channel_get();
+  cb_info->details.submit_time = CPU_NANOTIME();
+
 }
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index d4ef7b8127..352ba3dda6 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -57,15 +57,26 @@
 
 
 
+//************************ Forward Declarations ******************************
+
+typedef struct opencl_object_t opencl_object_t;
+
+
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
+void
+opencl_initialize_correlation_id
+(
+  void
+);
+
 void
 opencl_subscriber_callback
 (
-  opencl_call_t,
-  uint64_t
+  opencl_object_t *cb_info
 );
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 3fa1598d2d..34b2c7abc9 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -71,14 +71,13 @@
 // local data
 //******************************************************************************
 
-#define CPU_NANOTIME() (usec_time() * 1000)
+
 
 #ifndef HPCRUN_STATIC_LINK
 static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
 static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
 static gotcha_wrappee_handle_t clEnqueueReadBuffer_handle;
 static gotcha_wrappee_handle_t clEnqueueWriteBuffer_handle;
-static atomic_long correlation_id;
 
 
 
@@ -86,54 +85,32 @@ static atomic_long correlation_id;
 // private operations
 //******************************************************************************
 
-static void
-opencl_intercept_initialize
-(
-  void
-)
-{
-  atomic_store(&correlation_id, 0);
-}
-
-
-static uint64_t
-getCorrelationId
-(
-  void
-)
-{
-  return atomic_fetch_add(&correlation_id, 1);
-}
-
-
 static void
 initializeKernelCallBackInfo
 (
-  cl_kernel_callback_t *kernel_cb,
-  uint64_t correlation_id
+ opencl_object_t *kernel_info
 )
 {
-  kernel_cb->correlation_id = correlation_id;
-  kernel_cb->type = kernel;
-  kernel_cb->submit_time = CPU_NANOTIME();
+  kernel_info->kind = OPENCL_KERNEL_CALLBACK;
+  kernel_info->details.ker_cb.type = kernel;
+
 }
 
 
 static void
 initializeMemoryCallBackInfo
 (
-  cl_memory_callback_t *mem_transfer_cb,
-  uint64_t correlation_id,
-  size_t size,
-  bool fromHostToDevice
+  opencl_object_t *mem_info,
+  opencl_call_t type,
+  size_t size
 )
 {
-  mem_transfer_cb->correlation_id = correlation_id;
-  mem_transfer_cb->type = (fromHostToDevice) ? memcpy_H2D: memcpy_D2H; 
-  mem_transfer_cb->size = size;
-  mem_transfer_cb->fromHostToDevice = fromHostToDevice;
-  mem_transfer_cb->fromDeviceToHost = !fromHostToDevice;
-  mem_transfer_cb->submit_time = CPU_NANOTIME();
+  mem_info->kind = OPENCL_MEMORY_CALLBACK;
+  mem_info->details.mem_cb.type = type;
+  mem_info->details.mem_cb.fromHostToDevice = (type == memcpy_H2D);
+  mem_info->details.mem_cb.fromDeviceToHost = (type == memcpy_D2H);
+  mem_info->details.mem_cb.size = size;
+
 }
 
 
@@ -170,13 +147,11 @@ clEnqueueNDRangeKernel_wrapper
   cl_event *event
 )
 {
-  uint64_t correlation_id = getCorrelationId();
   opencl_object_t *kernel_info = opencl_malloc();
-  kernel_info->kind = OPENCL_KERNEL_CALLBACK;
-  kernel_info->details.initiator_channel = gpu_activity_channel_get();
+  initializeKernelCallBackInfo(kernel_info);
+
+  opencl_subscriber_callback(kernel_info);
 
-  cl_kernel_callback_t *kernel_cb = &(kernel_info->details.ker_cb);
-  initializeKernelCallBackInfo(kernel_cb, correlation_id);
   cl_event my_event;
   cl_event *eventp;
   if (!event) {
@@ -195,10 +170,9 @@ clEnqueueNDRangeKernel_wrapper
 				   event_wait_list, eventp);
 
   ETMSG(OPENCL, "registering callback for type: kernel. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+	"Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id);
 
-  opencl_subscriber_callback(kernel_cb->type, kernel_cb->correlation_id);
-  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
 			     &opencl_activity_completion_callback, kernel_info);
   return return_status;
 }
@@ -218,13 +192,11 @@ clEnqueueReadBuffer_wrapper
   cl_event *event
 )
 {
-  uint64_t correlation_id = getCorrelationId();
   opencl_object_t *mem_info = opencl_malloc();
-  mem_info->kind = OPENCL_MEMORY_CALLBACK;
-  mem_info->details.initiator_channel = gpu_activity_channel_get();
+  initializeMemoryCallBackInfo(mem_info, memcpy_D2H, cb);
+
+  opencl_subscriber_callback(mem_info);
 
-  cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
-  initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, false);
   cl_event my_event;
   cl_event *eventp;
   if (!event) {
@@ -243,12 +215,10 @@ clEnqueueReadBuffer_wrapper
 				event_wait_list, eventp);
 
   ETMSG(OPENCL, "registering callback for type: D2H. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+	"Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
   ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", 
 	(long)cb);
 
-  opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
 			     &opencl_activity_completion_callback, mem_info);
@@ -271,13 +241,12 @@ clEnqueueWriteBuffer_wrapper
   cl_event *event
 )
 {
-  uint64_t correlation_id = getCorrelationId();
+
   opencl_object_t *mem_info = opencl_malloc();
-  mem_info->kind = OPENCL_MEMORY_CALLBACK;
-  mem_info->details.initiator_channel = gpu_activity_channel_get();
+  initializeMemoryCallBackInfo(mem_info, memcpy_H2D, cb);
+
+  opencl_subscriber_callback(mem_info);
 
-  cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
-  initializeMemoryCallBackInfo(mem_transfer_cb, correlation_id, cb, true);
   cl_event my_event;
   cl_event *eventp;
   if (!event) {
@@ -297,13 +266,11 @@ clEnqueueWriteBuffer_wrapper
 				 event_wait_list, eventp);
 
   ETMSG(OPENCL, "registering callback for type: H2D. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+	"Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
 
   ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", 
 	(long)cb);
 
-  opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
 			     &opencl_activity_completion_callback, 
@@ -360,7 +327,7 @@ opencl_intercept_setup
 #ifndef HPCRUN_STATIC_LINK
   ETMSG(OPENCL, "setting up opencl intercepts");
   gotcha_wrap(opencl_bindings, 4, "opencl_bindings");
-  opencl_intercept_initialize();
+  opencl_initialize_correlation_id();
 #endif
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
index d7895cfd4f..9ee0b3edb1 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
@@ -130,6 +130,6 @@ opencl_free
 )
 {
   memset(o, 0, sizeof(opencl_object_t));
-  opencl_object_channel_t *c = &(o->channel);
+  opencl_object_channel_t *c = o->channel;
   channel_item_free(c, o);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index d9608b97b8..4f6a11d141 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -56,6 +56,14 @@
 
 
 
+
+//************************ Forward Declarations ******************************
+
+typedef struct cct_node_t cct_node_t;
+
+
+
+
 //******************************************************************************
 // type declarations
 //******************************************************************************
@@ -76,6 +84,8 @@ typedef struct opencl_object_details_t {
     cl_memory_callback_t mem_cb;
   };
   gpu_activity_channel_t *initiator_channel;
+  cct_node_t *cct_node;
+  uint64_t submit_time;
 } opencl_object_details_t;
 
 

From 184e54bb7dc4bde6e61ac834d5fdac22c07f931f Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 23 Sep 2020 20:22:11 -0500
Subject: [PATCH 039/177] refactoring opencl-translate

---
 .../gpu/opencl/opencl-activity-translate.c    | 63 ++++++++-----------
 .../gpu/opencl/opencl-activity-translate.h    | 17 ++---
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 23 ++++---
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  4 +-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |  4 +-
 5 files changed, 49 insertions(+), 62 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 72d58dd4af..9869d183f4 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -66,36 +66,22 @@
 // private operations
 //******************************************************************************
 
-static void
-getMemoryProfileInfo
-(
-  gpu_memcpy_t *memcpy,
-  cl_memory_callback_t *cb_data
-)
-{
-  memcpy->correlation_id = cb_data->correlation_id;
-  memcpy->submit_time = cb_data->submit_time;
-  memcpy->bytes = cb_data->size;
-  memcpy->copyKind = (gpu_memcpy_type_t) 
-    (cb_data->fromHostToDevice)? GPU_MEMCPY_H2D: 
-    (cb_data->fromDeviceToHost? GPU_MEMCPY_D2H:	GPU_MEMCPY_UNK);
-}
-
-
 static void
 convert_kernel_launch
 (
   gpu_activity_t *ga,
-  void *user_data,
+  opencl_object_t *cb_data,
   cl_event event
 )
 {
-  cl_kernel_callback_t *kernel_cb_data = (cl_kernel_callback_t*)user_data;
   memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
   getTimingInfoFromClEvent(&ga->details.interval, event);
-  ga->kind = GPU_ACTIVITY_KERNEL;
-  ga->details.kernel.correlation_id = kernel_cb_data->correlation_id;
-  ga->details.kernel.submit_time = kernel_cb_data->submit_time;
+
+  ga->kind     = cb_data->kind;
+  ga->cct_node = cb_data->details.cct_node;
+
+  ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
+  ga->details.kernel.submit_time    = cb_data->details.submit_time;
 }
 
 
@@ -103,16 +89,20 @@ static void
 convert_memcpy
 (
   gpu_activity_t *ga,
-  void *user_data,
-  cl_event event,
-  gpu_memcpy_type_t kind
+  opencl_object_t *cb_data,
+  cl_event event
 )
 {
-  cl_memory_callback_t *memory_cb_data = (cl_memory_callback_t*)user_data;
   memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
   getTimingInfoFromClEvent(&ga->details.interval, event);
-  getMemoryProfileInfo(&ga->details.memcpy, memory_cb_data);
-  ga->kind = GPU_ACTIVITY_MEMCPY;
+
+  ga->kind     = cb_data->kind;
+  ga->cct_node = cb_data->details.cct_node;
+
+  ga->details.memcpy.correlation_id  = cb_data->details.mem_cb.correlation_id;
+  ga->details.memcpy.submit_time     = cb_data->details.submit_time;
+  ga->details.memcpy.bytes           = cb_data->details.mem_cb.size;
+  ga->details.memcpy.copyKind        = cb_data->kind;
 }
 
 
@@ -126,21 +116,18 @@ opencl_activity_translate
 (
   gpu_activity_t *ga,
   cl_event event,
-  void *act_data
+  opencl_object_t *cb_data
 )
 {
-  cl_generic_callback_t *cb_data = (cl_generic_callback_t*)act_data;
-  opencl_call_t type = cb_data->type;
-  switch (type) {
-    case kernel:
-      convert_kernel_launch(ga, act_data, event);
-      break;
-    case memcpy_H2D:
-      convert_memcpy(ga, act_data, event, GPU_MEMCPY_H2D);
+  switch (cb_data->kind) {
+    case OPENCL_MEMORY_CALLBACK:
+      convert_memcpy(ga, cb_data, event);
       break;
-    case memcpy_D2H:
-      convert_memcpy(ga, act_data, event, GPU_MEMCPY_D2H);
+
+    case OPENCL_KERNEL_CALLBACK:
+      convert_kernel_launch(ga, cb_data, event);
       break;
+
     default:
       assert(0);
   }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index 6c0f6f257d..824b2920f9 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -41,8 +41,8 @@
 //
 // ******************************************************* EndRiceCopyright *
 
-#ifndef _OPENCL_ACTIVITY_TRANSLATE_H_
-#define _OPENCL_ACTIVITY_TRANSLATE_H_
+#ifndef opencl_activity_translate_h
+#define opencl_activity_translate_h
 
 
 
@@ -55,6 +55,10 @@
 
 
 
+//*************************** Forward Declarations **************************
+
+typedef struct opencl_object_t opencl_object_t;
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -62,11 +66,10 @@
 void
 opencl_activity_translate
 (
-  gpu_activity_t *,
-  cl_event,
-  void *
+  gpu_activity_t *ga,
+  cl_event event,
+  opencl_object_t *cb_data
 );
 
 
-
-#endif  //_OPENCL_ACTIVITY_TRANSLATE_H_
+#endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 4807a43e3a..892c5b0737 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -252,14 +252,13 @@ static void
 opencl_activity_process
 (
   cl_event event,
-  cl_generic_callback_t *act_data,
-  gpu_activity_channel_t *initiator_channel
+  opencl_object_t *cb_data
 )
 {
   gpu_activity_t gpu_activity;
-  opencl_activity_translate(&gpu_activity, event, act_data);
+  opencl_activity_translate(&gpu_activity, event, cb_data);
 
-  gpu_activity_multiplexer_push(initiator_channel, &gpu_activity);
+  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
 //  gpu_activity_process(&gpu_activity);
 }
 
@@ -380,13 +379,13 @@ opencl_activity_completion_callback
 )
 {
   cl_int complete_flag = CL_COMPLETE;
-  opencl_object_t *cl_obj = (opencl_object_t*)user_data;
+  opencl_object_t *cb_data = (opencl_object_t*)user_data;
   cl_generic_callback_t *act_data;
 
-  if (cl_obj->kind == OPENCL_KERNEL_CALLBACK) {
-    act_data = (cl_generic_callback_t*) &(cl_obj->details.ker_cb);
-  } else if (cl_obj->kind == OPENCL_MEMORY_CALLBACK) {
-    act_data = (cl_generic_callback_t*) &(cl_obj->details.mem_cb);
+  if (cb_data->kind == OPENCL_KERNEL_CALLBACK) {
+    act_data = (cl_generic_callback_t*) &(cb_data->details.ker_cb);
+  } else if (cb_data->kind == OPENCL_MEMORY_CALLBACK) {
+    act_data = (cl_generic_callback_t*) &(cb_data->details.mem_cb);
   }
   uint64_t correlation_id = act_data->correlation_id;
   opencl_call_t type = act_data->type;
@@ -402,12 +401,12 @@ opencl_activity_completion_callback
     ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", 
 	  opencl_call_to_string(type), correlation_id);
 
-    opencl_activity_process(event, act_data, cl_obj->details.initiator_channel);
+    opencl_activity_process(event, cb_data);
   }
-  if (cl_obj->isInternalClEvent) {
+  if (cb_data->isInternalClEvent) {
     HPCRUN_OPENCL_CALL(clReleaseEvent, (event));
   }
-  opencl_free(cl_obj);
+  opencl_free(cb_data);
   opencl_pending_operations_adjust(-1);
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index 352ba3dda6..4e874485db 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -54,12 +54,12 @@
 #include <lib/prof-lean/hpcrun-opencl.h>
 
 #include "opencl-intercept.h"
-
+#include "opencl-memory-manager.h"
 
 
 //************************ Forward Declarations ******************************
 
-typedef struct opencl_object_t opencl_object_t;
+//typedef struct opencl_object_t opencl_object_t;
 
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 5b9389843e..74c1e30024 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -123,24 +123,22 @@ typedef enum {
 typedef struct cl_generic_callback_t {
   uint64_t correlation_id;
   opencl_call_t type;
-  uint64_t submit_time;
 } cl_generic_callback_t;
 
 
 typedef struct cl_kernel_callback_t {
   uint64_t correlation_id;
   opencl_call_t type;
-  uint64_t submit_time;
 } cl_kernel_callback_t;
 
 
 typedef struct cl_memory_callback_t {
   uint64_t correlation_id;
   opencl_call_t type;
+  uint64_t submit_time;
   bool fromHostToDevice;
   bool fromDeviceToHost;
   size_t size;
-  uint64_t submit_time;
 } cl_memory_callback_t;
 
 

From 251ebc29c4afe9efd3524e77d42d211221c5a6e7 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Thu, 24 Sep 2020 00:14:32 -0500
Subject: [PATCH 040/177] using the same enum for gpu_activity and
 opencl_activity

---
 .../hpcrun/gpu/gpu-activity-multiplexer.c     |  9 ++---
 src/tool/hpcrun/gpu/gpu-activity.h            |  4 +--
 .../hpcrun/gpu/gpu-operation-channel-set.c    |  1 +
 src/tool/hpcrun/gpu/gpu-operation-channel.c   | 34 ++++++++++++++++++-
 src/tool/hpcrun/gpu/gpu-operation-channel.h   |  5 +++
 .../gpu/opencl/opencl-activity-translate.c    |  5 +--
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 28 +++++++++------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 16 ++++-----
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h | 14 ++------
 .../hpcrun/gpu/opencl/opencl-memory-manager.c |  2 +-
 .../hpcrun/gpu/opencl/opencl-memory-manager.h | 12 ++++---
 11 files changed, 82 insertions(+), 48 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 3d8cf747c2..3aaaf36e82 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -93,14 +93,11 @@ void
     for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
       gpu_operation_channel_set_consume(set_index);
     }
-//    pthread_cond_timedwait
   }
 
-
-//  for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
-//    gpu_trace_operation_set_release(set_index);
-//  }
-
+  for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
+    gpu_operation_channel_set_consume(set_index);
+  }
 
   return NULL;
 }
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index c1a1d005fb..8e08df20d5 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -207,9 +207,9 @@ typedef struct gpu_mem_t {
 
 // gpu_mem_t is a prefix 
 typedef struct gpu_memcpy_t {
-  uint64_t submit_time;
   uint64_t start;
   uint64_t end;
+  uint64_t submit_time;
   uint64_t bytes;
   uint32_t correlation_id;
   uint32_t context_id;
@@ -242,9 +242,9 @@ typedef struct gpu_memset_t {
 
 // gpu kernel execution
 typedef struct gpu_kernel_t {
-  uint64_t submit_time;
   uint64_t start;
   uint64_t end;
+  uint64_t submit_time;
   uint32_t correlation_id;
   uint32_t device_id;
   uint32_t context_id;
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index 4697eb0d0a..aab832ca46 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -183,6 +183,7 @@ gpu_operation_channel_set_consume
 )
 {
   gpu_operation_channel_set_forall(gpu_operation_channel_consume, set_index);
+  gpu_operation_channel_set_forall(gpu_operation_channel_wait, set_index);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index 6a2a95d645..b65d580ca2 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -41,6 +41,12 @@
 //
 // ******************************************************* EndRiceCopyright *
 
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <pthread.h>
+
 //******************************************************************************
 // local includes
 //******************************************************************************
@@ -79,6 +85,8 @@
 #define channel_steal \
   typed_bichannel_steal(gpu_operation_item_t)
 
+#define SECONDS_UNTIL_WAKEUP 2
+
 
 
 //******************************************************************************
@@ -87,6 +95,8 @@
 
 typedef struct gpu_operation_channel_t {
   bistack_t bistacks[2];
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
 } gpu_operation_channel_t;
 
 
@@ -150,6 +160,8 @@ gpu_operation_channel_produce
   gpu_operation_item_t *channel_op = gpu_operation_item_alloc(channel);
   *channel_op = *it;
 
+  printf("\nPRODUCE: channel = %p || return_channel = %p -> activity = %p\n\n", channel, it->channel, it->activity);
+
   channel_push(channel, bichannel_direction_forward, channel_op);
 }
 
@@ -170,8 +182,28 @@ gpu_operation_channel_consume
   // consume all elements enqueued before this function was called
   for (;;) {
     gpu_operation_item_t *it = channel_pop(channel, bichannel_direction_forward);
-    if (!it) break;
+    // stop on an empty channel or an incomplete item; test it before any dereference
+    if (!it || !it->activity || !it->channel) {
+      break;
+    }
+    printf("\n---------CONSUME: op_channel = %p || channel = %p , activity = %p\n", channel, it->channel, it->activity);
     gpu_operation_item_consume(gpu_operation_item_process, it);
     gpu_operation_item_free(channel, it);
   }
 }
+
+
+void
+gpu_operation_channel_wait
+(
+gpu_operation_channel_t *channel
+)
+{
+  struct timespec time;
+  clock_gettime(CLOCK_REALTIME, &time); // absolute deadline starts from the current time
+  time.tv_sec += SECONDS_UNTIL_WAKEUP;
+  // wait for a signal or until the deadline; the mutex must be held around the wait
+  pthread_mutex_lock(&channel->mutex);
+  pthread_cond_timedwait(&channel->cond, &channel->mutex, &time);
+  pthread_mutex_unlock(&channel->mutex);
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.h b/src/tool/hpcrun/gpu/gpu-operation-channel.h
index d197f49dca..aade93fa20 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.h
@@ -87,6 +87,11 @@ gpu_operation_channel_consume
  gpu_operation_channel_t *channel
 );
 
+void
+gpu_operation_channel_wait
+(
+ gpu_operation_channel_t *channel
+);
 
 
 #endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 9869d183f4..08a74173b8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -82,6 +82,7 @@ convert_kernel_launch
 
   ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
   ga->details.kernel.submit_time    = cb_data->details.submit_time;
+
 }
 
 
@@ -120,11 +121,11 @@ opencl_activity_translate
 )
 {
   switch (cb_data->kind) {
-    case OPENCL_MEMORY_CALLBACK:
+    case GPU_ACTIVITY_MEMCPY:
       convert_memcpy(ga, cb_data, event);
       break;
 
-    case OPENCL_KERNEL_CALLBACK:
+    case GPU_ACTIVITY_KERNEL:
       convert_kernel_launch(ga, cb_data, event);
       break;
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 892c5b0737..066c57e571 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -77,6 +77,7 @@
 
 
 
+
 //******************************************************************************
 // macros
 //******************************************************************************
@@ -145,9 +146,8 @@
   macro(CL_INVALID_DEVICE_PARTITION_COUNT)
 
 #define FORALL_OPENCL_CALLS(macro)					\
-  macro(memcpy_H2D)							\
-  macro(memcpy_D2H)							\
-  macro(kernel)
+  macro(GPU_MEMCPY_H2D)							\
+  macro(GPU_MEMCPY_D2H)
 
 #define CODE_TO_STRING(e) case e: return #e;
 
@@ -278,7 +278,7 @@ opencl_wait_for_pending_operations
 static const char*
 opencl_call_to_string
 (
-  opencl_call_t type
+  gpu_memcpy_type_t type
 )
 {
   switch (type)
@@ -335,18 +335,20 @@ opencl_subscriber_callback
 
   switch (cb_info->kind) {
 
-    case OPENCL_MEMORY_CALLBACK:
-      if (cb_info->details.mem_cb.type == memcpy_H2D){
+    case GPU_ACTIVITY_MEMCPY:
+      cb_info->details.mem_cb.correlation_id = correlation_id;
+      if (cb_info->details.mem_cb.type == GPU_MEMCPY_H2D){
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
                                        gpu_placeholder_type_copyin);
 
-      }else if (cb_info->details.mem_cb.type == memcpy_D2H){
+      }else if (cb_info->details.mem_cb.type == GPU_MEMCPY_D2H){
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
                                        gpu_placeholder_type_copyout);
       }
       break;
 
-    case OPENCL_KERNEL_CALLBACK:
+    case GPU_ACTIVITY_KERNEL:
+      cb_info->details.ker_cb.correlation_id = correlation_id;
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
 				   gpu_placeholder_type_kernel);
 
@@ -363,6 +365,7 @@ opencl_subscriber_callback
 
   gpu_activity_channel_consume(gpu_metrics_attribute);
 
+
   cb_info->details.cct_node = api_node;
   cb_info->details.initiator_channel = gpu_activity_channel_get();
   cb_info->details.submit_time = CPU_NANOTIME();
@@ -382,13 +385,13 @@ opencl_activity_completion_callback
   opencl_object_t *cb_data = (opencl_object_t*)user_data;
   cl_generic_callback_t *act_data;
 
-  if (cb_data->kind == OPENCL_KERNEL_CALLBACK) {
+  if (cb_data->kind == GPU_ACTIVITY_KERNEL) {
     act_data = (cl_generic_callback_t*) &(cb_data->details.ker_cb);
-  } else if (cb_data->kind == OPENCL_MEMORY_CALLBACK) {
+  } else if (cb_data->kind == GPU_ACTIVITY_MEMCPY) {
     act_data = (cl_generic_callback_t*) &(cb_data->details.mem_cb);
   }
   uint64_t correlation_id = act_data->correlation_id;
-  opencl_call_t type = act_data->type;
+  gpu_memcpy_type_t type = act_data->type;
 
   if (event_command_exec_status == complete_flag) {
     gpu_correlation_id_map_entry_t *cid_map_entry = 
@@ -492,5 +495,8 @@ opencl_api_finalize
 )
 {
   opencl_wait_for_pending_operations();
+  gpu_activity_multiplexer_fini();
+
   gpu_application_thread_process_activities();
+
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 34b2c7abc9..778b835e4b 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -91,9 +91,7 @@ initializeKernelCallBackInfo
  opencl_object_t *kernel_info
 )
 {
-  kernel_info->kind = OPENCL_KERNEL_CALLBACK;
-  kernel_info->details.ker_cb.type = kernel;
-
+  kernel_info->kind = GPU_ACTIVITY_KERNEL;
 }
 
 
@@ -101,14 +99,14 @@ static void
 initializeMemoryCallBackInfo
 (
   opencl_object_t *mem_info,
-  opencl_call_t type,
+  gpu_memcpy_type_t type,
   size_t size
 )
 {
-  mem_info->kind = OPENCL_MEMORY_CALLBACK;
+  mem_info->kind = GPU_ACTIVITY_MEMCPY;
   mem_info->details.mem_cb.type = type;
-  mem_info->details.mem_cb.fromHostToDevice = (type == memcpy_H2D);
-  mem_info->details.mem_cb.fromDeviceToHost = (type == memcpy_D2H);
+  mem_info->details.mem_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
+  mem_info->details.mem_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
   mem_info->details.mem_cb.size = size;
 
 }
@@ -193,7 +191,7 @@ clEnqueueReadBuffer_wrapper
 )
 {
   opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, memcpy_D2H, cb);
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb);
 
   opencl_subscriber_callback(mem_info);
 
@@ -243,7 +241,7 @@ clEnqueueWriteBuffer_wrapper
 {
 
   opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, memcpy_H2D, cb);
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb);
 
   opencl_subscriber_callback(mem_info);
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 74c1e30024..3069c44d47 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -113,29 +113,21 @@ typedef cl_int (*clwritebuffer_t)(
 );
 
 
-typedef enum {
-  memcpy_H2D                      = 0,
-  memcpy_D2H                      = 1,
-  kernel                          = 2
-} opencl_call_t;
-
-
 typedef struct cl_generic_callback_t {
   uint64_t correlation_id;
-  opencl_call_t type;
+  gpu_memcpy_type_t type;
 } cl_generic_callback_t;
 
 
 typedef struct cl_kernel_callback_t {
   uint64_t correlation_id;
-  opencl_call_t type;
+  gpu_memcpy_type_t type;
 } cl_kernel_callback_t;
 
 
 typedef struct cl_memory_callback_t {
   uint64_t correlation_id;
-  opencl_call_t type;
-  uint64_t submit_time;
+  gpu_memcpy_type_t type;
   bool fromHostToDevice;
   bool fromDeviceToHost;
   size_t size;
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
index 9ee0b3edb1..d7895cfd4f 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
@@ -130,6 +130,6 @@ opencl_free
 )
 {
   memset(o, 0, sizeof(opencl_object_t));
-  opencl_object_channel_t *c = o->channel;
+  opencl_object_channel_t *c = &(o->channel); // NOTE(review): o was zeroed by the memset above and this is the address of the pointer field, not the channel it pointed to — confirm intent
   channel_item_free(c, o);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index 4f6a11d141..b86666fa0d 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -51,6 +51,7 @@
 //******************************************************************************
 
 #include <lib/prof-lean/bistack.h>
+#include <hpcrun/gpu/gpu-activity.h>
 
 #include "opencl-intercept.h"
 
@@ -68,10 +69,11 @@ typedef struct cct_node_t cct_node_t;
 // type declarations
 //******************************************************************************
 
-typedef enum {
-  OPENCL_KERNEL_CALLBACK                     = 0,
-  OPENCL_MEMORY_CALLBACK                     = 1
-} opencl_object_kind_t;
+//This must be the same as gpu_activity_kind_t
+//typedef enum {
+//  GPU_ACTIVITY_KERNEL                     = 1,
+//  GPU_ACTIVITY_MEMCPY                     = 2
+//} gpu_activity_kind_t;
 
 
 typedef struct opencl_object_channel_t opencl_object_channel_t;
@@ -92,7 +94,7 @@ typedef struct opencl_object_details_t {
 typedef struct opencl_object_t {
   s_element_ptr_t next;
   opencl_object_channel_t *channel;
-  opencl_object_kind_t kind;
+  gpu_activity_kind_t kind;
   bool isInternalClEvent;
   opencl_object_details_t details;
 } opencl_object_t;

From 1b9a39e0e44b7cbbcd7ac7659b67bff9cdb458b7 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 24 Sep 2020 18:55:08 +0000
Subject: [PATCH 041/177] 1. Refactor gpu binary dump and parsing process 2.
 Fix memory leak problems in ElfHelper

---
 src/lib/banal/gpu/ReadIntelCFG.cpp            |  39 +-
 src/lib/binutils/ElfHelper.cpp                |  22 +-
 src/lib/binutils/ElfHelper.hpp                |   2 +-
 src/lib/binutils/InputFile.cpp                |  17 +-
 src/lib/binutils/intel/IntelGPUBinutils.cpp   | 157 ++-----
 src/lib/binutils/intel/IntelGPUBinutils.hpp   |   8 +-
 .../instrumentation/opencl-instrumentation.c  | 214 +++++----
 .../instrumentation/opencl-instrumentation.h  |  26 +-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c | 418 ++++++++++--------
 9 files changed, 472 insertions(+), 431 deletions(-)

diff --git a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
index 59ff9e21dd..d5ea08aa1f 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -81,7 +81,7 @@
 // macros
 //******************************************************************************
 
-#define DEBUG 1
+#define DEBUG 0
 
 #define MAX_STR_SIZE 1024
 #define INTEL_GPU_DEBUG_SECTION_NAME "Intel(R) OpenCL Device Debug"
@@ -242,23 +242,30 @@ readIntelCFG
  Dyninst::ParseAPI::CodeObject **code_obj
 )
 {
-  auto function_name = getFileNameFromAbsolutePath(elfFile->getFileName());
-  addCustomFunctionObject(function_name, the_symtab); //adds a dummy function object
+  if (cfg_wanted) {
+    auto function_name = getFileNameFromAbsolutePath(elfFile->getFileName());
+    addCustomFunctionObject(function_name, the_symtab); //adds a dummy function object
+
+    char *text_section = NULL;
+    auto text_section_size = elfFile->getTextSection(&text_section);
+    if (text_section_size == 0) {
+      return false;
+    }
 
-  char *text_section = NULL;
-  auto text_section_size = elfFile->getTextSection(&text_section);
-  if (text_section_size == 0) {
-    return false;
-  }
+    GPUParse::Function function(0, function_name);
+    parseIntelCFG(text_section, text_section_size, function);
+    std::vector<GPUParse::Function *> functions = {&function};
 
-  GPUParse::Function function(0, function_name);
-  parseIntelCFG(text_section, text_section_size, function);
-  std::vector<GPUParse::Function *> functions = {&function};
+    CFGFactory *cfg_fact = new GPUCFGFactory(functions);
+    *code_src = new GPUCodeSource(functions, the_symtab); 
+    *code_obj = new CodeObject(*code_src, cfg_fact);
+    (*code_obj)->parse();
+
+    return true;
+  }
 
-  CFGFactory *cfg_fact = new GPUCFGFactory(functions);
-  *code_src = new GPUCodeSource(functions, the_symtab); 
-  *code_obj = new CodeObject(*code_src, cfg_fact);
-  (*code_obj)->parse();
+  *code_src = new SymtabCodeSource(the_symtab);
+  *code_obj = new CodeObject(*code_src, NULL, NULL, false, true);
 
-  return true;
+  return false;
 }
diff --git a/src/lib/binutils/ElfHelper.cpp b/src/lib/binutils/ElfHelper.cpp
index b8ca5bfbe1..c3fdc47ac4 100644
--- a/src/lib/binutils/ElfHelper.cpp
+++ b/src/lib/binutils/ElfHelper.cpp
@@ -96,7 +96,7 @@ ElfFile::open
 (
  char *_memPtr,
  size_t _memLen,
- std::string _fileName
+ const std::string &_fileName
 )
 {
   origPtr = _memPtr;
@@ -107,36 +107,44 @@ ElfFile::open
   elf_version(EV_CURRENT);
   elf = elf_memory(memPtr, memLen);
   if (elf == 0 || elf_kind(elf) != ELF_K_ELF) {
+    memPtr = 0;
     return false;
   }
+
   GElf_Ehdr ehdr_v; 
   GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
   if (!ehdr) {
+    memPtr = 0;
     return false;
   }
-#ifdef EM_CUDA
 
+  bool result = true;
+#ifdef EM_CUDA
   if (ehdr->e_machine == EM_CUDA) {
     this->arch = ehdr->e_flags & 0xFF;
 #ifdef DYNINST_USE_CUDA
     origPtr = (char *) malloc(memLen);
     memcpy(origPtr, memPtr, memLen);
     relocateCubin(memPtr, memLen, elf);
+    // Prevent memory leak. NOTE(review): elf was created by elf_memory(memPtr, memLen) above — confirm elf never reads this buffer after the free
+    free(memPtr);
 #else
-    elf_end(elf);
-    return false;
+    result = false;
 #endif
+    // If we cannot open the binary, release memPtr's memory
+    // If we opened the binary, we've copied memPtr to origPtr, also release memory
+    memPtr = 0;
   }
-
 #endif
 
-  return true;
+  return result;
 }
 
 
 ElfFile::~ElfFile() 
 {
-  if (origPtr != memPtr) free(origPtr);
+  // TODO(Keren): prevent memory leak
+  if (origPtr != memPtr && origPtr != 0) free(origPtr);
   elf_end(elf);
 }
 
diff --git a/src/lib/binutils/ElfHelper.hpp b/src/lib/binutils/ElfHelper.hpp
index 337e295d07..69ca556e0c 100644
--- a/src/lib/binutils/ElfHelper.hpp
+++ b/src/lib/binutils/ElfHelper.hpp
@@ -87,7 +87,7 @@
 class ElfFile {
 public:
   ElfFile() { origPtr = 0; memPtr = 0; elf = 0; memLen = 0; }
-  bool open(char *_memPtr, size_t _memLen, std::string _fileName);
+  bool open(char *_memPtr, size_t _memLen, const std::string &_fileName);
   ~ElfFile();
   int getArch() { return arch; }
   Elf *getElf() { return elf; }
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index 81a833e59d..396a49de12 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -200,22 +200,19 @@ InputFile::openFile
 
   close(file_fd);
 
+  filevector = new ElfFileVector;
   ElfFile *elfFile = new ElfFile;
-  bool result = elfFile->open(file_buffer, f_size, filename);
 
+  bool result = elfFile->open(file_buffer, f_size, filename);
   if (result) {
-    filevector = new ElfFileVector;
-    if (isIntelGPUFile(elfFile)) {
-      findIntelGPUBins(elfFile, filevector);
-    } else {
-      filevector->push_back(elfFile);
-    }
+    filevector->push_back(elfFile);
     //findCubins(elfFile, filevector);
-  } else {
+  } else if (!findIntelGPUBins(filename, file_buffer, f_size, filevector)) { // Check if the file is a intel debug binary
+    // Release memory
+    delete(elfFile);
     DIAG_MsgIf_GENERIC(tag, 1, "Not an ELF binary " << filename);
-
     if (errType != InputFileError_WarningNothrow) throw 1;
-
+    // Not a standard elf file
     return false;
   }
 
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index c1ac80c8a5..c09d130060 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -50,6 +50,8 @@
 //******************************************************************************
 
 #include <iostream>
+#include <sstream>
+#include <iomanip>
 #include <string>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -86,43 +88,7 @@
 // private operations
 //******************************************************************************
 
-static size_t
-file_size(int fd)
-{
-  struct stat sb;
-  int retval = fstat(fd, &sb);
-  if (retval == 0 && S_ISREG(sb.st_mode)) {
-    return sb.st_size;
-  }
-  return 0;
-}
-
-
-// Automatically restart short reads.
-// This protects against EINTR.
-//
-static size_t
-read_all(int fd, void *buf, size_t count)
-{
-  ssize_t ret;
-  size_t len;
-
-  len = 0;
-  while (len < count) {
-    ret = read(fd, ((char *) buf) + len, count - len);
-    if (ret == 0 || (ret < 0 && errno != EINTR)) {
-      break;
-    }
-    if (ret > 0) {
-      len += ret;
-    }
-  }
-
-  return len;
-}
-
-
-static const char*
+static __attribute__((unused)) const char *
 opencl_elf_section_type
 (
   Elf64_Word sh_type
@@ -161,15 +127,20 @@ opencl_elf_section_type
 }
 
 
-static bool
-extract_kernelelfs
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+bool
+findIntelGPUBins
 (
- char *section_data,
- size_t section_size,
+ const std::string &file_name,
+ const char *file_buffer,
+ size_t file_size,
  ElfFileVector *filevector
 )
 {
-  const char *ptr = section_data;
+  const char *ptr = file_buffer;
   const SProgramDebugDataHeaderIGC* header =
     reinterpret_cast<const SProgramDebugDataHeaderIGC*>(ptr);
   ptr += sizeof(SProgramDebugDataHeaderIGC);
@@ -177,103 +148,51 @@ extract_kernelelfs
   if (header->NumberOfKernels == 0) {
     return false;
   }
+
+  auto iter = file_name.rfind("/");
+  if (iter == std::string::npos) {
+    return false;
+  }
+  std::string dir_name = file_name.substr(0, iter + 1);
   
   for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
     const SKernelDebugDataHeaderIGC *kernel_header =
       reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
     ptr += sizeof(SKernelDebugDataHeaderIGC);
-
-    const char *kernel_name = reinterpret_cast<const char*>(ptr);
-    std::string file_name = std::string(kernel_name) + ".gpubin";
-    std::cout << "intel " << file_name << std::endl;
+    std::string kernel_name(ptr);
 
     unsigned kernel_name_size_aligned = sizeof(uint32_t) *
       (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
     ptr += kernel_name_size_aligned;
 
     if (kernel_header->SizeVisaDbgInBytes > 0) {
-      ElfFile *elf_file = new ElfFile;
-      int file_fd = open(file_name.c_str(), O_RDONLY);
-      size_t f_size = file_size(file_fd);
-      char *file_buffer = (char *)malloc(f_size);
-      size_t bytes = read_all(file_fd, file_buffer, f_size);
+      std::stringstream ss;
+      ss << dir_name << kernel_name << ".gpubin";
+
+      size_t kernel_size = kernel_header->SizeVisaDbgInBytes;
+      char *kernel_buffer = (char *)malloc(kernel_size);
+      memcpy(kernel_buffer, ptr, kernel_size);
+
+      auto elf_file = new ElfFile;
+      if (elf_file->open(kernel_buffer, kernel_size, ss.str())) {
+        // TODO(Keren): Dump binaries or not?
+        FILE *fptr = fopen(ss.str().c_str(), "wb");
+        fwrite(kernel_buffer, sizeof(char), kernel_size, fptr);
+        fclose(fptr);
 
-      if (elf_file->open(file_buffer, f_size, file_name)) {
         filevector->push_back(elf_file);
       } else {
-        // Cannot handle a kernel
-        return false;
+        // kernel_buffer is released with elf_file
+        delete elf_file;
       }
     } else {
       // Kernel does not have debug info
       return false;
     }
+
+    ptr += kernel_header->SizeVisaDbgInBytes;
+    ptr += kernel_header->SizeGenIsaDbgInBytes;
   }
 
   return true;
 }
-
-
-static bool
-is_custom_opencl_binary
-(
- const std::string &section_name
-)
-{
-  return section_name == ".SHT_OPENCL_DEV_DEBUG";
-}
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-bool
-findIntelGPUBins
-(
- ElfFile *elfFile,
- ElfFileVector *filevector
-)
-{
-  bool has_debug_section = false;
-  bool extract_file = false;
-
-  Elf *elf = elfFile->getElf();
-  char *file_buffer = elfFile->getMemory();
-  ElfSectionVector *sections = elfGetSectionVector(elf);
-  GElf_Ehdr ehdr_v;
-  GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
-
-  if (ehdr) {
-    for (auto si = sections->begin(); si != sections->end(); si++) {
-      Elf_Scn *scn = *si;
-      GElf_Shdr shdr_v;
-      GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
-      if (!shdr) continue;
-      char *section_data = elfSectionGetData(file_buffer, shdr);
-      std::string section_name = std::string(elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name));
-      std::cout << "section name: " << section_name << ". section type: " << opencl_elf_section_type(shdr->sh_type) << std::endl;
-
-      // extract debug section
-      if ((shdr->sh_type == SHT_OPENCL_DEV_DEBUG && section_name == INTEL_GPU_DEBUG_SECTION_NAME)
-        || is_custom_opencl_binary(section_name)) {
-        has_debug_section = true;
-        extract_file = extract_kernelelfs(section_data, shdr->sh_size, filevector);
-        break;
-      }
-    }
-  }
-  //// TODO(Aaron): why put this section here?
-  //FILE *fptr;
-  //if (!fileHasDebugSection && (fptr = fopen("opencl_main.debuginfo", "rb"))) {
-  //  fileHasDebugSection = true;
-  //  fseek(fptr, 0L, SEEK_END);
-  //  size_t debug_info_size = ftell(fptr);
-  //  printf("debug_info_size: %zu\n", debug_info_size);
-  //  rewind(fptr);
-  //  std::vector<uint8_t> debug_info(debug_info_size);
-  //  fread(debug_info.data(), debug_info_size, 1, fptr);
-  //  extractSuccess = extract_kernelelfs(debug_info, filevector);
-  //}
-  //bool success = fileHasDebugSection && extractSuccess;
-  return extract_file && has_debug_section; 
-}
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.hpp b/src/lib/binutils/intel/IntelGPUBinutils.hpp
index 56564f934a..3861ffb7e1 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.hpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.hpp
@@ -66,8 +66,6 @@ enum SHT_OPENCL : uint32_t {
     SHT_OPENCL_SPIRV_SC_VALUES = 0xff00000c          // Specialization Constants values
 };
 
-
-
 //******************************************************************************
 // interface functions
 //******************************************************************************
@@ -75,8 +73,10 @@ enum SHT_OPENCL : uint32_t {
 bool
 findIntelGPUBins
 (
-	ElfFile *elfFile,
-	ElfFileVector *filevector
+ const std::string &file_name, 
+ const char *file_buffer,
+ size_t file_size,
+ ElfFileVector *filevector
 );
 
 #endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index c4c32a309d..1e9a323c42 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -47,6 +47,12 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <errno.h>     // errno
+#include <fcntl.h>     // open
+#include <sys/stat.h>  // mkdir
+#include <dirent.h>
+#include <sys/types.h>
+#include <unistd.h>
 #include <gtpin.h>
 
 
@@ -69,6 +75,7 @@
 #include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 #include <hpcrun/gpu/opencl/opencl-intercept.h>
+#include <hpcrun/files.h>
 #include "opencl-instrumentation.h"
 
 
@@ -79,6 +86,7 @@
 
 #define MAX_STR_SIZE 1024
 
+// TODO(Aaron): Why there are so many correlation ids
 static atomic_long correlation_id;
 
 
@@ -103,6 +111,16 @@ knobAddBool
 }
 
 
+void
+initializeInstrumentation
+(
+ void
+)
+{
+  atomic_store(&correlation_id, 5000);  // to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 5000 (TODO(Aaron):FIX)
+}
+
+
 static uint32_t
 getCorrelationId
 (
@@ -136,85 +154,104 @@ createKernelNode
 }
 
 
-static uint32_t
-findKernelAndInsertToLoadMap
+static int32_t
+findOrAddKernelModule
 (
- uint8_t *debuginfo,
- char *input_kernel_name
+ const char *input_kernel_name
 )
 {
-  const uint8_t* ptr = debuginfo;
-  const SProgramDebugDataHeaderIGC* header = (const SProgramDebugDataHeaderIGC*)(ptr);
-  ptr += sizeof(SProgramDebugDataHeaderIGC);
-
-  ETMSG(OPENCL, "Number of kernels: %d", header->NumberOfKernels);
-  for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
-    const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
-    ptr += sizeof(SKernelDebugDataHeaderIGC);
-
-    const char* kernel_name = (const char*)(ptr);
-    char *file_name = (char*)hpcrun_malloc(sizeof(kernel_name));
-    strcpy(file_name, kernel_name);
-    strcat(file_name, ".gpubin");
-
-    unsigned kernel_name_size_aligned = sizeof(uint32_t) *
-      (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
-    ptr += kernel_name_size_aligned;
-
-    if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
-      FILE *fptr = fopen(file_name, "wb");
-      fwrite(ptr, kernel_header->SizeVisaDbgInBytes, 1, fptr);
-
-      uint32_t hpctoolkit_module_id;
-      load_module_t *module = NULL;
-      char *absoluteKernelName = realpath(file_name, NULL); 
-
-      hpcrun_loadmap_lock();
-      if ((module = hpcrun_loadmap_findByName(absoluteKernelName)) == NULL) {
-        hpctoolkit_module_id = hpcrun_loadModule_add(absoluteKernelName);
-      } else {
-        hpctoolkit_module_id = module->id;
-      }
-      hpcrun_loadmap_unlock();
+  char path_name[PATH_MAX];
+  size_t used = 0;
+  used += sprintf(&path_name[used], "%s", hpcrun_files_output_directory());
+  used += sprintf(&path_name[used], "%s", "/intel/");
+
+  DIR *FD;
+  if (NULL == (FD = opendir(path_name))) {
+    return -1;
+  }
 
-      fclose(fptr);
-      return hpctoolkit_module_id;
+  int module_id = -1;
+  struct dirent *in_file;
+  while ((in_file = readdir(FD))) {
+    if (!strstr(in_file->d_name, ".debuginfo")) {
+      continue;
     }
-    // Should be zero for newest drivers
-    assert(kernel_header->SizeGenIsaDbgInBytes == 0);
 
-    ptr += kernel_header->SizeVisaDbgInBytes;
-    ptr += kernel_header->SizeGenIsaDbgInBytes;
-  }
-  return -1;
-}
+    // Build the path of this debug-info file; snprintf keeps the write
+    // inside buffer even for long directory or file names
+    char buffer[PATH_MAX];
+    snprintf(buffer, sizeof(buffer), "%s%s", path_name, in_file->d_name);
+
+    FILE *fptr = fopen(buffer, "rb");
+    if (fptr == NULL) {
+      // Skip files that cannot be opened instead of passing NULL to fseek
+      continue;
+    }
+    fseek(fptr, 0L, SEEK_END);
+    long file_len = ftell(fptr);
+    rewind(fptr);
+    size_t debug_info_size = (file_len > 0) ? (size_t)file_len : 0;
+    char *debug_info = (char *)malloc(debug_info_size);
+
+    // Only parse the buffer when it was read completely and holds a header
+    const char *ptr = debug_info;
+    uint32_t num_kernels = 0;
+    if (debug_info != NULL && debug_info_size >= sizeof(SProgramDebugDataHeaderIGC) &&
+        fread(debug_info, debug_info_size, 1, fptr) == 1) {
+      const SProgramDebugDataHeaderIGC *header = (const SProgramDebugDataHeaderIGC *)(ptr);
+      ptr += sizeof(SProgramDebugDataHeaderIGC);
+      num_kernels = header->NumberOfKernels;
+    }
+
+    ETMSG(OPENCL, "Number of kernels: %d", num_kernels);
+    for (uint32_t i = 0; i < num_kernels; ++i) {
+      const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
+      ptr += sizeof(SKernelDebugDataHeaderIGC);
+
+      // The kernel name follows the header, rounded up to a multiple of sizeof(uint32_t);
+      // skip it so that ptr reaches this kernel's debug data before the sizes are added
+      const char *kernel_name = (const char *)(ptr);
+      unsigned kernel_name_size_aligned = sizeof(uint32_t) *
+        (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
+      ptr += kernel_name_size_aligned;
+
+      if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
+        // Name of the per-kernel binary: <output dir>/intel/<kernel>.gpubin
+        char file_name[PATH_MAX];
+        snprintf(file_name, sizeof(file_name), "%s/intel/%s.gpubin",
+                 hpcrun_files_output_directory(), kernel_name);
+
+        // Reuse the load module if it is already registered, otherwise add it
+        hpcrun_loadmap_lock();
+        load_module_t *module = hpcrun_loadmap_findByName(file_name);
+        module_id = (module != NULL) ? module->id : hpcrun_loadModule_add(file_name);
+        hpcrun_loadmap_unlock();
+
+        break;
+      }
 
+      // TODO(Aaron): Should be zero for newest drivers (what does it mean?)
+      assert(kernel_header->SizeGenIsaDbgInBytes == 0);
 
-static uint32_t
-add_opencl_binary_to_loadmap 
-(
- char *kernel_name
-)
-{
-  char *debuginfoFileName = getDebugInfoFullFileName();
-  ETMSG(OPENCL, "OpenCL binary name %s", debuginfoFileName);
+      ptr += kernel_header->SizeVisaDbgInBytes;
+      ptr += kernel_header->SizeGenIsaDbgInBytes;
+    }
+
+    free(debug_info);
+    fclose(fptr);
 
-  if (debuginfoFileName == NULL) {
-    ETMSG(OPENCL, "debug file not found");
-    return -1;  
+    if (module_id != -1) {
+      // Find module
+      break;
+    }
   }
-  FILE *fptr = fopen(debuginfoFileName, "rb");
-  fseek(fptr, 0L, SEEK_END);
-  size_t debug_info_size = ftell(fptr);
-  rewind(fptr);
-  uint8_t *debug_info = (uint8_t*)hpcrun_malloc(debug_info_size);
-  fread(debug_info, debug_info_size, 1, fptr);
-  return findKernelAndInsertToLoadMap(debug_info, kernel_name);
+  closedir(FD);  // release the directory stream obtained from opendir
+  return module_id;
 }
 
 
 static void
-opencl_activity_notify
+activityNotify
 (
  void
 )
@@ -224,7 +261,7 @@ opencl_activity_notify
 
 
 static void
-opencl_kernel_block_activity_translate
+kernelBlockActivityTranslate
 (
  gpu_activity_t *ga,
  uint32_t correlation_id,
@@ -246,7 +283,7 @@ opencl_kernel_block_activity_translate
 
 
 static void
-opencl_kernel_block_activity_process
+kernelBlockActivityProcess
 (
  gpu_activity_t *ga,
  uint32_t correlation_id,
@@ -255,7 +292,7 @@ opencl_kernel_block_activity_process
  uint64_t execution_count
 )
 {
-  opencl_kernel_block_activity_translate(ga, correlation_id, loadmap_module_id, offset, execution_count);
+  kernelBlockActivityTranslate(ga, correlation_id, loadmap_module_id, offset, execution_count);
   gpu_activity_process(ga);
 }
 
@@ -295,6 +332,7 @@ onKernelBuild
     status = GTPin_OpcodeprofInstrument(head, mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
+    // TODO(Aaron): when using hpcrun_malloc, find a way to recycle memory
     mem_pair_node *m = hpcrun_malloc(sizeof(mem_pair_node));
     m->offset = offset;
     m->mem = mem;
@@ -310,6 +348,7 @@ onKernelBuild
     }
   }
   if (h != NULL) {
+    // TODO(Aaron): naming insert1/insert2 is confusing
     kernel_memory_map_insert1((uint64_t)kernel, h);
   }
 
@@ -318,13 +357,19 @@ onKernelBuild
   char kernel_name[MAX_STR_SIZE];
   status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
-
   // 
   // m->next = NULL;
   // add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
-  data.name = kernel_name;
+  // XXX(Aaron): what is this for?
+  //data.name = kernel_name;
   data.call_count = 0;
-  data.loadmap_module_id = add_opencl_binary_to_loadmap(kernel_name);
+
+  int32_t module_id = findOrAddKernelModule(kernel_name);
+  if (module_id != -1) {
+    data.loadmap_module_id = module_id;
+  } else {  // lookup failed: data.loadmap_module_id is not assigned on this path
+    ETMSG(OPENCL, "onKernelBuild cannot find kernel %s\n", kernel_name);
+  }
 
   kernel_data_map_insert1((uint64_t)kernel, data);
   ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
@@ -332,11 +377,11 @@ onKernelBuild
 
 
 static void
-  onKernelRun
+onKernelRun
 (
  GTPinKernelExec kernelExec,
  void *v
- )
+)
 {
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPin_KernelProfilingActive(kernelExec, 1);
@@ -345,11 +390,11 @@ static void
 
 
 static void
-  onKernelComplete
+onKernelComplete
 (
  GTPinKernelExec kernelExec,
  void *v
- )
+)
 {
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
@@ -357,6 +402,7 @@ static void
   assert(kernel_data_map_lookup1((uint64_t)kernel) != 0);
   assert(kernel_memory_map_lookup1((uint64_t)kernel) != 0);
 
+  // TODO(Aaron): rename lookup methods, do not use magic numbers
   kernel_data_map_t *kernel_data_list = kernel_data_map_lookup1((uint64_t)kernel);
   KernelData data = kernel_data_list->data;
   kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
@@ -389,9 +435,9 @@ static void
     uint64_t execution_count = total; // + bm->val 
     //block_map_insert1(data.block_map_root, block->offset, execution_count);
 
-    opencl_activity_notify();  
+    activityNotify();  
     gpu_activity_t gpu_activity;
-    opencl_kernel_block_activity_process(&gpu_activity, correlation_id, data.loadmap_module_id, block->offset, execution_count);
+    kernelBlockActivityProcess(&gpu_activity, correlation_id, data.loadmap_module_id, block->offset, execution_count);
     block = block->next;
     //how to make offset the primary key within the cct and += the execution value for existing ccts?
   }
@@ -405,23 +451,15 @@ static void
 // interface operations
 //******************************************************************************
 
-void
-opencl_instrumentation_initialize
-(
- void
-)
-{
-  atomic_store(&correlation_id, 5000);  // to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 5000 (TODO:FIX)
-}
 
-
-void enableProfiling
+void
+opencl_enable_profiling
 (
  void
 )
 {
   ETMSG(OPENCL, "inside enableProfiling");
-  opencl_instrumentation_initialize();
+  initializeInstrumentation();
   knobAddBool("silent_warnings", true);
 
   /*if (utils::GetEnv("PTI_GEN12") != nullptr) {
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
index c15d6fde0f..53dee81047 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
@@ -54,23 +54,24 @@
 // type declarations
 //******************************************************************************
 
+// TODO(Aaron): why does the struct tag start with '_'? Identifiers beginning with '_' plus an uppercase letter are reserved in C.
 typedef struct _SProgramDebugDataHeaderIGC
 {
-    uint32_t         Magic;
-    uint32_t         Version;
-    uint32_t         Size;
-    uint32_t         Device;
-    uint32_t         SteppingId;
-    uint32_t         GPUPointerSizeInBytes;
-    uint32_t         NumberOfKernels;
+ uint32_t Magic;
+ uint32_t Version;
+ uint32_t Size;
+ uint32_t Device;
+ uint32_t SteppingId;
+ uint32_t GPUPointerSizeInBytes;
+ uint32_t NumberOfKernels;
 } SProgramDebugDataHeaderIGC;
 
 
 typedef struct _SKernelDebugDataHeaderIGC
 {
-    uint32_t         KernelNameSize;
-    uint32_t         SizeVisaDbgInBytes;
-    uint32_t         SizeGenIsaDbgInBytes;
+ uint32_t KernelNameSize;
+ uint32_t SizeVisaDbgInBytes;
+ uint32_t SizeGenIsaDbgInBytes;
 } SKernelDebugDataHeaderIGC;
 
 
@@ -79,7 +80,8 @@ typedef struct _SKernelDebugDataHeaderIGC
 // interface operations
 //******************************************************************************
 
-void enableProfiling
+void
+opencl_enable_profiling
 (
-  void
+ void
 );
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 44901c4302..a5685941bb 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -46,7 +46,11 @@
 //******************************************************************************
 
 #include <inttypes.h>
-
+#include <errno.h>     // errno
+#include <fcntl.h>     // open
+#include <sys/stat.h>  // mkdir
+#include <sys/types.h>
+#include <unistd.h>
 
 
 //******************************************************************************
@@ -57,9 +61,12 @@
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/messages/messages.h>
+#include <hpcrun/files.h>
 #include <lib/prof-lean/hpcrun-gotcha.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
 #include <lib/prof-lean/stdatomic.h>
+#include <lib/prof-lean/spinlock.h>
+#include <lib/prof-lean/crypto-hash.h>
 
 #include "opencl-api.h"
 #include "opencl-intercept.h"
@@ -71,7 +78,11 @@
 // local data
 //******************************************************************************
 
+static spinlock_t files_lock = SPINLOCK_UNLOCKED;
+
+// TODO(Aaron): this endif is so far from ifndef
 #ifndef HPCRUN_STATIC_LINK
+
 static gotcha_wrappee_handle_t clBuildProgram_handle;
 static gotcha_wrappee_handle_t clCreateProgramWithSource_handle;
 static gotcha_wrappee_handle_t clCreateCommandQueue_handle;
@@ -79,22 +90,20 @@ static gotcha_wrappee_handle_t clEnqueueNDRangeKernel_handle;
 static gotcha_wrappee_handle_t clEnqueueReadBuffer_handle;
 static gotcha_wrappee_handle_t clEnqueueWriteBuffer_handle;
 static atomic_long correlation_id;
-static char *debugInfoFullFileName;
-
 
 #define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
 #define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
-
+#define LINE_TABLE_FLAG " -gline-tables-only "
 
 //******************************************************************************
 // private operations
 //******************************************************************************
 
 static void
-opencl_intercept_initialize
+initializeIntercept
 (
-  void
+ void
 )
 {
   atomic_store(&correlation_id, 0);
@@ -104,30 +113,18 @@ opencl_intercept_initialize
 static uint64_t
 getCorrelationId
 (
-  void
+ void
 )
 {
   return atomic_fetch_add(&correlation_id, 1);
 }
 
 
-static void
-setDebugInfoFullFileName
-(
-	char *fileName
-)
-{
-	if (debugInfoFullFileName == NULL) {
-		debugInfoFullFileName = fileName;	
-	}
-}
-
-
 static void
 initializeKernelCallBackInfo
 (
-  cl_kernel_callback_t *kernel_cb,
-  uint64_t correlation_id
+ cl_kernel_callback_t *kernel_cb,
+ uint64_t correlation_id
 )
 {
   kernel_cb->correlation_id = correlation_id;
@@ -138,10 +135,10 @@ initializeKernelCallBackInfo
 static void
 initializeMemoryCallBackInfo
 (
-  cl_memory_callback_t *mem_transfer_cb,
-  uint64_t correlation_id,
-  size_t size,
-  bool fromHostToDevice
+ cl_memory_callback_t *mem_transfer_cb,
+ uint64_t correlation_id,
+ size_t size,
+ bool fromHostToDevice
 )
 {
   mem_transfer_cb->correlation_id = correlation_id;
@@ -151,25 +148,96 @@ initializeMemoryCallBackInfo
   mem_transfer_cb->fromDeviceToHost = !fromHostToDevice;
 }
 
+
+// Write `binary` to a newly created file `file_name`.
+// Returns true when the file already exists or all bytes were written, false otherwise.
+static bool
+writeBinary
+(
+ const char *file_name,
+ const void *binary,
+ size_t binary_size
+)
+{
+  int fd = open(file_name, O_WRONLY | O_CREAT | O_EXCL, 0644);
+  if (fd < 0) {
+    // errno is only meaningful after a failed call, and there is no descriptor to close
+    if (errno == EEXIST) {
+      return true;
+    }
+    // Any other failure to open is a fatal error.
+    hpcrun_abort("hpctoolkit: unable to open file: '%s'", file_name);
+    return false;
+  }
+
+  // write() returns ssize_t: rule out -1 before comparing with the unsigned size
+  ssize_t written = write(fd, binary, binary_size);
+  bool complete = (written >= 0 && (size_t)written == binary_size);
+  // a failed close can mean the data never reached the file
+  if (close(fd) != 0) {
+    complete = false;
+  }
+  return complete;
+}
+
+
+void
+writeHashBinary
+(
+ const void *binary,
+ size_t binary_size,
+ bool is_debug_info
+)
+{
+  // Compute hash for the binary; its hex digits become the file name
+  unsigned char hash[HASH_LENGTH];
+  crypto_hash_compute(binary, binary_size, hash, HASH_LENGTH);
+
+  // Build "<output dir>/intel/" first; snprintf keeps every write inside file_name
+  char file_name[PATH_MAX];
+  int n = snprintf(file_name, sizeof(file_name), "%s/intel/", hpcrun_files_output_directory());
+  // XXX(Aaron): we do not use the .gpumain file for now
+  const char *suffix = is_debug_info ? ".debuginfo" : ".gpumain";
+  // the rest needs two hex digits per hash byte, the suffix, and the terminating NUL
+  if (n < 0 || (size_t)n + 2 * HASH_LENGTH + strlen(suffix) + 1 > sizeof(file_name)) {
+    ETMSG(OPENCL, "output path too long; GPU binary not written");
+    return;
+  }
+  mkdir(file_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);  // may already exist
+  size_t used = (size_t)n;
+  for (size_t i = 0; i < HASH_LENGTH; ++i) {
+    used += snprintf(&file_name[used], sizeof(file_name) - used, "%02x", hash[i]);
+  }
+  snprintf(&file_name[used], sizeof(file_name) - used, "%s", suffix);
+
+  // Write the file if it does not exist yet
+  spinlock_lock(&files_lock);
+  writeBinary(file_name, binary, binary_size);
+  spinlock_unlock(&files_lock);
+}
+
+
+#if 0
 static char*
 getKernelNameFromSourceCode
 (
-	const char *kernelSourceCode
+  const char *kernelSourceCode
 )
 {
-	char *kernelCode_copy = (char*)hpcrun_malloc(sizeof(kernelSourceCode));
-	strcpy(kernelCode_copy, kernelSourceCode);
-	char *token = strtok(kernelCode_copy, " ");
-	while (token != NULL) {
-		if (strcmp(token, "void") == 0) { // not searching for kernel because "supported\n#endif\nkernel"
-			token = strtok(NULL, " ");
-			printf("kernel name: %s", token);
-			return token;
-		}
-		token = strtok(NULL, " ");
-	}
-	return NULL;
+  char *kernelCode_copy = (char*)hpcrun_malloc(sizeof(kernelSourceCode));
+  strcpy(kernelCode_copy, kernelSourceCode);
+  char *token = strtok(kernelCode_copy, " ");
+  while (token != NULL) {
+    if (strcmp(token, "void") == 0) { // not searching for kernel because "supported\n#endif\nkernel"
+      token = strtok(NULL, " ");
+      printf("kernel name: %s", token);
+      return token;
+    }
+    token = strtok(NULL, " ");
+  }
+  return NULL;
 }
+#endif
 
 
 static cl_program
@@ -182,80 +250,84 @@ clCreateProgramWithSource_wrapper
  cl_int* errcode_ret
 )
 {
-	ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
-
-	FILE *f_ptr;
-	for (int i = 0; i < (int)count; i++) {
-		// what if a single file has multiple kernels?
-		// we need to add logic to get filenames by reading the strings contents
-		char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
-		// using malloc instead of hpcrun_malloc gives extra garbage characters in file name
-		char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
-		*filename = fileno + '\0';
-		f_ptr = fopen(filename, "w");
-		fwrite(strings[i], lengths[i], 1, f_ptr);
-	}
-	fclose(f_ptr);
-	
-	clcreateprogramwithsource_t clCreateProgramWithSource_wrappee =
-		GOTCHA_GET_TYPED_WRAPPEE(clCreateProgramWithSource_handle, clcreateprogramwithsource_t);
-	return clCreateProgramWithSource_wrappee(context, count, strings, lengths, errcode_ret);
+  ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
+
+#if 0
+  FILE *f_ptr;
+  for (int i = 0; i < (int)count; i++) {
+    // what if a single file has multiple kernels?
+    // we need to add logic to get filenames by reading the strings contents
+    char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
+    // using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+    char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
+    *filename = fileno + '\0';
+    f_ptr = fopen(filename, "w");
+    fwrite(strings[i], lengths[i], 1, f_ptr);
+  }
+  fclose(f_ptr);
+#endif
+  
+  clcreateprogramwithsource_t clCreateProgramWithSource_wrappee =
+    GOTCHA_GET_TYPED_WRAPPEE(clCreateProgramWithSource_handle, clcreateprogramwithsource_t);
+  return clCreateProgramWithSource_wrappee(context, count, strings, lengths, errcode_ret);
 }
 
 
 // we are dumping the debuginfo temporarily since the binary does not have debugsection
-// poorly written code: FIXME
-static char*
-dumpIntelGPUBinary(cl_program program) {
-	int device_count = 1;
-	cl_int status = CL_SUCCESS;
-	size_t *binary_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
-
-	status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t), binary_size, NULL);
-	assert(status == CL_SUCCESS);
-	uint8_t **binary = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
-	for (size_t i = 0; i < device_count; ++i) {
-		binary[i] = (uint8_t*)hpcrun_malloc(binary_size[i] * sizeof(uint8_t));
-	}
-
-	status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(uint8_t*), binary, NULL);
-	assert(status == CL_SUCCESS);
-
-	FILE *bin_ptr;
-	bin_ptr = fopen("opencl_main.gpubin", "wb");
-	fwrite(binary[0], binary_size[0], 1, bin_ptr);
-
-  // SECOND
-	size_t *debug_info_size = (size_t*)hpcrun_malloc(sizeof(size_t) * device_count);
-
-	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL,	sizeof(size_t), debug_info_size, NULL);
-	assert(status == CL_SUCCESS);
-	uint8_t **debug_info = (uint8_t**)hpcrun_malloc(device_count * sizeof(uint8_t*));
-	for (size_t i = 0; i < device_count; ++i) {
-		debug_info[i] = (uint8_t*)hpcrun_malloc(debug_info_size[i] * sizeof(uint8_t));
-	}
-
-	status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(uint8_t*), debug_info, NULL);
-	assert(status == CL_SUCCESS);
-
-	char *debuginfoFileName = "opencl_main.debuginfo";
-	bin_ptr = fopen(debuginfoFileName, "wb");
-	fwrite(debug_info[0], debug_info_size[0], 1, bin_ptr);
-	fclose(bin_ptr);
-  ETMSG(OPENCL, "Intel GPU files dumped successfully");
-	return realpath(debuginfoFileName, NULL);
-}
-
-
+// poorly written code: FIXME(Aaron)
 static void
 clBuildProgramCallback
 (
-	cl_program program,
-	void* user_data
+ cl_program program,
+ void* user_data
 )
 {
-	char* debugInfoFullFileName = dumpIntelGPUBinary(program);
-	setDebugInfoFullFileName(debugInfoFullFileName);
+  // TODO(Aaron): where do you get device_count?
+  int device_count = 1;
+  cl_int status = CL_SUCCESS;
+
+  // binary
+  size_t *binary_size = (size_t *)malloc(device_count * sizeof(size_t));
+  status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, device_count * sizeof(size_t), binary_size, NULL);
+  assert(status == CL_SUCCESS);
+
+  char **binary = (char **)malloc(device_count * sizeof(char *));
+  for (size_t i = 0; i < device_count; ++i) {
+    binary[i] = (char *)malloc(binary_size[i] * sizeof(char));
+  }
+
+  status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(char *), binary, NULL);
+  assert(status == CL_SUCCESS);
+
+  // debug info
+  size_t *debug_info_size = (size_t *)malloc(device_count * sizeof(size_t));
+  status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL, device_count * sizeof(size_t), debug_info_size, NULL);
+  assert(status == CL_SUCCESS);
+
+  char **debug_info = (char **)malloc(device_count * sizeof(char *));
+  for (size_t i = 0; i < device_count; ++i) {
+    debug_info[i] = (char *)malloc(debug_info_size[i] * sizeof(char));
+  }
+
+  status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(char *), debug_info, NULL);
+  assert(status == CL_SUCCESS);
+
+  // TODO(Aaron): Is it ok to only write binary 0?
+  // write each device's binary and debug info, releasing each buffer once it is written
+  for (size_t i = 0; i < device_count; ++i) {
+    writeHashBinary(binary[i], binary_size[i], false);
+    writeHashBinary(debug_info[i], debug_info_size[i], true);
+    free(binary[i]);
+    free(debug_info[i]);
+  }
+
+  // free the arrays themselves; the per-device buffers were released in the loop
+  free(binary);
+  free(binary_size);
+  free(debug_info);
+  free(debug_info_size);
+
+  ETMSG(OPENCL, "Intel GPU files dumped successfully");
 }
 
 
@@ -265,31 +337,38 @@ clBuildProgram_wrapper
 (
  cl_program program,
  cl_uint num_devices,
- const cl_device_id* device_list,
- const char* options,
+ const cl_device_id *device_list,
+ const char *options,
  void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
- void* user_data
+ void *user_data
 )
 {
   ETMSG(OPENCL, "inside clBuildProgram_wrapper");
   clbuildprogram_t clBuildProgram_wrappee = 
     GOTCHA_GET_TYPED_WRAPPEE(clBuildProgram_handle, clbuildprogram_t);
 
-	char optionsWithDebugFlag[] = " -gline-tables-only ";
-	if (options != NULL) {
-		strcat(optionsWithDebugFlag, options);
-	}
-  return clBuildProgram_wrappee(program, num_devices, device_list, (const char*)optionsWithDebugFlag, clBuildProgramCallback, user_data);
+  // Append the line-table flag to the caller's options (NULL means none were given).
+  // The buffer is sized from the actual lengths, so there is no fixed maximum.
+  size_t len_options = options == NULL ? 0 : strlen(options);
+  size_t len_total = len_options + strlen(LINE_TABLE_FLAG) + 1;  // +1 for the terminating NUL
+  char *options_with_debug_flags = (char *)malloc(len_total);
+  if (options_with_debug_flags == NULL) {
+    return CL_OUT_OF_HOST_MEMORY;  // documented clBuildProgram error for host allocation failure
+  }
+  snprintf(options_with_debug_flags, len_total, "%s%s", options == NULL ? "" : options, LINE_TABLE_FLAG);
+  cl_int ret = clBuildProgram_wrappee(program, num_devices, device_list, options_with_debug_flags, clBuildProgramCallback, user_data);
+  free(options_with_debug_flags);
+  return ret;
 }
 
 
 static cl_command_queue
 clCreateCommandQueue_wrapper
 (
-  cl_context context,
-  cl_device_id device,
-  cl_command_queue_properties properties,
-  cl_int *errcode_ret
+ cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret
 )
 {
   // enabling profiling
@@ -305,15 +384,15 @@ clCreateCommandQueue_wrapper
 static cl_int
 clEnqueueNDRangeKernel_wrapper
 (
-  cl_command_queue command_queue,
-  cl_kernel ocl_kernel,
-  cl_uint work_dim,
-  const size_t *global_work_offset, 
-  const size_t *global_work_size,
-  const size_t *local_work_size,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
+ cl_command_queue command_queue,
+ cl_kernel ocl_kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset, 
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
 )
 {
   uint64_t correlation_id = getCorrelationId();
@@ -334,16 +413,16 @@ clEnqueueNDRangeKernel_wrapper
     GOTCHA_GET_TYPED_WRAPPEE(clEnqueueNDRangeKernel_handle, clkernel_t);
   cl_int return_status = 
     clEnqueueNDRangeKernel_wrappee(command_queue, ocl_kernel, work_dim, 
-				   global_work_offset, global_work_size, 
-				   local_work_size, num_events_in_wait_list, 
-				   event_wait_list, eventp);
+           global_work_offset, global_work_size, 
+           local_work_size, num_events_in_wait_list, 
+           event_wait_list, eventp);
 
   ETMSG(OPENCL, "registering callback for type: kernel. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+  "Correlation id: %"PRIu64 "", correlation_id);
 
   opencl_subscriber_callback(kernel_cb->type, kernel_cb->correlation_id);
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, kernel_info);
+           &opencl_activity_completion_callback, kernel_info);
   return return_status;
 }
 
@@ -351,15 +430,15 @@ clEnqueueNDRangeKernel_wrapper
 static cl_int
 clEnqueueReadBuffer_wrapper
 (
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_read,
-  size_t offset,
-  size_t cb,
-  void *ptr,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
 )
 {
   uint64_t correlation_id = getCorrelationId();
@@ -380,19 +459,19 @@ clEnqueueReadBuffer_wrapper
     GOTCHA_GET_TYPED_WRAPPEE(clEnqueueReadBuffer_handle, clreadbuffer_t);
   cl_int return_status = 
     clEnqueueReadBuffer_wrappee(command_queue, buffer, blocking_read, offset, 
-				cb, ptr, num_events_in_wait_list, 
-				event_wait_list, eventp);
+        cb, ptr, num_events_in_wait_list, 
+        event_wait_list, eventp);
 
   ETMSG(OPENCL, "registering callback for type: D2H. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+  "Correlation id: %"PRIu64 "", correlation_id);
   ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", 
-	(long)cb);
+  (long)cb);
 
   opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
+           mem_transfer_cb->correlation_id);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, mem_info);
+           &opencl_activity_completion_callback, mem_info);
 
   return return_status;
 }
@@ -401,15 +480,15 @@ clEnqueueReadBuffer_wrapper
 static cl_int
 clEnqueueWriteBuffer_wrapper
 (
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_write,
-  size_t offset,
-  size_t cb,
-  const void *ptr,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t cb,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
 )
 {
   uint64_t correlation_id = getCorrelationId();
@@ -430,21 +509,21 @@ clEnqueueWriteBuffer_wrapper
     GOTCHA_GET_TYPED_WRAPPEE(clEnqueueWriteBuffer_handle, clwritebuffer_t);
   cl_int return_status = 
     clEnqueueWriteBuffer_wrappee(command_queue, buffer, blocking_write, offset,
-				 cb, ptr, num_events_in_wait_list, 
-				 event_wait_list, eventp);
+         cb, ptr, num_events_in_wait_list, 
+         event_wait_list, eventp);
 
   ETMSG(OPENCL, "registering callback for type: H2D. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+  "Correlation id: %"PRIu64 "", correlation_id);
 
   ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", 
-	(long)cb);
+  (long)cb);
 
   opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
+           mem_transfer_cb->correlation_id);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, 
-			     (void*) mem_info);
+           &opencl_activity_completion_callback, 
+           (void*) mem_info);
 
   return return_status;
 }
@@ -498,28 +577,19 @@ static gotcha_binding_t opencl_bindings[] = {
 // interface operations
 //******************************************************************************
 
-char*
-getDebugInfoFullFileName
-(
-	void
-)
-{
-	return debugInfoFullFileName;
-}
-
 
 void
 opencl_intercept_setup
 (
-  void
+ void
 )
 {
 #ifndef HPCRUN_STATIC_LINK
   ETMSG(OPENCL, "setting up opencl intercepts");
-	gpu_metrics_KER_BLKINFO_enable();
-  enableProfiling();
+  gpu_metrics_KER_BLKINFO_enable();
+  opencl_enable_profiling();
   gotcha_wrap(opencl_bindings, 4, "opencl_bindings");
-  opencl_intercept_initialize();
+  initializeIntercept();
 #endif
 }
 
@@ -527,7 +597,7 @@ opencl_intercept_setup
 void
 opencl_intercept_teardown
 (
-  void
+ void
 )
 {
 #ifndef HPCRUN_STATIC_LINK

From 34a18b956036899b100ede077aaad8bf81dad66b Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris09.ftm.alcf.anl.gov>
Date: Fri, 25 Sep 2020 01:18:00 +0000
Subject: [PATCH 042/177] adding hpcrun metrics for changing scope of metrics

---
 lib/dtd/hpc-structure.dtd         | 10 +++++++++-
 src/lib/prof-lean/hpcrun-fmt.h    |  8 ++++++++
 src/lib/xml/hpc-structure.dtd.h   |  2 +-
 src/tool/hpcrun/gpu/gpu-metrics.c |  9 ++++++++-
 src/tool/hpcrun/metrics.c         |  7 +++++++
 src/tool/hpcrun/metrics.h         |  2 ++
 6 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/lib/dtd/hpc-structure.dtd b/lib/dtd/hpc-structure.dtd
index c69edce928..a6263bc564 100644
--- a/lib/dtd/hpc-structure.dtd
+++ b/lib/dtd/hpc-structure.dtd
@@ -15,7 +15,7 @@
   <!--   (v)ma-range-set: "{[beg-end), [beg-end)...}" -->
   <!--   (t)arget: target function address -->
   <!--   (d)evice: device name -->
-  <!ELEMENT LM (F|P)*>
+  <!ELEMENT LM (F|P|B)*>
   <!ATTLIST LM
 	i CDATA #REQUIRED
 	n CDATA #REQUIRED
@@ -34,6 +34,14 @@
 	l  CDATA #IMPLIED
 	s  CDATA #IMPLIED
 	v  CDATA #IMPLIED>
+  <!-- Basic Block: -->
+  <!ELEMENT B (I)*>
+  <!ATTLIST B
+	o CDATA #REQUIRED>
+  <!-- Instruction: -->
+  <!ELEMENT I EMPTY>
+  <!ATTLIST I
+	o CDATA #REQUIRED>
   <!-- Alien: (f)ilename -->
   <!ELEMENT A (A|L|S|C)*>
   <!ATTLIST A
diff --git a/src/lib/prof-lean/hpcrun-fmt.h b/src/lib/prof-lean/hpcrun-fmt.h
index 85d724d65b..6034fa497d 100644
--- a/src/lib/prof-lean/hpcrun-fmt.h
+++ b/src/lib/prof-lean/hpcrun-fmt.h
@@ -168,6 +168,13 @@ hpcrun_fmt_hdr_free(hpcrun_fmt_hdr_t* hdr, hpcfmt_free_fn dealloc);
 #define HPCRUN_FMT_METRIC_SHOW_EXCLUSIVE  3
 #define HPCRUN_FMT_METRIC_INVISIBLE       4
 
+// changing the scope of metrics
+#define HPCRUN_FMT_METRIC_MOVE_TO_ENCLOSING_PROCEDURE       5
+#define HPCRUN_FMT_METRIC_KEEP_HERE													6
+#define HPCRUN_FMT_METRIC_PROPOGATE_TO_BASIC_BLOCKS					7
+
+
+
 //***************************************************************************
 // epoch-hdr
 //***************************************************************************
@@ -270,6 +277,7 @@ typedef struct hpcrun_metricFlags_fields {
   uint16_t             partner;
   uint8_t /*bool*/     show;
   uint8_t /*bool*/     showPercent;
+  uint8_t 						 scope;
 
   uint64_t unused1;
 } hpcrun_metricFlags_fields;
diff --git a/src/lib/xml/hpc-structure.dtd.h b/src/lib/xml/hpc-structure.dtd.h
index 83c46584a6..5b0583f06f 100644
--- a/src/lib/xml/hpc-structure.dtd.h
+++ b/src/lib/xml/hpc-structure.dtd.h
@@ -1 +1 @@
-"<!-- ******************************************************************** -->\n<!-- HPCToolkit Structure DTD                                             -->\n<!-- Version 4.7                                                          -->\n<!-- ******************************************************************** -->\n\n<!ELEMENT HPCToolkitStructure (LM)*>\n<!ATTLIST HPCToolkitStructure\n	version CDATA #REQUIRED\n	i       CDATA #REQUIRED\n	n       CDATA #IMPLIED>\n  <!-- Load module: -->\n  <!--   (i)d: unique identifier for cross referencing -->\n  <!--   (n)ame -->\n  <!--   (l)ine range: \"beg-end\" (inclusive range) -->\n  <!--   (v)ma-range-set: \"{[beg-end), [beg-end)...}\" -->\n  <!--   (t)arget: target function address -->\n  <!--   (d)evice: device name -->\n  <!ELEMENT LM (F|P)*>\n  <!ATTLIST LM\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- File: -->\n  <!ELEMENT F (P|L|S)*>\n  <!ATTLIST F\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED>\n  <!-- Procedure: ln=link name (if different than name) -->\n  <!ELEMENT P (P|A|L|S|C)*>\n  <!ATTLIST P\n	i  CDATA #REQUIRED\n	n  CDATA #REQUIRED\n	ln CDATA #IMPLIED\n	l  CDATA #IMPLIED\n	s  CDATA #IMPLIED\n	v  CDATA #IMPLIED>\n  <!-- Alien: (f)ilename -->\n  <!ELEMENT A (A|L|S|C)*>\n  <!ATTLIST A\n	i CDATA #REQUIRED\n	f CDATA #IMPLIED\n	n CDATA #IMPLIED\n	ln CDATA #IMPLIED\n	l CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Loop -->\n  <!ELEMENT L (A|L|S|C)*>\n  <!ATTLIST L\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	f CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Statement -->\n  <!ELEMENT S EMPTY>\n  <!ATTLIST S\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- Callsite (a special Statement) -->\n  <!ELEMENT C (C)*>\n  <!ATTLIST C\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED\n	t CDATA #IMPLIED\n	d CDATA #IMPLIED>\n";
+"<!-- ******************************************************************** -->\n<!-- HPCToolkit Structure DTD                                             -->\n<!-- Version 4.7                                                          -->\n<!-- ******************************************************************** -->\n\n<!ELEMENT HPCToolkitStructure (LM)*>\n<!ATTLIST HPCToolkitStructure\n	version CDATA #REQUIRED\n	i       CDATA #REQUIRED\n	n       CDATA #IMPLIED>\n  <!-- Load module: -->\n  <!--   (i)d: unique identifier for cross referencing -->\n  <!--   (n)ame -->\n  <!--   (l)ine range: \"beg-end\" (inclusive range) -->\n  <!--   (v)ma-range-set: \"{[beg-end), [beg-end)...}\" -->\n  <!--   (t)arget: target function address -->\n  <!--   (d)evice: device name -->\n  <!ELEMENT LM (F|P|B)*>\n  <!ATTLIST LM\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- File: -->\n  <!ELEMENT F (P|L|S)*>\n  <!ATTLIST F\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED>\n  <!-- Procedure: ln=link name (if different than name) -->\n  <!ELEMENT P (P|A|L|S|C)*>\n  <!ATTLIST P\n	i  CDATA #REQUIRED\n	n  CDATA #REQUIRED\n	ln CDATA #IMPLIED\n	l  CDATA #IMPLIED\n	s  CDATA #IMPLIED\n	v  CDATA #IMPLIED>\n  <!-- Basic Block: -->\n  <!ELEMENT B (I)*>\n  <!ATTLIST B\n	o CDATA #REQUIRED>\n  <!-- Instruction: -->\n  <!ELEMENT I EMPTY>\n  <!ATTLIST I\n	o CDATA #REQUIRED>\n  <!-- Alien: (f)ilename -->\n  <!ELEMENT A (A|L|S|C)*>\n  <!ATTLIST A\n	i CDATA #REQUIRED\n	f CDATA #IMPLIED\n	n CDATA #IMPLIED\n	ln CDATA #IMPLIED\n	l CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Loop -->\n  <!ELEMENT L (A|L|S|C)*>\n  <!ATTLIST L\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	f CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Statement -->\n  <!ELEMENT S EMPTY>\n  <!ATTLIST S\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- Callsite (a special Statement) -->\n  <!ELEMENT C (C)*>\n  <!ATTLIST C\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED\n	t CDATA #IMPLIED\n	d CDATA #IMPLIED>\n";
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index b77cb3cd78..604d6abed8 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -199,6 +199,11 @@ name ## _metric_kind
   reg_metric->format  = FORMAT_DISPLAY_PERCENTAGE
 
 
+// Tag metric `name` with the MOVE_TO_ENCLOSING_PROCEDURE scope; no trailing ';' here, the call site supplies it.
+#define SET_SCOPE_FOR_BASIC_BLOCKS(name) \
+  hpcrun_set_scope(METRIC_ID(name), HPCRUN_FMT_METRIC_MOVE_TO_ENCLOSING_PROCEDURE)
+
+
 //*****************************************************************************
 // local variables 
 //*****************************************************************************
@@ -466,7 +471,7 @@ gpu_metrics_attribute_kernel_block
   cct_node_t *cct_node = activity->cct_node;
 
 	metric_data_list_t *metrics = 
-		hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_KINFO_STMEM_ACUMU));	//where will we get metrics from?
+		hpcrun_reify_metric_set(cct_node, METRIC_ID(KER_BLK_OFFSET));
 
 	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(KER_BLK_OFFSET), 
 					 b->offset);
@@ -753,6 +758,8 @@ gpu_metrics_KER_BLKINFO_enable
 
   INITIALIZE_METRIC_KIND();
 
+	SET_SCOPE_FOR_BASIC_BLOCKS(KER_BLK_EXECUTION_COUNT);
+
   FORALL_KER_BLKINFO(INITIALIZE_SCALAR_METRIC_INT)
 
   FINALIZE_METRIC_KIND();
diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c
index 7e903aec78..e474ae9b66 100644
--- a/src/tool/hpcrun/metrics.c
+++ b/src/tool/hpcrun/metrics.c
@@ -303,6 +303,7 @@ void hpcrun_set_display(int metric_id, uint8_t show) {
   mdesc->flags.fields.show = show;
 }
 
+
 // non finalizing
 void hpcrun_set_percent(int metric_id, uint8_t show_percent) {
   metric_desc_t* mdesc = hpcrun_id2metric_linked(metric_id);
@@ -310,6 +311,12 @@ void hpcrun_set_percent(int metric_id, uint8_t show_percent) {
 }
 
 
+void hpcrun_set_scope(int metric_id, uint8_t scope) {  // store `scope` in the flags of the metric with id `metric_id`
+  metric_desc_t* mdesc = hpcrun_id2metric_linked(metric_id);
+  mdesc->flags.fields.scope = scope;  // NOTE(review): mdesc is dereferenced without a NULL check - confirm the lookup cannot fail
+}
+
+
 metric_desc_p_tbl_t*
 hpcrun_get_metric_tbl(kind_info_t **curr)
 {
diff --git a/src/tool/hpcrun/metrics.h b/src/tool/hpcrun/metrics.h
index be9a89f361..3a45db3479 100644
--- a/src/tool/hpcrun/metrics.h
+++ b/src/tool/hpcrun/metrics.h
@@ -128,6 +128,8 @@ void hpcrun_set_display(int metric_id, uint8_t show);
 
 void hpcrun_set_percent(int metric_id, uint8_t show_percent);
 
+void hpcrun_set_scope(int metric_id, uint8_t scope);
+
 metric_desc_p_tbl_t* hpcrun_get_metric_tbl(kind_info_t**);
 
 metric_upd_proc_t* hpcrun_get_metric_proc(int metric_id);

From f03228650f2311fda85afe09e917e5a8e8adc541 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Fri, 25 Sep 2020 12:05:41 -0500
Subject: [PATCH 043/177] operation_channel_signal_consumer_when_full

---
 .../hpcrun/gpu/gpu-activity-multiplexer.c     |  46 +++++---
 .../hpcrun/gpu/gpu-operation-channel-set.c    |   2 +-
 src/tool/hpcrun/gpu/gpu-operation-channel.c   |  58 ++++++++--
 src/tool/hpcrun/gpu/gpu-operation-channel.h   |   9 +-
 src/tool/hpcrun/gpu/gpu-operation-item.h      |   2 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 108 +++++++++++++-----
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  16 +++
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c |   6 +-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |   6 +-
 9 files changed, 189 insertions(+), 64 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 3aaaf36e82..e7353974b3 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -79,6 +79,14 @@ static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
 // private operations
 //******************************************************************************
 
+static void
+gpu_init_operation_channel(){
+  // Create operation channel
+  my_operation_set_id = atomic_fetch_add(&operation_set_id, 1);
+  gpu_operation_channel = gpu_operation_channel_get();
+  gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_set_id);
+}
+
 
 static void *
 gpu_activity_record
@@ -103,19 +111,6 @@ void
 }
 
 
-void
-gpu_operation_release
-(
-gpu_operation_channel_t *channel
-)
-{
-  atomic_fetch_add(&operation_stream_counter, -1);
-}
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
 static void
 gpu_activity_multiplexer_init
 (
@@ -133,6 +128,21 @@ void
 }
 
 
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+gpu_operation_release
+(
+gpu_operation_channel_t *channel
+)
+{
+  atomic_fetch_add(&operation_stream_counter, -1);
+}
+
+
 void
 gpu_activity_multiplexer_fini
 (
@@ -148,6 +158,8 @@ void
 //  while (atomic_load(&operation_stream_counter));
 }
 
+
+
 void
 gpu_activity_multiplexer_push
 (
@@ -155,17 +167,15 @@ gpu_activity_channel_t *initiator_channel,
 gpu_activity_t *gpu_activity
 )
 {
+
   pthread_once(&is_initialized, gpu_activity_multiplexer_init);
 
   if (my_operation_set_id == -1){
-    // Create operation channel
-    my_operation_set_id = atomic_fetch_add(&operation_set_id, 1);
-    gpu_operation_channel = gpu_operation_channel_get();
-    gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_set_id);
+    gpu_init_operation_channel();
   }
 
   gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=gpu_activity};
-  gpu_operation_channel_produce(gpu_operation_channel, &item);
+  gpu_operation_channel_produce(gpu_operation_channel_get(), &item);
 
 //  atomic_fetch_add(&operation_stream_counter, +1);
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index aab832ca46..c020de1b26 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -183,7 +183,7 @@ gpu_operation_channel_set_consume
 )
 {
   gpu_operation_channel_set_forall(gpu_operation_channel_consume, set_index);
-  gpu_operation_channel_set_forall(gpu_operation_channel_wait, set_index);
+  gpu_operation_channel_set_forall(gpu_operation_channel_await, set_index);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index b65d580ca2..2f6067faaf 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -63,6 +63,9 @@
 // macros
 //******************************************************************************
 
+#define CHANNEL_FILL_COUNT 100
+
+
 #undef typed_bichannel
 #undef typed_stack_elem
 
@@ -97,6 +100,7 @@ typedef struct gpu_operation_channel_t {
   bistack_t bistacks[2];
   pthread_mutex_t mutex;
   pthread_cond_t cond;
+  uint64_t count;
 } gpu_operation_channel_t;
 
 
@@ -116,17 +120,36 @@ static __thread gpu_operation_channel_t *gpu_operation_channel = NULL;
 typed_bichannel_impl(gpu_operation_item_t)
 
 
+static void
+gpu_operation_channel_signal_consumer_when_full
+(
+ gpu_operation_channel_t *channel
+)
+{
+  if (channel->count++ > CHANNEL_FILL_COUNT) {
+    channel->count = 0;
+    gpu_operation_channel_signal_consumer(channel);
+  }
+}
+
+
 static gpu_operation_channel_t *
 gpu_operation_channel_alloc
 (
 void
 )
 {
-  gpu_operation_channel_t *c = hpcrun_malloc_safe(sizeof(gpu_operation_channel_t));
+  gpu_operation_channel_t *channel = hpcrun_malloc_safe(sizeof(gpu_operation_channel_t));
+
+  memset(channel, 0, sizeof(gpu_operation_channel_t));
+
+  channel_init(channel);
+
 
-  channel_init(c);
+  pthread_mutex_init(&channel->mutex, NULL);
+  pthread_cond_init(&channel->cond, NULL);
 
-  return c;
+  return channel;
 }
 
 
@@ -157,12 +180,17 @@ gpu_operation_channel_produce
  gpu_operation_item_t *it
 )
 {
-  gpu_operation_item_t *channel_op = gpu_operation_item_alloc(channel);
-  *channel_op = *it;
+  gpu_operation_item_t *channel_it = gpu_operation_item_alloc(channel);
+//  channel_it->channel = it->channel;
+//  channel_it->activity = it->activity;
+  *channel_it = *it;
 
-  printf("\nPRODUCE: channel = %p || return_channel = %p -> activity = %p\n\n", channel, it->channel, it->activity);
+  printf("\nPRODUCE: channel = %p || return_channel = %p -> activity = %p\n\n", channel, channel_it->channel, channel_it->activity);
+
+  channel_push(channel, bichannel_direction_forward, channel_it);
+
+  gpu_operation_channel_signal_consumer_when_full(channel);
 
-  channel_push(channel, bichannel_direction_forward, channel_op);
 }
 
 
@@ -182,11 +210,11 @@ gpu_operation_channel_consume
   // consume all elements enqueued before this function was called
   for (;;) {
     gpu_operation_item_t *it = channel_pop(channel, bichannel_direction_forward);
-    printf("\n---------CONSUME: op_channel = %p || channel = %p , activity = %p\n", channel, it->channel, it->activity);
 
-    if (!it || !it->activity || !it->channel) {
+    if (!it) {
       break;
     }
+    printf("\n---------CONSUME: op_channel = %p || channel = %p , activity = %p\n", channel, it->channel, it->activity);
     gpu_operation_item_consume(gpu_operation_item_process, it);
     gpu_operation_item_free(channel, it);
   }
@@ -194,7 +222,7 @@ gpu_operation_channel_consume
 
 
 void
-gpu_operation_channel_wait
+gpu_operation_channel_await
 (
 gpu_operation_channel_t *channel
 )
@@ -206,4 +234,14 @@ gpu_operation_channel_t *channel
   // wait for a signal or for a few seconds. periodically waking
   // up avoids missing a signal.
   pthread_cond_timedwait(&channel->cond, &channel->mutex, &time);
+}
+
+
+void
+gpu_operation_channel_signal_consumer
+(
+gpu_operation_channel_t *channel
+)
+{
+  pthread_cond_signal(&channel->cond);
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.h b/src/tool/hpcrun/gpu/gpu-operation-channel.h
index aade93fa20..c8da2f5f3d 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.h
@@ -87,11 +87,18 @@ gpu_operation_channel_consume
  gpu_operation_channel_t *channel
 );
 
+
 void
-gpu_operation_channel_wait
+gpu_operation_channel_await
 (
  gpu_operation_channel_t *channel
 );
 
 
+void
+gpu_operation_channel_signal_consumer
+(
+ gpu_operation_channel_t *channel
+);
+
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.h b/src/tool/hpcrun/gpu/gpu-operation-item.h
index 291688dd79..d994be4582 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.h
@@ -80,7 +80,7 @@ typedef struct gpu_operation_channel_t gpu_operation_channel_t;
 typedef struct gpu_operation_item_t{
   gpu_activity_channel_t *channel;
   gpu_activity_t *activity;
-}gpu_operation_item_t;
+} gpu_operation_item_t;
 
 
 typedef void (*gpu_operation_channel_fn_t)
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 066c57e571..a206d76966 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -145,10 +145,17 @@
   macro(CL_INVALID_LINKER_OPTIONS)					\
   macro(CL_INVALID_DEVICE_PARTITION_COUNT)
 
-#define FORALL_OPENCL_CALLS(macro)					\
-  macro(GPU_MEMCPY_H2D)							\
+#define FORALL_OPENCL_KINDS(macro)					\
+  macro(GPU_ACTIVITY_UNKNOWN)							\
+  macro(GPU_ACTIVITY_KERNEL)           \
+  macro(GPU_ACTIVITY_MEMCPY)
+
+#define FORALL_OPENCL_MEM_TYPES(macro)					\
+  macro(GPU_MEMCPY_UNK)							\
+  macro(GPU_MEMCPY_H2D)           \
   macro(GPU_MEMCPY_D2H)
 
+
 #define CODE_TO_STRING(e) case e: return #e;
 
 #define opencl_path() "libOpenCL.so"
@@ -276,19 +283,32 @@ opencl_wait_for_pending_operations
 
 
 static const char*
-opencl_call_to_string
+opencl_kind_to_string
 (
-  gpu_memcpy_type_t type
+ gpu_activity_kind_t kind
 )
 {
-  switch (type)
+  switch (kind)
   {
-    FORALL_OPENCL_CALLS(CODE_TO_STRING)
-    default: return "CL_unknown_call";
+    FORALL_OPENCL_KINDS(CODE_TO_STRING)
+    default: return "CL_unknown_kind";
   }
 }
 
 
+static const char*
+opencl_type_to_string
+(
+gpu_memcpy_type_t kind
+)
+{
+  switch (kind)
+  {
+    FORALL_OPENCL_MEM_TYPES(CODE_TO_STRING)
+    default: return "CL_unknown_type";
+  }
+}
+
 static const char*
 opencl_error_report
 (
@@ -302,11 +322,60 @@ opencl_error_report
 }
 
 
+static bool
+opencl_in_correlation_map(cl_basic_callback_t cb_basic){
+  gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(cb_basic.correlation_id);
+  if (cid_map_entry == NULL) {
+    ETMSG(OPENCL, "Activity not in correlation map \n");
+    opencl_cb_basic_print(cb_basic, "NOT in Correlation map");
+    return false;
+  }
+  return true;
+}
 
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
+// Collapse the kernel- or memcpy-specific callback payload in cb_data into
+// the kind-independent summary: correlation id, activity kind, memcpy type.
+cl_basic_callback_t
+opencl_cb_basic_get
+(
+opencl_object_t *cb_data
+)
+{
+  // Fully initialize first: for any kind other than KERNEL or MEMCPY the
+  // struct used to be returned with indeterminate field values.
+  cl_basic_callback_t cb_basic = { .correlation_id = 0, .kind = cb_data->kind, .type = 0 };
+
+  if (cb_data->kind == GPU_ACTIVITY_KERNEL) {
+    cb_basic.correlation_id = cb_data->details.ker_cb.correlation_id; // type stays 0: not valid
+  } else if (cb_data->kind == GPU_ACTIVITY_MEMCPY) {
+    cb_basic.correlation_id = cb_data->details.mem_cb.correlation_id;
+    cb_basic.type = cb_data->details.mem_cb.type;
+  }
+
+  return cb_basic;
+}
+
+void
+opencl_cb_basic_print
+(
+ cl_basic_callback_t cb_basic,
+ char *title
+)
+{
+
+  ETMSG(OPENCL, " %s | Activity kind: %s | type: %s | correlation id: %"PRIu64 "",
+        title,
+        opencl_kind_to_string(cb_basic.kind),
+        opencl_type_to_string(cb_basic.type),
+        cb_basic.correlation_id);
+
+}
+
+
 void
 opencl_initialize_correlation_id
 (
@@ -381,28 +450,13 @@ opencl_activity_completion_callback
   void *user_data
 )
 {
-  cl_int complete_flag = CL_COMPLETE;
   opencl_object_t *cb_data = (opencl_object_t*)user_data;
-  cl_generic_callback_t *act_data;
+  cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
 
-  if (cb_data->kind == GPU_ACTIVITY_KERNEL) {
-    act_data = (cl_generic_callback_t*) &(cb_data->details.ker_cb);
-  } else if (cb_data->kind == GPU_ACTIVITY_MEMCPY) {
-    act_data = (cl_generic_callback_t*) &(cb_data->details.mem_cb);
-  }
-  uint64_t correlation_id = act_data->correlation_id;
-  gpu_memcpy_type_t type = act_data->type;
-
-  if (event_command_exec_status == complete_flag) {
-    gpu_correlation_id_map_entry_t *cid_map_entry = 
-      gpu_correlation_id_map_lookup(correlation_id);
-    if (cid_map_entry == NULL) {
-      ETMSG(OPENCL, "completion callback was called before registration " 
-	    "callback. type: %d, correlation: %"PRIu64 "", type, 
-	    correlation_id);
-    }
-    ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", 
-	  opencl_call_to_string(type), correlation_id);
+  if (event_command_exec_status == CL_COMPLETE) {
+    opencl_in_correlation_map(cb_basic);
+
+    opencl_cb_basic_print(cb_basic, "Completion_Callback");
 
     opencl_activity_process(event, cb_data);
   }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index 4e874485db..7cd00fb3b8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -67,12 +67,28 @@
 // interface operations
 //******************************************************************************
 
+cl_basic_callback_t
+opencl_cb_basic_get
+(
+  opencl_object_t *cb_data
+);
+
+
+void
+opencl_cb_basic_print
+(
+  cl_basic_callback_t cb_basic,
+  char *title
+);
+
+
 void
 opencl_initialize_correlation_id
 (
   void
 );
 
+
 void
 opencl_subscriber_callback
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 778b835e4b..b24ebbc14f 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -167,7 +167,7 @@ clEnqueueNDRangeKernel_wrapper
 				   local_work_size, num_events_in_wait_list, 
 				   event_wait_list, eventp);
 
-  ETMSG(OPENCL, "registering callback for type: kernel. " 
+  ETMSG(OPENCL, "Registering callback for kind: Kernel. "
 	"Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
@@ -212,7 +212,7 @@ clEnqueueReadBuffer_wrapper
 				cb, ptr, num_events_in_wait_list, 
 				event_wait_list, eventp);
 
-  ETMSG(OPENCL, "registering callback for type: D2H. " 
+  ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
 	"Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
   ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", 
 	(long)cb);
@@ -263,7 +263,7 @@ clEnqueueWriteBuffer_wrapper
 				 cb, ptr, num_events_in_wait_list, 
 				 event_wait_list, eventp);
 
-  ETMSG(OPENCL, "registering callback for type: H2D. " 
+  ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
 	"Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
 
   ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 3069c44d47..1ad4210ddd 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -113,15 +113,15 @@ typedef cl_int (*clwritebuffer_t)(
 );
 
 
-typedef struct cl_generic_callback_t {
+typedef struct cl_basic_callback_t {
   uint64_t correlation_id;
+  gpu_activity_kind_t kind;
   gpu_memcpy_type_t type;
-} cl_generic_callback_t;
+} cl_basic_callback_t;
 
 
 typedef struct cl_kernel_callback_t {
   uint64_t correlation_id;
-  gpu_memcpy_type_t type;
 } cl_kernel_callback_t;
 
 

From c94c5251448d3b1b002c42e9933f92933a01aaa8 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 26 Sep 2020 02:35:02 +0000
Subject: [PATCH 044/177] Dump debug info extracted from GTPin's API

---
 src/lib/binutils/intel/IntelGPUBinutils.cpp   |   8 +-
 .../instrumentation/opencl-instrumentation.c  | 205 +++++++++---------
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  72 +-----
 3 files changed, 109 insertions(+), 176 deletions(-)

diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index 7792ffb36e..f23a0aaf12 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -391,12 +391,6 @@ findIntelGPUBins
     return false;
   }
 
-  auto iter = file_name.rfind("/");
-  if (iter == std::string::npos) {
-    return false;
-  }
-  std::string dir_name = file_name.substr(0, iter + 1);
-  
   for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
     const SKernelDebugDataHeaderIGC *kernel_header =
       reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
@@ -409,7 +403,7 @@ findIntelGPUBins
 
     if (kernel_header->SizeVisaDbgInBytes > 0) {
       std::stringstream ss;
-      ss << dir_name << kernel_name << ".gpubin";
+      ss << file_name << "." << kernel_name;
 
       size_t kernel_size = kernel_header->SizeVisaDbgInBytes;
       char *kernel_buffer = (char *)malloc(kernel_size);
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 4b1eae102d..06519445cf 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -63,6 +63,8 @@
 
 #include <hpcrun/safe-sampling.h>
 #include <hpcrun/cct/cct.h>
+#include <hpcrun/memory/hpcrun-malloc.h>
+#include <hpcrun/files.h>
 #include <hpcrun/gpu/gpu-activity-process.h>
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-application-thread-api.h>
@@ -72,11 +74,13 @@
 #include <hpcrun/gpu/gpu-op-placeholders.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
-#include <hpcrun/memory/hpcrun-malloc.h>
-#include <hpcrun/utilities/hpcrun-nanotime.h>
-#include <hpcrun/gpu/opencl/opencl-intercept.h>
-#include <hpcrun/files.h>
 #include <hpcrun/gpu/opencl/opencl-api.h>
+#include <hpcrun/gpu/opencl/opencl-intercept.h>
+#include <hpcrun/utilities/hpcrun-nanotime.h>
+
+#include <lib/prof-lean/crypto-hash.h>
+#include <lib/prof-lean/spinlock.h>
+
 #include "opencl-instrumentation.h"
 
 
@@ -90,6 +94,7 @@
 // TODO(Aaron): Why there are so many correlation ids
 static atomic_long correlation_id;
 
+static spinlock_t files_lock = SPINLOCK_UNLOCKED;
 
 //******************************************************************************
 // private operations
@@ -155,97 +160,108 @@ createKernelNode
 }
 
 
-static int32_t
-findOrAddKernelModule
+static bool
+writeBinary
 (
- const char *input_kernel_name
+ const char *file_name,
+ const void *binary,
+ size_t binary_size
 )
 {
-  char path_name[PATH_MAX];
-  size_t used = 0;
-  used += sprintf(&path_name[used], "%s", hpcrun_files_output_directory());
-  used += sprintf(&path_name[used], "%s", "/intel/");
+  int fd;
+  errno = 0;
+  fd = open(file_name, O_WRONLY | O_CREAT | O_EXCL, 0644);
+  if (fd < 0 && errno == EEXIST) {
+    // The file already exists, so open() returned no descriptor to close.
+    return true;
+  }
+  if (fd >= 0) {
+    // Newly created file: report success only if the whole binary was written.
+    if (write(fd, binary, binary_size) != binary_size) {
+      close(fd);
+      return false;
+    } else {
+      close(fd);
+      return true;
+    }
+  } else {
+    // Failure to open is a fatal error.
+    hpcrun_abort("hpctoolkit: unable to open file: '%s'", file_name);
+    return false;
+  }
+}
+
+
+void
+computeBinaryHash
+(
+ const char *binary,
+ size_t binary_size,
+ char *file_name
+)
+{
+  // Compute hash for the binary
+  unsigned char hash[HASH_LENGTH];
+  crypto_hash_compute(binary, binary_size, hash, HASH_LENGTH);
 
-  DIR *FD;
-  if (NULL == (FD = opendir(path_name))) {
-    return -1;
+  size_t i;
+  size_t used = 0;
+  used += sprintf(&file_name[used], "%s", hpcrun_files_output_directory());
+  used += sprintf(&file_name[used], "%s", "/intel/");
+  mkdir(file_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+  for (i = 0; i < HASH_LENGTH; ++i) {
+    used += sprintf(&file_name[used], "%02x", hash[i]);
   }
+  used += sprintf(&file_name[used], "%s", ".gpubin");
+}
 
-  int module_id = -1;
-  struct dirent *in_file;
-  while ((in_file = readdir(FD))) {
-    if (!strstr(in_file->d_name, ".debuginfo")) {
-      continue;
-    }
 
-    char buffer[PATH_MAX];
-    used = 0;
-    used = sprintf(&buffer[used], "%s", path_name);
-    used = sprintf(&buffer[used], "%s", in_file->d_name);
-
-    FILE *fptr = fopen(buffer, "rb");
-    fseek(fptr, 0L, SEEK_END);
-    size_t debug_info_size = ftell(fptr);
-    rewind(fptr);
-    char *debug_info = (char *)malloc(debug_info_size);
-    fread(debug_info, debug_info_size, 1, fptr);
-
-    const char *ptr = debug_info;
-    const SProgramDebugDataHeaderIGC *header = (const SProgramDebugDataHeaderIGC *)(ptr);
-    ptr += sizeof(SProgramDebugDataHeaderIGC);
-
-    ETMSG(OPENCL, "Number of kernels: %d", header->NumberOfKernels);
-    for (uint32_t i = 0; i < header->NumberOfKernels; ++i) {
-      const SKernelDebugDataHeaderIGC* kernel_header = (const SKernelDebugDataHeaderIGC*)(ptr);
-      ptr += sizeof(SKernelDebugDataHeaderIGC);
-
-      const char *kernel_name = (const char *)(ptr);
-      if (kernel_header->SizeVisaDbgInBytes > 0 && strcmp(kernel_name, input_kernel_name) == 0) {
-        // Create file name
-        char file_name[PATH_MAX];
-        size_t i;
-        size_t used = 0;
-        used += sprintf(&file_name[used], "%s", hpcrun_files_output_directory());
-        used += sprintf(&file_name[used], "%s", "/intel/");
-        used += sprintf(&file_name[used], "%s", kernel_name);
-        used += sprintf(&file_name[used], "%s", ".gpubin");
-
-        #if 0
-        // Write a file if does not exist
-        bool file_flag;
-        spinlock_lock(&files_lock);
-        file_flag = writeBinary(file_name, binary, binary_size);
-        spinlock_unlock(&files_lock);
-        #endif
-
-        hpcrun_loadmap_lock();
-        load_module_t *module = hpcrun_loadmap_findByName(file_name);
-        if (module == NULL) {
-          module_id = hpcrun_loadModule_add(file_name);
-        } else {
-          // Find module
-          module_id = module->id;
-        }
-        hpcrun_loadmap_unlock();
-
-        break;
-      }
-
-      // TODO(Aaron): Should be zero for newest drivers (what does it mean?)
-      assert(kernel_header->SizeGenIsaDbgInBytes == 0);
-
-      ptr += kernel_header->SizeVisaDbgInBytes;
-      ptr += kernel_header->SizeGenIsaDbgInBytes;
-    }
+static uint32_t
+findOrAddKernelModule
+(
+ GTPinKernel kernel
+)
+{
+  char kernel_name[MAX_STR_SIZE];
+  GTPINTOOL_STATUS status;
+  // Saves the kernel's ELF as <output dir>/intel/<hash>.gpubin (if absent) and returns the id of load module "<that path>.<kernel name>".
+  status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-    free(debug_info);
-    fclose(fptr);
+  uint32_t kernel_elf_size = 0;
+  status = GTPin_GetElf(kernel, 0, NULL, &kernel_elf_size);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-    if (module_id != -1) {
-      // Find module
-      break;
-    }
+  char *kernel_elf = (char *)malloc(sizeof(char) * kernel_elf_size);
+  status = GTPin_GetElf(kernel, kernel_elf_size, kernel_elf, NULL);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+  // Create file name
+  char file_name[PATH_MAX];
+  memset(file_name, 0, PATH_MAX);
+  computeBinaryHash(kernel_elf, kernel_elf_size, file_name);
+
+  // Write the file if it does not exist yet
+  spinlock_lock(&files_lock);
+  writeBinary(file_name, kernel_elf, kernel_elf_size);
+  spinlock_unlock(&files_lock);
+
+  free(kernel_elf);
+  // Bound each append by the space left in file_name, not by the source length.
+  strncat(file_name, ".", PATH_MAX - strlen(file_name) - 1);
+  strncat(file_name, kernel_name, PATH_MAX - strlen(file_name) - 1);
+
+  uint32_t module_id = 0;
+
+  hpcrun_loadmap_lock();
+  load_module_t *module = hpcrun_loadmap_findByName(file_name);
+  if (module == NULL) {
+    module_id = hpcrun_loadModule_add(file_name);
+  } else {
+    // Find module
+    module_id = module->id;
   }
+  hpcrun_loadmap_unlock();
 
   return module_id;
 }
@@ -316,6 +332,11 @@ onKernelBuild
   data.kernel_cct_correlation_id = correlation_id;
   createKernelNode(correlation_id);
 
+  data.call_count = 0;
+  data.loadmap_module_id = findOrAddKernelModule(kernel);
+
+  kernel_data_map_insert1((uint64_t)kernel, data);
+
   mem_pair_node *h;
   mem_pair_node *current;
   bool isHeadNull = true;
@@ -354,25 +375,11 @@ onKernelBuild
   }
 
   gpu_activity_channel_consume(gpu_metrics_attribute);
-
-  char kernel_name[MAX_STR_SIZE];
-  status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
-  assert(status == GTPINTOOL_STATUS_SUCCESS);
   // 
   // m->next = NULL;
   // add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
   // XXX(Aaron): what is this for?
   //data.name = kernel_name;
-  data.call_count = 0;
-
-  int32_t module_id = findOrAddKernelModule(kernel_name);
-  if (module_id != -1) {
-    data.loadmap_module_id = module_id;
-  } else {
-    ETMSG(OPENCL, "onKernelComplete cannot find kernel %d\n", kernel_name);
-  }
-
-  kernel_data_map_insert1((uint64_t)kernel, data);
   ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index b0a4abbb6b..0479a514d3 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -74,8 +74,6 @@
 #include <hpcrun/files.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
 #include <lib/prof-lean/stdatomic.h>
-#include <lib/prof-lean/spinlock.h>
-#include <lib/prof-lean/crypto-hash.h>
 #include <lib/prof-lean/usec_time.h>
 
 #include "opencl-api.h"
@@ -332,7 +330,6 @@ OPENCL_FN
 static atomic_ullong opencl_pending_operations;
 static atomic_long correlation_id;
 
-static spinlock_t files_lock = SPINLOCK_UNLOCKED;
 
 #define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
 #define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
@@ -382,73 +379,6 @@ initializeMemoryCallBackInfo
 }
 
 
-static bool
-writeBinary
-(
- const char *file_name,
- const void *binary,
- size_t binary_size
-)
-{
-  int fd;
-  errno = 0;
-  fd = open(file_name, O_WRONLY | O_CREAT | O_EXCL, 0644);
-  if (errno == EEXIST) {
-    close(fd);
-    return true;
-  }
-  if (fd >= 0) {
-    // Success
-    if (write(fd, binary, binary_size) != binary_size) {
-      close(fd);
-      return false;
-    } else {
-      close(fd);
-      return true;
-    }
-  } else {
-    // Failure to open is a fatal error.
-    hpcrun_abort("hpctoolkit: unable to open file: '%s'", file_name);
-    return false;
-  }
-}
-
-
-void
-writeHashBinary
-(
- const void *binary,
- size_t binary_size,
- bool is_debug_info
-)
-{
-  // Compute hash for the binary
-  unsigned char hash[HASH_LENGTH];
-  crypto_hash_compute(binary, binary_size, hash, HASH_LENGTH);
-
-  // Create file name
-  char file_name[PATH_MAX];
-  size_t i;
-  size_t used = 0;
-  used += sprintf(&file_name[used], "%s", hpcrun_files_output_directory());
-  used += sprintf(&file_name[used], "%s", "/intel/");
-  mkdir(file_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-  for (i = 0; i < HASH_LENGTH; ++i) {
-    used += sprintf(&file_name[used], "%02x", hash[i]);
-  }
-  if (is_debug_info) {
-    used += sprintf(&file_name[used], "%s", ".debuginfo");
-  } else {
-    // XXX(Aaron): we do not use this file for now
-    used += sprintf(&file_name[used], "%s", ".gpumain");
-  }
-
-  // Write a file if does not exist
-  spinlock_lock(&files_lock);
-  writeBinary(file_name, binary, binary_size);
-  spinlock_unlock(&files_lock);
-}
-
 #if 0
 static char*
 getKernelNameFromSourceCode
@@ -480,6 +410,7 @@ clBuildProgramCallback
 	void* user_data
 )
 {
+#if 0
   // TODO(Aaron): where do you get device_count?
   int device_count = 1;
   cl_int status = CL_SUCCESS;
@@ -526,6 +457,7 @@ clBuildProgramCallback
   free(debug_info_size);
 
   ETMSG(OPENCL, "Intel GPU files dumped successfully");
+#endif
 }
 
 

From 679a5c7e153baaddac0a3c9d8d05e9b8c163c102 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Sat, 26 Sep 2020 15:49:54 -0500
Subject: [PATCH 045/177] operation_channel is working, item must have
 return_channel*, activity

---
 src/lib/prof-lean/stacks.c                    |  2 +-
 .../hpcrun/gpu/gpu-activity-multiplexer.c     | 54 +++++++++++++------
 .../hpcrun/gpu/gpu-activity-multiplexer.h     | 36 +++++++++++--
 src/tool/hpcrun/gpu/gpu-activity.c            | 54 ++++++++++++++++---
 src/tool/hpcrun/gpu/gpu-activity.h            | 12 +++++
 .../hpcrun/gpu/gpu-operation-channel-set.c    |  6 +--
 .../hpcrun/gpu/gpu-operation-channel-set.h    |  3 +-
 src/tool/hpcrun/gpu/gpu-operation-channel.c   | 29 ++++++----
 .../hpcrun/gpu/gpu-operation-item-process.c   |  6 +--
 src/tool/hpcrun/gpu/gpu-operation-item.c      |  2 +-
 src/tool/hpcrun/gpu/gpu-operation-item.h      |  3 +-
 src/tool/hpcrun/gpu/gpu-trace.c               |  7 ++-
 .../gpu/opencl/opencl-activity-translate.c    |  2 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 45 ++--------------
 14 files changed, 166 insertions(+), 95 deletions(-)

diff --git a/src/lib/prof-lean/stacks.c b/src/lib/prof-lean/stacks.c
index 3836f82a1b..e21905d0d6 100644
--- a/src/lib/prof-lean/stacks.c
+++ b/src/lib/prof-lean/stacks.c
@@ -115,7 +115,7 @@ sstack_pop
 {
   s_element_t *e = (s_element_t *) atomic_load_explicit(&Ap(q), memory_order_relaxed);
   if (e) {
-    s_element_t *next = 
+    s_element_t *next =
       (s_element_t *) atomic_load_explicit(&(e->Ad(next)), memory_order_relaxed);
     atomic_store_explicit(&Ap(q), next, memory_order_relaxed);
     atomic_store_explicit(&(e->Ad(next)), 0, memory_order_relaxed);
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index e7353974b3..27d15bb024 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -75,6 +75,8 @@ static __thread uint32_t my_operation_set_id = -1;
 static __thread gpu_operation_channel_t *gpu_operation_channel = NULL;
 static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
 
+
+
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -95,16 +97,17 @@ void
 )
 {
 
-
   while (!atomic_load(&stop_activity_flag)){
 
     for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
-      gpu_operation_channel_set_consume(set_index);
+      gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
+      gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
     }
   }
 
   for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
-    gpu_operation_channel_set_consume(set_index);
+    gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
+    gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
   }
 
   return NULL;
@@ -112,7 +115,7 @@ void
 
 
 static void
-gpu_activity_multiplexer_init
+gpu_activity_multiplexer_create
 (
 void
 )
@@ -133,13 +136,24 @@ void
 // interface operations
 //******************************************************************************
 
+bool
+gpu_activity_is_multiplexer_initialized
+(
+ void
+)
+{
+  return (my_operation_set_id != -1);
+}
+
+
 void
-gpu_operation_release
+gpu_activity_multiplexer_init
 (
-gpu_operation_channel_t *channel
+ void
 )
 {
-  atomic_fetch_add(&operation_stream_counter, -1);
+  pthread_once(&is_initialized, gpu_activity_multiplexer_create);
+  gpu_init_operation_channel();
 }
 
 
@@ -153,13 +167,15 @@ void
 
   atomic_store(&stop_activity_flag, true);
 
-//  gpu_context_stream_map_signal_all();
+  for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
+    gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, set_index);
+  }
+
 
 //  while (atomic_load(&operation_stream_counter));
 }
 
 
-
 void
 gpu_activity_multiplexer_push
 (
@@ -167,20 +183,24 @@ gpu_activity_channel_t *initiator_channel,
 gpu_activity_t *gpu_activity
 )
 {
+  gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
+  gpu_operation_channel_produce(gpu_operation_channel, &item);
 
-  pthread_once(&is_initialized, gpu_activity_multiplexer_init);
-
-  if (my_operation_set_id == -1){
-    gpu_init_operation_channel();
-  }
+//  atomic_fetch_add(&operation_stream_counter, +1);
 
-  gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=gpu_activity};
-  gpu_operation_channel_produce(gpu_operation_channel_get(), &item);
+}
 
-//  atomic_fetch_add(&operation_stream_counter, +1);
 
+void
+gpu_operation_release
+(
+gpu_operation_channel_t *channel
+)
+{
+  atomic_fetch_add(&operation_stream_counter, -1);
 }
 
+
 void
 gpu_activity_multiplexer_release
 (
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
index 1498a96a3d..dc3832bf3f 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
@@ -52,6 +52,7 @@
 // type declarations
 //******************************************************************************
 typedef struct gpu_activity_channel_t gpu_activity_channel_t;
+typedef struct gpu_activity_t gpu_activity_t;
 
 //******************************************************************************
 // local variables
@@ -62,14 +63,33 @@ typedef struct gpu_activity_channel_t gpu_activity_channel_t;
 // private operations
 //******************************************************************************
 
-void
-gpu_activity_multiplexer_fini(void);
-
 
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
+
+bool
+gpu_activity_is_multiplexer_initialized
+(
+void
+);
+
+
+void
+gpu_activity_multiplexer_init
+(
+void
+);
+
+
+void
+gpu_activity_multiplexer_fini
+(
+void
+);
+
+
 void
 gpu_activity_multiplexer_push
 (
@@ -79,11 +99,17 @@ gpu_activity_t *gpu_activity
 
 
 void
-gpu_activity_multiplexer_release(void);
+gpu_activity_multiplexer_release
+(
+void
+);
 
 
 void
-gpu_operation_release(gpu_operation_channel_t *channel);
+gpu_operation_release
+(
+gpu_operation_channel_t *channel
+);
 
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c
index 39000392ad..03ab59bbe9 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.c
+++ b/src/tool/hpcrun/gpu/gpu-activity.c
@@ -50,6 +50,16 @@
 
 
 
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "gpu-activity.h"
+#include "gpu-channel-item-allocator.h"
+#include "gpu-print.h"
+
+
+
 //******************************************************************************
 // macros
 //******************************************************************************
@@ -58,16 +68,18 @@
 
 #define DEBUG 0
 
-#include "gpu-print.h"
 
+#define FORALL_OPENCL_KINDS(macro)					\
+  macro(GPU_ACTIVITY_UNKNOWN)							\
+  macro(GPU_ACTIVITY_KERNEL)           \
+  macro(GPU_ACTIVITY_MEMCPY)
 
+#define FORALL_OPENCL_MEM_TYPES(macro)					\
+  macro(GPU_MEMCPY_UNK)							\
+  macro(GPU_MEMCPY_H2D)           \
+  macro(GPU_MEMCPY_D2H)
 
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include "gpu-activity.h"
-#include "gpu-channel-item-allocator.h"
+#define CODE_TO_STRING(e) case e: return #e;
 
 
 
@@ -149,3 +161,31 @@ set_gpu_interval
   interval->start = start;
   interval->end = end;
 }
+
+
+const char*
+gpu_kind_to_string
+(
+gpu_activity_kind_t kind
+)
+{
+  switch (kind)
+  {
+    FORALL_OPENCL_KINDS(CODE_TO_STRING)
+    default: return "CL_unknown_kind";
+  }
+}
+
+
+const char*
+gpu_type_to_string
+(
+gpu_memcpy_type_t type
+)
+{
+  switch (type)
+  {
+    FORALL_OPENCL_MEM_TYPES(CODE_TO_STRING)
+    default: return "CL_unknown_type";
+  }
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 8e08df20d5..ca40ec5f3a 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -434,5 +434,17 @@ gpu_context_activity_dump
 );
 
 
+const char*
+gpu_kind_to_string
+(
+gpu_activity_kind_t kind
+);
+
+
+const char*
+gpu_type_to_string
+(
+gpu_memcpy_type_t type
+);
 
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index c020de1b26..5b2d6f5283 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -177,13 +177,13 @@ gpu_operation_channel_set_insert
 
 
 void
-gpu_operation_channel_set_consume
+gpu_operation_channel_set_apply
 (
+ gpu_operation_channel_fn_t channel_fn,
  int set_index
 )
 {
-  gpu_operation_channel_set_forall(gpu_operation_channel_consume, set_index);
-  gpu_operation_channel_set_forall(gpu_operation_channel_await, set_index);
+  gpu_operation_channel_set_forall(channel_fn, set_index);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
index 8917496573..5faacc838c 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
@@ -83,8 +83,9 @@ gpu_operation_channel_set_insert
 
 
 void
-gpu_operation_channel_set_consume
+gpu_operation_channel_set_apply
 (
+ gpu_operation_channel_fn_t channel_fn,
  int set_index
 );
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index 2f6067faaf..90ed030cab 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -88,7 +88,8 @@
 #define channel_steal \
   typed_bichannel_steal(gpu_operation_item_t)
 
-#define SECONDS_UNTIL_WAKEUP 2
+
+#define SECONDS_UNTIL_WAKEUP 1
 
 
 
@@ -180,14 +181,16 @@ gpu_operation_channel_produce
  gpu_operation_item_t *it
 )
 {
-  gpu_operation_item_t *channel_it = gpu_operation_item_alloc(channel);
-//  channel_it->channel = it->channel;
-//  channel_it->activity = it->activity;
-  *channel_it = *it;
+  gpu_operation_item_t *new_item = gpu_operation_item_alloc(channel);
+  *new_item = *it;
 
-  printf("\nPRODUCE: channel = %p || return_channel = %p -> activity = %p\n\n", channel, channel_it->channel, channel_it->activity);
+  printf("\nPRODUCE: channel = %p || return_channel = %p -> activity = %p | corr = %u kind = %s, type = %s\n\n",
+         channel, new_item->channel, &new_item->activity,
+         (new_item->activity.kind == GPU_ACTIVITY_MEMCPY)?new_item->activity.details.memcpy.correlation_id:new_item->activity.details.kernel.correlation_id,
+         gpu_kind_to_string(new_item->activity.kind),
+         gpu_type_to_string(new_item->activity.details.memcpy.copyKind));
 
-  channel_push(channel, bichannel_direction_forward, channel_it);
+  channel_push(channel, bichannel_direction_forward, new_item);
 
   gpu_operation_channel_signal_consumer_when_full(channel);
 
@@ -211,10 +214,14 @@ gpu_operation_channel_consume
   for (;;) {
     gpu_operation_item_t *it = channel_pop(channel, bichannel_direction_forward);
 
-    if (!it) {
-      break;
-    }
-    printf("\n---------CONSUME: op_channel = %p || channel = %p , activity = %p\n", channel, it->channel, it->activity);
+    if (!it) break;
+
+    printf("\n---------CONSUME: op_channel = %p || channel = %p , activity = %p | corr = %u, kind = %s, type = %s\n",
+           channel, it->channel, &it->activity,
+           (it->activity.kind == GPU_ACTIVITY_MEMCPY)?it->activity.details.memcpy.correlation_id:it->activity.details.kernel.correlation_id,
+           gpu_kind_to_string(it->activity.kind),
+           gpu_type_to_string(it->activity.details.memcpy.copyKind));
+
     gpu_operation_item_consume(gpu_operation_item_process, it);
     gpu_operation_item_free(channel, it);
   }
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index 4f0a6f1e54..a7a14fe12f 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -116,7 +116,7 @@ gpu_memcpy_process
 gpu_operation_item_t *it
 )
 {
-  gpu_activity_t *activity = it->activity;
+  gpu_activity_t *activity = &it->activity;
   gpu_activity_channel_t *channel = it->channel;
 
   assert(activity->cct_node != NULL);
@@ -143,7 +143,7 @@ gpu_kernel_process
 gpu_operation_item_t *it
 )
 {
-  gpu_activity_t *activity = it->activity;
+  gpu_activity_t *activity = &it->activity;
   gpu_activity_channel_t *channel = it->channel;
 
   gpu_trace_item_t entry_trace;
@@ -183,7 +183,7 @@ gpu_operation_item_t *it
 )
 {
 
-  switch (it->activity->kind) {
+  switch (it->activity.kind) {
 
   case GPU_ACTIVITY_MEMCPY:
     gpu_memcpy_process(it);
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.c b/src/tool/hpcrun/gpu/gpu-operation-item.c
index c26bc879e9..bb34d56827 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.c
@@ -94,7 +94,7 @@ gpu_operation_item_consume
  gpu_operation_item_t *it
 )
 {
-  gpu_context_activity_dump(it->activity, "CONSUME");
+  gpu_context_activity_dump(&it->activity, "CONSUME");
   ap_fn(it);
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.h b/src/tool/hpcrun/gpu/gpu-operation-item.h
index d994be4582..a4e6e0be6f 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.h
@@ -78,8 +78,9 @@ typedef struct gpu_operation_channel_t gpu_operation_channel_t;
 //******************************************************************************
 
 typedef struct gpu_operation_item_t{
+  s_element_t next;
   gpu_activity_channel_t *channel;
-  gpu_activity_t *activity;
+  gpu_activity_t activity;
 } gpu_operation_item_t;
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 34efdeb0ef..57c57d6b7e 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -421,8 +421,8 @@ gpu_trace_fini
   while (atomic_load(&stream_counter));
 }
 
-void *
-schedule_multi_threads
+static void
+gpu_trace_channel_set_append
 (
  gpu_trace_t *trace
 )
@@ -457,7 +457,6 @@ schedule_multi_threads
 
   PRINT("set_index = %d -> stream = %u\n", num_threads, num_streams);
 
-  return NULL;
 }
 
 
@@ -473,7 +472,7 @@ gpu_trace_create
   // Create a new thread for the stream without libmonitor watching
   monitor_disable_new_threads();
 
-  schedule_multi_threads(trace);
+  gpu_trace_channel_set_append(trace);
 
   monitor_enable_new_threads();
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 08a74173b8..5d833a509b 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -103,7 +103,7 @@ convert_memcpy
   ga->details.memcpy.correlation_id  = cb_data->details.mem_cb.correlation_id;
   ga->details.memcpy.submit_time     = cb_data->details.submit_time;
   ga->details.memcpy.bytes           = cb_data->details.mem_cb.size;
-  ga->details.memcpy.copyKind        = cb_data->kind;
+  ga->details.memcpy.copyKind        = cb_data->details.mem_cb.type;
 }
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index a206d76966..ba4272e45b 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -145,16 +145,6 @@
   macro(CL_INVALID_LINKER_OPTIONS)					\
   macro(CL_INVALID_DEVICE_PARTITION_COUNT)
 
-#define FORALL_OPENCL_KINDS(macro)					\
-  macro(GPU_ACTIVITY_UNKNOWN)							\
-  macro(GPU_ACTIVITY_KERNEL)           \
-  macro(GPU_ACTIVITY_MEMCPY)
-
-#define FORALL_OPENCL_MEM_TYPES(macro)					\
-  macro(GPU_MEMCPY_UNK)							\
-  macro(GPU_MEMCPY_H2D)           \
-  macro(GPU_MEMCPY_D2H)
-
 
 #define CODE_TO_STRING(e) case e: return #e;
 
@@ -265,6 +255,9 @@ opencl_activity_process
   gpu_activity_t gpu_activity;
   opencl_activity_translate(&gpu_activity, event, cb_data);
 
+  if (gpu_activity_is_multiplexer_initialized() == false){
+    gpu_activity_multiplexer_init();
+  }
   gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
 //  gpu_activity_process(&gpu_activity);
 }
@@ -282,33 +275,6 @@ opencl_wait_for_pending_operations
 }
 
 
-static const char*
-opencl_kind_to_string
-(
- gpu_activity_kind_t kind
-)
-{
-  switch (kind)
-  {
-    FORALL_OPENCL_KINDS(CODE_TO_STRING)
-    default: return "CL_unknown_kind";
-  }
-}
-
-
-static const char*
-opencl_type_to_string
-(
-gpu_memcpy_type_t kind
-)
-{
-  switch (kind)
-  {
-    FORALL_OPENCL_MEM_TYPES(CODE_TO_STRING)
-    default: return "CL_unknown_type";
-  }
-}
-
 static const char*
 opencl_error_report
 (
@@ -369,8 +335,8 @@ opencl_cb_basic_print
 
   ETMSG(OPENCL, " %s | Activity kind: %s | type: %s | correlation id: %"PRIu64 "",
         title,
-        opencl_kind_to_string(cb_basic.kind),
-        opencl_type_to_string(cb_basic.type),
+        gpu_kind_to_string(cb_basic.kind),
+        gpu_type_to_string(cb_basic.type),
         cb_basic.correlation_id);
 
 }
@@ -457,7 +423,6 @@ opencl_activity_completion_callback
     opencl_in_correlation_map(cb_basic);
 
     opencl_cb_basic_print(cb_basic, "Completion_Callback");
-
     opencl_activity_process(event, cb_data);
   }
   if (cb_data->isInternalClEvent) {

From 39f1a8f4876404abe2b1fbe2900fd673c0103d76 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 26 Sep 2020 21:29:14 +0000
Subject: [PATCH 046/177] Fix hpcstruct

---
 src/lib/banal/Struct-Output.cpp             |  39 ----
 src/lib/banal/Struct-Output.hpp             |  37 ---
 src/lib/banal/Struct.cpp                    |  24 +-
 src/lib/binutils/ElfHelper.cpp              |   1 -
 src/lib/binutils/InputFile.cpp              |   3 +-
 src/lib/binutils/InputFile.hpp              |   2 +-
 src/lib/binutils/LM.cpp                     |   2 +-
 src/lib/binutils/intel/IntelGPUBinutils.cpp | 243 --------------------
 src/lib/binutils/intel/IntelGPUBinutils.hpp |   6 -
 9 files changed, 11 insertions(+), 346 deletions(-)

diff --git a/src/lib/banal/Struct-Output.cpp b/src/lib/banal/Struct-Output.cpp
index eefa8963ca..48d5839099 100644
--- a/src/lib/banal/Struct-Output.cpp
+++ b/src/lib/banal/Struct-Output.cpp
@@ -72,11 +72,8 @@
 #include <map>
 #include <ostream>
 #include <string>
-#include <fcntl.h>
 
 #include <lib/binutils/VMAInterval.hpp>
-#include <lib/binutils/ElfHelper.hpp>
-#include <lib/binutils/intel/IntelGPUBinutils.hpp>
 #include <lib/support/FileUtil.hpp>
 #include <lib/support/StringTable.hpp>
 #include <lib/support/dictionary.h>
@@ -253,42 +250,6 @@ printLoadModuleEnd(ostream * os)
   *os << "</LM>\n";
 }
 
-//----------------------------------------------------------------------
-void
-printBlockAndInstructionOffset(ostream * os, string file_name)
-{
-	ElfFile *elfFile = new ElfFile;
-	int file_fd = open(file_name.c_str(), O_RDONLY);
-	size_t f_size = file_size(file_fd);
-	char  *file_buffer = (char *) malloc(f_size);
-	size_t bytes = read_all(file_fd, file_buffer, f_size);
-	bool result = elfFile->open(file_buffer, f_size, file_name);
-
-	Elf *elf = elfFile->getElf();
-	file_buffer = elfFile->getMemory();
-	ElfSectionVector *sections = elfGetSectionVector(elf);
-	GElf_Ehdr ehdr_v;
-	GElf_Ehdr *ehdr = gelf_getehdr(elf, &ehdr_v);
-
-	if (ehdr) {
-		for (auto si = sections->begin(); si != sections->end(); si++) {
-			Elf_Scn *scn = *si;
-			GElf_Shdr shdr_v;
-			GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_v);
-			if (!shdr) continue;
-			char *sectionData = elfSectionGetData(file_buffer, shdr);
-			const char *section_name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-			if (strcmp(section_name, ".text") == 0) {
-				std::vector<uint8_t> intelRawGenBinary(reinterpret_cast<uint8_t*>(sectionData), 
-						reinterpret_cast<uint8_t*>(sectionData) + shdr->sh_size);
-				std::string blockAndInstOffsets = getBlockAndInstructionOffsets(intelRawGenBinary);
-				*os << blockAndInstOffsets;
-			}
-		}
-	}
-
-}
-
 
 //----------------------------------------------------------------------
 
diff --git a/src/lib/banal/Struct-Output.hpp b/src/lib/banal/Struct-Output.hpp
index c6602e66cc..2f354d7628 100644
--- a/src/lib/banal/Struct-Output.hpp
+++ b/src/lib/banal/Struct-Output.hpp
@@ -78,43 +78,6 @@ void printFileEnd(ostream *, FileInfo *);
 void printProc(ostream *, ostream *, string, FileInfo *, GroupInfo *,
 	       ProcInfo *, HPC::StringTable & strTab);
 
-void printBlockAndInstructionOffset(ostream * os, string file_name);
-
-static size_t
-file_size(int fd)
-{
-  struct stat sb;
-  int retval = fstat(fd, &sb);
-  if (retval == 0 && S_ISREG(sb.st_mode)) {
-    return sb.st_size;
-  }
-  return 0;
-}
-
-
-// Automatically restart short reads.
-// This protects against EINTR.
-//
-static size_t
-read_all(int fd, void *buf, size_t count)
-{
-  ssize_t ret;
-  size_t len;
-
-  len = 0;
-  while (len < count) {
-    ret = read(fd, ((char *) buf) + len, count - len);
-    if (ret == 0 || (ret < 0 && errno != EINTR)) {
-      break;
-    }
-    if (ret > 0) {
-      len += ret;
-    }
-  }
-
-  return len;
-}
-
 }  // namespace Output
 }  // namespace BAnal
 
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 8a598985e5..712de20fe5 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -584,10 +584,9 @@ makeStructure(string filename,
 #endif
 
   InputFile inputFile;
-	std::string inputFileType;
 
   // failure throws an error up the call chain
-  inputFile.openFile(filename, InputFileError_Error, &inputFileType);
+  inputFile.openFile(filename, InputFileError_Error);
   ElfFileVector * elfFileVector = inputFile.fileVector();
   string & sfilename = inputFile.fileName();
   const char * cfilename = inputFile.CfileName();
@@ -600,7 +599,6 @@ makeStructure(string filename,
 	
   for (uint i = 0; i < elfFileVector->size(); i++) {
     bool parsable = true;
-    parsable = false; // aaron
     ElfFile *elfFile = (*elfFileVector)[i];
 
     if (opts.show_time) {
@@ -621,8 +619,8 @@ makeStructure(string filename,
 
     Symtab * symtab = Inline::openSymtab(elfFile);
     if (symtab == NULL) {
-			continue;
-		}
+      continue;
+    }
     the_symtab = symtab;
     bool cuda_file = SYMTAB_ARCH_CUDA(symtab);
 
@@ -634,8 +632,8 @@ makeStructure(string filename,
     {
 #pragma omp for  schedule(dynamic, 1)
       for (uint i = 0; i < modVec.size(); i++) {
-	Module * mod = modVec[i];
-	mod->parseLineInformation();
+        Module * mod = modVec[i];
+        mod->parseLineInformation();
       }
     }  // end parallel
 
@@ -650,13 +648,13 @@ makeStructure(string filename,
     omp_set_num_threads(opts.jobs_parse);
 #endif
 
-		bool intel_file = elfFile->isIntelGPUFile();
+    bool intel_file = elfFile->isIntelGPUFile();
 
     if (cuda_file) { // don't run parseapi on cuda binary
       cuda_arch = elfFile->getArch();
       cubin_size = elfFile->getLength();
-      parsable = readCudaCFG(search_path, elfFile, the_symtab, 
-			      structOpts.compute_gpu_cfg, &code_src, &code_obj);
+      parsable = readCudaCFG(search_path, elfFile, the_symtab,
+        structOpts.compute_gpu_cfg, &code_src, &code_obj);
     } else if (intel_file) { // don't run parseapi on intel binary
       // TODO(Aaron): determine which generation of intel gpu it is
       intel_gpu_arch = 1;
@@ -719,12 +717,6 @@ makeStructure(string filename,
     // with try_lock(), there are interleavings where not all items
     // have been printed.
     printWorkList(wlPrint, num_done, outFile, gapsFile, gaps_filenm);
-	
-		// custom code for intel GPU elfs
-		if (intel_gpu_arch > 0) {
-      // TODO(Aaron): is it necessary to read binary again?
-    	Output::printBlockAndInstructionOffset(outFile, elfFileRealPath);
-		}
 
     Output::printLoadModuleEnd(outFile);
 
diff --git a/src/lib/binutils/ElfHelper.cpp b/src/lib/binutils/ElfHelper.cpp
index c3fdc47ac4..38ae9c268d 100644
--- a/src/lib/binutils/ElfHelper.cpp
+++ b/src/lib/binutils/ElfHelper.cpp
@@ -143,7 +143,6 @@ ElfFile::open
 
 ElfFile::~ElfFile() 
 {
-  // TODO(Keren): prevent memory leak
   if (origPtr != memPtr && origPtr != 0) free(origPtr);
   elf_end(elf);
 }
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index 7a91564c39..44632d8387 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -129,8 +129,7 @@ bool
 InputFile::openFile
 (
  std::string &filename,
- InputFileErrorType_t errType,
- std::string *fileType
+ InputFileErrorType_t errType
 )
 {
   const char *tag = 
diff --git a/src/lib/binutils/InputFile.hpp b/src/lib/binutils/InputFile.hpp
index 4814553820..cc3cc9d8f9 100644
--- a/src/lib/binutils/InputFile.hpp
+++ b/src/lib/binutils/InputFile.hpp
@@ -89,7 +89,7 @@ class InputFile {
 public:
   InputFile() { filevector = 0; }
   ~InputFile();
-  bool openFile(std::string &filename, InputFileErrorType_t errType, std::string *fileType);
+  bool openFile(std::string &filename, InputFileErrorType_t errType);
 
   std::string &fileName() { return filename; }
   const char *CfileName() { return filename.c_str(); }
diff --git a/src/lib/binutils/LM.cpp b/src/lib/binutils/LM.cpp
index 402851f1b2..bb200771d9 100644
--- a/src/lib/binutils/LM.cpp
+++ b/src/lib/binutils/LM.cpp
@@ -415,7 +415,7 @@ BinUtil::LM::open(const char* filenm)
 
   std::string file_name = std::string(filenm);
 
-  if (input_file.openFile(file_name, InputFileError_WarningNothrow, NULL)) {
+  if (input_file.openFile(file_name, InputFileError_WarningNothrow)) {
     // We only relocate individual cubins, with filevector size 1
     ElfFile *elf_file = (*input_file.fileVector())[0];
     if (isCubin(elf_file->getElf())) {
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index f23a0aaf12..20df3222c0 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -126,249 +126,6 @@ opencl_elf_section_type
   }
 }
 
-
-#define MAX_STR_SIZE 1024
-#define INDENT  "  "
-
-std::map<int32_t, bool> visitedBlockOffsets;
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-class Edge {
-  public:
-    int32_t from; 
-    int32_t to;
-    int32_t from_blockEndOffset;
-
-    Edge(int32_t f, int32_t t, int32_t from_b) {
-      from = f;
-      to = t;
-      from_blockEndOffset = from_b;
-    }
-
-    bool operator == (const Edge &that) const 
-    {
-      return((this->from == that.from) && (this->to == that.to));
-    }
-    
-    bool operator<(const Edge& that) const 
-    {
-      if (this->from == that.from) {
-        return (this->to < that.to);
-      } else {
-        return (this->from < that.from);
-      }
-    }
-};
-
-
-static std::set<Edge>
-get_cfg_edges
-(
-  std::vector<uint8_t> &binary
-)
-{
-  KernelView kv(IGA_GEN9, binary.data(), binary.size(),
-      iga::SWSB_ENCODE_MODE::SingleDistPipe);
-  size_t binary_size = binary.size();
-  std::set<Edge> cfg_edges;
-
-  std::vector<int32_t> block_offsets;
-  int32_t offset = 0;
-  int32_t size;
-  while (offset < binary_size) {
-    int32_t prev_block_start_offset;
-    int32_t prev_block_end_offset;
-    int32_t block_start_offset;
-    bool isStartOfBasicBlock = kv.isInstTarget(offset);
-    if (isStartOfBasicBlock) {
-      block_offsets.push_back(offset);
-      visitedBlockOffsets.insert({offset, false});
-      block_start_offset = offset;  
-    }
-    size = kv.getInstSize(offset);
-    while (!kv.isInstTarget(offset + size) && (offset + size < binary_size)) {
-      offset += size; 
-      size = kv.getInstSize(offset);
-      if (size == 0) {
-        // this is a weird edge case, what to do?
-        break;
-      }
-    }
-
-    int32_t *jump_targets = new int32_t[KV_MAX_TARGETS_PER_INSTRUCTION];
-    size_t jump_targets_count = kv.getInstTargets(offset, jump_targets);
-    int32_t next_block_start_offset = offset + size;
-    bool isFallThroughEdgeAdded = false;
-
-    for (size_t i = 0; i < jump_targets_count; i++) {
-      if (jump_targets[i] == next_block_start_offset) {
-        isFallThroughEdgeAdded = true;
-      } else if (jump_targets[i] == block_start_offset) {
-        if (block_offsets.size() >= 2) {
-          int32_t from = block_offsets[block_offsets.size() - 2];
-          int32_t from_blockEndOffset;
-          for (Edge edge: cfg_edges) {
-            if (edge.from == from && edge.to == block_start_offset) {
-              from_blockEndOffset  = edge.from_blockEndOffset;
-            }
-          } 
-          cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 2], block_start_offset, from_blockEndOffset));
-        }
-      }
-      cfg_edges.insert(Edge(block_start_offset, jump_targets[i], next_block_start_offset - size));
-    }
-    if(!isFallThroughEdgeAdded) {
-      cfg_edges.insert(Edge(block_start_offset, next_block_start_offset, next_block_start_offset - size));
-    }
-    prev_block_start_offset = block_start_offset;
-    prev_block_end_offset = offset; 
-    offset += size;
-  }
-  cfg_edges.insert(Edge(block_offsets[block_offsets.size() - 1], binary_size, binary_size - size));
-  return cfg_edges;
-}
-
-
-static void
-printCFGEdges
-(
-  std::set<Edge> &cfg_edges
-)
-{
-  for (Edge edge: cfg_edges) {
-    std::cout << edge.from << "->" << edge.to << std::endl; 
-  } 
-}
-
-
-static void
-printBasicBlocks
-(
-  std::vector<uint8_t> &binary,
-  std::set<Edge> &cfg_edges
-)
-{
-  KernelView kv(IGA_GEN9, binary.data(), binary.size(), iga::SWSB_ENCODE_MODE::SingleDistPipe);
-  int32_t offset;
-  char text[MAX_STR_SIZE] = { 0 };
-  size_t length;
-  int32_t size;
-
-  for (Edge edge: cfg_edges) {
-    offset = edge.from;
-    if(edge.from == edge.to) {
-      // skip self-loops
-      continue;
-    }
-    auto it = visitedBlockOffsets.find(offset);
-    if (it->second) {
-      continue;
-    } else {
-      it->second = true;
-    }
-    std::cout << offset << " [ label=\"\\\n"; 
-    while (offset < edge.to) {
-      size = kv.getInstSize(offset);
-      length = kv.getInstSyntax(offset, text, MAX_STR_SIZE);
-      assert(length > 0);
-      std::cout << offset << ": " << text << "\\\l";
-      offset += size;
-    }
-    std::cout << "\" shape=\"box\"]; \n" << std::endl;
-  } 
-}
-
-
-static std::vector<int32_t>
-getBlockOffsets
-(
-  std::vector<uint8_t> &binary
-)
-{
-  std::vector<int32_t> block_offsets;
-  int32_t offset = 0;
-  int32_t size = 0;
-  KernelView kv(IGA_GEN9, binary.data(), binary.size(),
-      iga::SWSB_ENCODE_MODE::SingleDistPipe);
-
-  while (offset < binary.size()) {
-    bool isStartOfBasicBlock = kv.isInstTarget(offset);
-    if (isStartOfBasicBlock) {
-      block_offsets.push_back(offset);
-    }
-    size = kv.getInstSize(offset);
-    offset += size; 
-  }
-  return block_offsets;
-}
-
-
-static void
-doIndent(std::stringstream *ss, int depth)
-{
-  for (int n = 1; n <= depth; n++) {
-    *ss << INDENT;
-  }
-}
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-// pass Intel kernel's raw gen binary
-// kernel's text region is a raw gen binary
-// you  can find kernel nested in [debug section of GPU binary/separate debug section dump]
-void
-printCFGInDotGraph
-(
-  std::vector<uint8_t> intelRawGenBinary
-)
-{
-  std::cout << "digraph GEMM_iga {" << std::endl;
-  std::set<Edge> edges = get_cfg_edges(intelRawGenBinary);
-  printBasicBlocks(intelRawGenBinary, edges);
-  printCFGEdges(edges);
-  std::cout << "}" << std::endl;
-}
-
-
-std::string
-getBlockAndInstructionOffsets
-(
- std::vector<uint8_t> &intelRawGenBinary
-)
-{
-  std::stringstream ss;
-  std::vector<int32_t> block_offsets = getBlockOffsets(intelRawGenBinary);
-  KernelView kv(IGA_GEN9, intelRawGenBinary.data(), intelRawGenBinary.size(),
-      iga::SWSB_ENCODE_MODE::SingleDistPipe);
-  int32_t offset, size;
-
-  for(auto i = 0; i < block_offsets.size()-1; i++) {
-    offset = block_offsets[i];
-    doIndent(&ss, 1);
-    ss << "<B o=\"0x" << std::hex << offset << "\">\n"; 
-    doIndent(&ss, 2);
-    while (offset != block_offsets[i+1]) {
-      ss << "<I o=\"0x" << std::hex << offset << "\"/>";
-      size = kv.getInstSize(offset);
-      offset += size;
-    }
-    ss << "\n"; 
-    doIndent(&ss, 1);
-    ss << "</B>\n"; 
-  }
-  return ss.str();
-}
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.hpp b/src/lib/binutils/intel/IntelGPUBinutils.hpp
index 7484f63a1d..3861ffb7e1 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.hpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.hpp
@@ -79,10 +79,4 @@ findIntelGPUBins
  ElfFileVector *filevector
 );
 
-std::string
-getBlockAndInstructionOffsets
-(
- std::vector<uint8_t> &intelRawGenBinary
-);
-
 #endif

From 46d3b4d225a1d51ef97241cbe10719676f7b59d8 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Sat, 26 Sep 2020 21:09:20 -0500
Subject: [PATCH 047/177] All consume/produce are properly matched,  
 gpu_trace_fini(NULL);

---
 .../hpcrun/gpu/gpu-activity-multiplexer.c     | 27 +++---
 src/tool/hpcrun/gpu/gpu-operation-channel.c   |  4 +-
 .../hpcrun/gpu/gpu-operation-item-process.c   | 42 ++++-----
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c   | 16 +---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h   |  5 +-
 src/tool/hpcrun/gpu/gpu-trace-channel.c       | 20 +++++
 src/tool/hpcrun/gpu/gpu-trace-channel.h       |  2 +
 src/tool/hpcrun/gpu/gpu-trace.c               | 85 ++++++++++++++-----
 src/tool/hpcrun/gpu/gpu-trace.h               |  2 +-
 src/tool/hpcrun/sample-sources/opencl.c       |  9 +-
 10 files changed, 134 insertions(+), 78 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 27d15bb024..c4984d14d7 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -52,6 +52,8 @@
 #include "gpu-activity-multiplexer.h"
 #include "gpu-monitoring-thread-api.h"
 #include "gpu-activity-process.h"
+#include "gpu-trace.h"
+
 #include "gpu-print.h"
 
 //TODO: Figure out how to get max number of application threads
@@ -69,8 +71,7 @@ typedef void *(*pthread_start_routine_t)(void *);
 
 static _Atomic(bool) stop_activity_flag;
 
-static atomic_uint operation_stream_counter;
-static atomic_uint operation_set_id;
+static atomic_uint stream_id;
 static __thread uint32_t my_operation_set_id = -1;
 static __thread gpu_operation_channel_t *gpu_operation_channel = NULL;
 static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
@@ -84,7 +85,7 @@ static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
 static void
 gpu_init_operation_channel(){
   // Create operation channel
-  my_operation_set_id = atomic_fetch_add(&operation_set_id, 1);
+  my_operation_set_id = atomic_fetch_add(&stream_id, 1);
   gpu_operation_channel = gpu_operation_channel_get();
   gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_set_id);
 }
@@ -98,18 +99,24 @@ void
 {
 
   while (!atomic_load(&stop_activity_flag)){
+    int current_stream_id = atomic_load(&stream_id);
 
-    for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
+    for (int set_index = 0; set_index < current_stream_id ; ++set_index) {
       gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
+
+      // TODO: change the waiting policy to get items when the channel is full
       gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
     }
   }
 
-  for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
+  int current_stream_id = atomic_load(&stream_id);
+  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
     gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
     gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
   }
 
+  gpu_trace_fini(NULL);
+
   return NULL;
 }
 
@@ -122,7 +129,7 @@ void
 {
   pthread_t thread;
   atomic_store(&stop_activity_flag, false);
-  atomic_store(&operation_set_id, 0);
+  atomic_store(&stream_id, 0);
 
   gpu_operation_channel_stack_alloc(max_threads_consumers);
   // You are the first to create monitor thread
@@ -167,12 +174,12 @@ void
 
   atomic_store(&stop_activity_flag, true);
 
-  for (int set_index = 0; set_index < atomic_load(&operation_set_id) ; ++set_index) {
+  int current_stream_id = atomic_load(&stream_id);
+  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
     gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, set_index);
   }
 
 
-//  while (atomic_load(&operation_stream_counter));
 }
 
 
@@ -186,8 +193,6 @@ gpu_activity_t *gpu_activity
   gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
   gpu_operation_channel_produce(gpu_operation_channel, &item);
 
-//  atomic_fetch_add(&operation_stream_counter, +1);
-
 }
 
 
@@ -197,7 +202,6 @@ gpu_operation_release
 gpu_operation_channel_t *channel
 )
 {
-  atomic_fetch_add(&operation_stream_counter, -1);
 }
 
 
@@ -207,7 +211,6 @@ gpu_activity_multiplexer_release
  void
 )
 {
-  atomic_fetch_add(&operation_stream_counter, -1);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index 90ed030cab..d2c566c20d 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -184,7 +184,7 @@ gpu_operation_channel_produce
   gpu_operation_item_t *new_item = gpu_operation_item_alloc(channel);
   *new_item = *it;
 
-  printf("\nPRODUCE: channel = %p || return_channel = %p -> activity = %p | corr = %u kind = %s, type = %s\n\n",
+  printf("\nOPERATION_PRODUCE: channel = %p || return_channel = %p -> activity = %p | corr = %u kind = %s, type = %s\n\n",
          channel, new_item->channel, &new_item->activity,
          (new_item->activity.kind == GPU_ACTIVITY_MEMCPY)?new_item->activity.details.memcpy.correlation_id:new_item->activity.details.kernel.correlation_id,
          gpu_kind_to_string(new_item->activity.kind),
@@ -216,7 +216,7 @@ gpu_operation_channel_consume
 
     if (!it) break;
 
-    printf("\n---------CONSUME: op_channel = %p || channel = %p , activity = %p | corr = %u, kind = %s, type = %s\n",
+    printf("\nOPERATION_CONSUME: op_channel = %p || channel = %p , activity = %p | corr = %u, kind = %s, type = %s\n",
            channel, it->channel, &it->activity,
            (it->activity.kind == GPU_ACTIVITY_MEMCPY)?it->activity.details.memcpy.correlation_id:it->activity.details.kernel.correlation_id,
            gpu_kind_to_string(it->activity.kind),
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index a7a14fe12f..8b26ab71a5 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -91,20 +91,6 @@ gpu_context_stream_trace
 }
 
 
-static void
-trace_item_set
-(
- gpu_trace_item_t *ti,
- uint64_t submit_time,
- uint64_t start_time,
- uint64_t end_time,
- gpu_activity_t *ga
-)
-{
-  gpu_trace_item_produce(ti, submit_time, start_time, end_time, ga->cct_node);
-}
-
-
 
 //******************************************************************************
 // gpu operations process
@@ -122,12 +108,16 @@ gpu_operation_item_t *it
   assert(activity->cct_node != NULL);
 
   gpu_trace_item_t entry_trace;
-  trace_item_set(&entry_trace, activity->details.memcpy.submit_time,
-                 activity->details.memcpy.start, activity->details.memcpy.end, activity);
 
-  gpu_context_stream_trace
-    (activity->details.memcpy.context_id, activity->details.memcpy.stream_id,
-     &entry_trace);
+  gpu_trace_item_produce(&entry_trace,
+                         activity->details.memcpy.submit_time,
+                         activity->details.memcpy.start,
+                         activity->details.memcpy.end,
+                         activity->cct_node);
+
+  gpu_context_stream_trace(activity->details.memcpy.context_id,
+                           activity->details.memcpy.stream_id,
+                           &entry_trace);
 
   gpu_activity_channel_produce(channel, activity);
 
@@ -147,12 +137,16 @@ gpu_operation_item_t *it
   gpu_activity_channel_t *channel = it->channel;
 
   gpu_trace_item_t entry_trace;
-  trace_item_set(&entry_trace, activity->details.kernel.submit_time,
-                 activity->details.kernel.start, activity->details.kernel.end, activity);
 
-  gpu_context_stream_trace
-    (activity->details.kernel.context_id, activity->details.kernel.stream_id,
-     &entry_trace);
+  gpu_trace_item_produce(&entry_trace,
+                         activity->details.kernel.submit_time,
+                         activity->details.kernel.start,
+                         activity->details.kernel.end,
+                         activity->cct_node);
+
+  gpu_context_stream_trace(activity->details.kernel.context_id,
+                           activity->details.kernel.stream_id,
+                           &entry_trace);
 
   gpu_activity_channel_produce(channel, activity);
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index c325b34f85..8c4f431de0 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -167,6 +167,7 @@ gpu_trace_channel_set_insert
   // initialize the new entry
   e->channel = channel;
 
+
   // clear the entry's next ptr
   channel_stack_elem_ptr_set(e, 0);
 
@@ -176,20 +177,11 @@ gpu_trace_channel_set_insert
 
 
 void
-gpu_trace_channel_set_consume
-(
- int set_index
-)
-{
-  gpu_trace_channel_set_forall(gpu_trace_channel_consume, set_index);
-}
-
-
-void
-gpu_trace_channel_set_release
+gpu_trace_channel_set_apply
 (
+ gpu_trace_channel_fn_t channel_fn,
  int set_index
 )
 {
-  gpu_trace_channel_set_forall(gpu_trace_stream_release, set_index);
+  gpu_trace_channel_set_forall(channel_fn, set_index);
 }
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index ed9ad59483..23119621ac 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -82,11 +82,12 @@ gpu_trace_channel_set_insert
 
 
 void
-gpu_trace_channel_set_consume
+gpu_trace_channel_set_apply
 (
+ gpu_trace_channel_fn_t channel_fn,
  int set_index
 );
 
-void gpu_trace_channel_set_release(int set_index);
+
 
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index c5a0807b1a..f5a8b5e4ef 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -150,6 +150,12 @@ gpu_trace_channel_get_td(gpu_trace_channel_t *ch)
   return ch->td;
 }
 
+int
+gpu_trace_channel_get_stream_id(gpu_trace_channel_t *ch)
+{
+  return ch->td->core_profile_trace_data.id;
+}
+
 
 gpu_trace_channel_t *
 gpu_trace_channel_alloc
@@ -184,6 +190,13 @@ gpu_trace_channel_produce
 
   *cti = *ti;
 
+  printf("\n===========TRACE_PRODUCE: ti = %p || submit = %lu, start = %lu, end = %lu, cct_node = %p\n\n",
+         ti,
+         ti->cpu_submit_time,
+         ti->start,
+         ti->end,
+         ti->call_path_leaf);
+
   channel_push(channel, bichannel_direction_forward, cti);
 
   gpu_trace_channel_signal_consumer_when_full(channel);
@@ -209,6 +222,13 @@ gpu_trace_channel_consume
   for (;;) {
     gpu_trace_item_t *ti = channel_pop(channel, bichannel_direction_forward);
     if (!ti) break;
+
+    printf("\n===========TRACE_CONSUME: ti = %p || submit = %lu, start = %lu, end = %lu, cct_node = %p\n\n",
+           ti,
+           ti->cpu_submit_time,
+           ti->start,
+           ti->end,
+           ti->call_path_leaf);
     gpu_trace_item_consume(consume_one_trace_item, channel->td, ti);
     gpu_trace_item_free(channel, ti);
   }
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.h b/src/tool/hpcrun/gpu/gpu-trace-channel.h
index 609a5d6068..82ce178a20 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.h
@@ -68,6 +68,8 @@ typedef struct gpu_trace_channel_t gpu_trace_channel_t;
 thread_data_t *
 gpu_trace_channel_get_td(gpu_trace_channel_t *ch);
 
+int
+gpu_trace_channel_get_stream_id(gpu_trace_channel_t *ch);
 
 gpu_trace_channel_t *
 gpu_trace_channel_alloc
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 57c57d6b7e..5684577eb8 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -95,7 +95,6 @@
 typedef struct gpu_trace_t {
   pthread_t thread;
   gpu_trace_channel_t *trace_channel;
-  unsigned int channel_set_id;
 } gpu_trace_t;
 
 
@@ -109,11 +108,14 @@ typedef void *(*pthread_start_routine_t)(void *);
 
 static _Atomic(bool) stop_trace_flag;
 
-static atomic_ullong stream_counter;
+static atomic_ullong active_streams_counter;
 
 static atomic_ullong stream_id;
 
 
+static __thread uint32_t my_trace_set_id = -1;
+
+
 static __thread uint64_t stream_start = 0;
 
 
@@ -312,17 +314,37 @@ gpu_trace_activities_process
  int set_index
 )
 {
-  gpu_trace_channel_set_consume(set_index);
+  gpu_trace_channel_set_apply(gpu_trace_channel_consume, set_index);
+}
+
+
+static void
+gpu_trace_channel_set_release
+(
+int set_index
+)
+{
+  gpu_trace_channel_set_apply(gpu_trace_stream_release, set_index);
+}
+
+
+static void
+gpu_trace_channel_set_notify
+(
+int set_index
+)
+{
+  gpu_trace_channel_set_apply(gpu_trace_channel_signal_consumer, set_index);
 }
 
 
 static void
-gpu_trace_activities_await
+gpu_trace_channel_set_await
 (
- gpu_trace_t* thread_args
+int set_index
 )
 {
-  gpu_trace_channel_await(thread_args->trace_channel);
+  gpu_trace_channel_set_apply(gpu_trace_channel_await, set_index);
 }
 
 
@@ -366,7 +388,7 @@ gpu_trace_stream_release
 
   hpcrun_write_profile_data(&td->core_profile_trace_data);
   hpcrun_trace_close(&td->core_profile_trace_data);
-  atomic_fetch_add(&stream_counter, -1);
+  atomic_fetch_add(&active_streams_counter, -1);
 
 }
 
@@ -382,7 +404,7 @@ gpu_trace_init
 )
 {
   atomic_store(&stop_trace_flag, false);
-  atomic_store(&stream_counter, 0);
+  atomic_store(&active_streams_counter, 0);
   atomic_store(&stream_id, 0);
 }
 
@@ -390,17 +412,28 @@ gpu_trace_init
 void *
 gpu_trace_record
 (
- gpu_trace_t *thread_args
+ void
 )
 {
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
-    gpu_trace_activities_process(thread_args->channel_set_id);
-    gpu_trace_activities_await(thread_args);
+
+    int current_stream_id = atomic_load(&stream_id);
+
+    for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+      gpu_trace_activities_process(set_index);
+      gpu_trace_channel_set_await(set_index);
+    }
+
+  }
+
+  int current_stream_id = atomic_load(&stream_id);
+  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+    gpu_trace_activities_process(set_index);
+    gpu_trace_channel_set_await(set_index);
+    gpu_trace_channel_set_release(set_index);
   }
-  gpu_trace_activities_process(thread_args->channel_set_id);
-  gpu_trace_channel_set_release(thread_args->channel_set_id);
 
   return NULL;
 }
@@ -416,9 +449,15 @@ gpu_trace_fini
 
   atomic_store(&stop_trace_flag, true);
 
-  gpu_context_stream_map_signal_all();
+  int current_stream_id = atomic_load(&stream_id);
+  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+    gpu_trace_channel_set_notify(set_index);
+  }
 
-  while (atomic_load(&stream_counter));
+//  gpu_context_stream_map_signal_all();
+
+
+  while (atomic_load(&active_streams_counter));
 }
 
 static void
@@ -435,26 +474,28 @@ gpu_trace_channel_set_append
   static int num_streams = 0;
   volatile bool new_thread = false;
 
-  gpu_trace_channel_stack_alloc(max_threads_consumers);
-
   num_streams++;
-  atomic_fetch_add(&stream_counter, 1);
+  atomic_fetch_add(&active_streams_counter, 1);
 
   if (num_streams >= (streams_per_thread * num_threads)) {
     num_threads++;
     new_thread = true;
+    gpu_trace_channel_stack_alloc(max_threads_consumers);
   }
 
   assert(streams_per_thread > 0);
   assert(num_threads < max_threads_consumers);
 
-  trace->channel_set_id = num_threads - 1;
-  gpu_trace_channel_set_insert(trace->trace_channel, trace->channel_set_id);
-
   if (new_thread) {
-    pthread_create(&trace->thread, NULL, (pthread_start_routine_t) gpu_trace_record, trace);
+    pthread_create(&trace->thread, NULL, (pthread_start_routine_t) gpu_trace_record, NULL);
   }
 
+  my_trace_set_id = num_threads - 1;
+
+  my_trace_set_id = gpu_trace_channel_get_stream_id(trace->trace_channel) - 500;
+
+  gpu_trace_channel_set_insert(trace->trace_channel, my_trace_set_id);
+
   PRINT("set_index = %d -> stream = %u\n", num_threads, num_streams);
 
 }
diff --git a/src/tool/hpcrun/gpu/gpu-trace.h b/src/tool/hpcrun/gpu/gpu-trace.h
index 8ee6cb70f8..b86720f56f 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.h
+++ b/src/tool/hpcrun/gpu/gpu-trace.h
@@ -103,7 +103,7 @@ gpu_trace_create
 void *
 gpu_trace_record
 (
- gpu_trace_t *thread_args
+ void
 );
 
 
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index 581afa112f..4d526b3798 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -155,12 +155,15 @@ METHOD_FN(finalize_event_list)
   }
   #endif
   opencl_api_initialize();
+
+
+//  // Register shutdown functions to write trace files
+//  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
+//  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
+
   device_finalizer_shutdown.fn = opencl_api_finalize;
   device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_shutdown);
 
-  // Register shutdown functions to write trace files
-  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
-  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
 }
 
 

From e442df7ab2d84c99b0217a7c2295833505bc7661 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sun, 27 Sep 2020 03:00:42 +0000
Subject: [PATCH 048/177] Fix opencl apis

---
 src/tool/hpcrun/Makefile.am                   |   4 +-
 src/tool/hpcrun/Makefile.in                   |  44 +-
 ...trumentation.c => gtpin-instrumentation.c} |  15 +-
 ...trumentation.h => gtpin-instrumentation.h} |  30 +-
 .../gpu/opencl/opencl-activity-translate.c    |   4 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 565 ++++++++----------
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  41 +-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c |   6 +-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |   6 -
 9 files changed, 291 insertions(+), 424 deletions(-)
 rename src/tool/hpcrun/gpu/instrumentation/{opencl-instrumentation.c => gtpin-instrumentation.c} (98%)
 rename src/tool/hpcrun/gpu/instrumentation/{opencl-instrumentation.h => gtpin-instrumentation.h} (80%)

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index d1f08bf483..ada1e016b5 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -507,8 +507,8 @@ endif
 
 if OPT_ENABLE_OPENCL
 MY_OPENCL_FILES = sample-sources/opencl.c \
-	gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-api.c \
+	gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c 
 endif
@@ -517,7 +517,7 @@ if OPT_ENABLE_GTPIN
 MY_GTPIN_FILES = \
 	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-	gpu/instrumentation/opencl-instrumentation.c
+	gpu/instrumentation/gtpin-instrumentation.c
 endif
 
 if OPT_ENABLE_ROCM
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index e553ba16d5..3e245edecc 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -539,12 +539,12 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-command-process.c \
 	gpu/level0/level0-data-node.c gpu/level0/level0-event-map.c \
 	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
-	gpu/opencl/opencl-intercept.c gpu/opencl/opencl-api.c \
+	gpu/opencl/opencl-api.c gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-	gpu/instrumentation/opencl-instrumentation.c \
+	gpu/instrumentation/gtpin-instrumentation.c \
 	unwind/common/backtrace.c unwind/common/unw-throw.c \
 	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
 	unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
@@ -746,14 +746,14 @@ am__objects_35 =
 @OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = $(am__objects_38)
 @OPT_ENABLE_OPENCL_TRUE@am__objects_40 =  \
 @OPT_ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo
 @OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
 am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
 	unwind/common/libhpcrun_la-unw-throw.lo
@@ -1877,15 +1877,15 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cupti-gpu-api.c		
 
 @OPT_ENABLE_OPENCL_TRUE@MY_OPENCL_FILES = sample-sources/opencl.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c 
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/opencl-instrumentation.c
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation.c
 
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
@@ -2823,11 +2823,11 @@ gpu/opencl/$(am__dirstamp):
 gpu/opencl/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) gpu/opencl/$(DEPDIR)
 	@: > gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/opencl/libhpcrun_la-opencl-intercept.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
@@ -2846,7 +2846,7 @@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo:  \
 gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo:  \
 	gpu/instrumentation/$(am__dirstamp) \
 	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo:  \
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo:  \
 	gpu/instrumentation/$(am__dirstamp) \
 	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_la-backtrace.lo:  \
@@ -3745,7 +3745,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-list-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-process.Plo@am__quote@
@@ -5403,13 +5403,6 @@ sample-sources/libhpcrun_la-opencl.lo: sample-sources/opencl.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
 
-gpu/opencl/libhpcrun_la-opencl-intercept.lo: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-intercept.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_la-opencl-intercept.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-
 gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-api.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo
@@ -5417,6 +5410,13 @@ gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
 
+gpu/opencl/libhpcrun_la-opencl-intercept.lo: gpu/opencl/opencl-intercept.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-intercept.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_la-opencl-intercept.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
+
 gpu/opencl/libhpcrun_la-opencl-memory-manager.lo: gpu/opencl/opencl-memory-manager.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-memory-manager.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo
@@ -5445,12 +5445,12 @@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo: gpu
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
 
-gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo: gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/opencl-instrumentation.c' object='gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo: gpu/instrumentation/gtpin-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo `test -f 'gpu/instrumentation/gtpin-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo `test -f 'gpu/instrumentation/gtpin-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation.c
 
 unwind/common/libhpcrun_la-backtrace.lo: unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT unwind/common/libhpcrun_la-backtrace.lo -MD -MP -MF unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo -c -o unwind/common/libhpcrun_la-backtrace.lo `test -f 'unwind/common/backtrace.c' || echo '$(srcdir)/'`unwind/common/backtrace.c
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
similarity index 98%
rename from src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
rename to src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 06519445cf..0ad76b53f9 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -81,8 +81,7 @@
 #include <lib/prof-lean/crypto-hash.h>
 #include <lib/prof-lean/spinlock.h>
 
-#include "opencl-instrumentation.h"
-
+#include "gtpin-instrumentation.h"
 
 
 //******************************************************************************
@@ -443,13 +442,14 @@ onKernelComplete
     uint64_t execution_count = total; // + bm->val 
     //block_map_insert1(data.block_map_root, block->offset, execution_count);
 
-    activityNotify();  
     gpu_activity_t gpu_activity;
     kernelBlockActivityProcess(&gpu_activity, correlation_id, data.loadmap_module_id, block->offset, execution_count);
     block = block->next;
     //how to make offset the primary key within the cct and += the execution value for existing ccts?
   }
 
+  activityNotify();  
+
   ++(data.call_count);
 }
 
@@ -461,7 +461,7 @@ onKernelComplete
 
 
 void
-opencl_enable_profiling
+gtpin_enable_profiling
 (
  void
 )
@@ -470,10 +470,11 @@ opencl_enable_profiling
   initializeInstrumentation();
   knobAddBool("silent_warnings", true);
 
-  /*if (utils::GetEnv("PTI_GEN12") != nullptr) {
-    std::cout << "[INFO] Experimental GTPin mode: GEN12" << std::endl;
+#if 0
+  if (utils::GetEnv("PTI_GEN12") != nullptr) {
     KnobAddBool("gen12_1", true);
-    }*/
+  }
+#endif
 
   GTPin_OnKernelBuild(onKernelBuild, NULL);
   GTPin_OnKernelRun(onKernelRun, NULL);
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
similarity index 80%
rename from src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
rename to src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
index 53dee81047..2a9081523c 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
@@ -48,40 +48,12 @@
 #include "gtpin-instrumentation-kernel-memory-map.h"
 #include "gtpin-instrumentation-kernel-data-map.h"
 
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-// TODO(Aaron): Why it starts with a _? 
-typedef struct _SProgramDebugDataHeaderIGC
-{
- uint32_t Magic;
- uint32_t Version;
- uint32_t Size;
- uint32_t Device;
- uint32_t SteppingId;
- uint32_t GPUPointerSizeInBytes;
- uint32_t NumberOfKernels;
-} SProgramDebugDataHeaderIGC;
-
-
-typedef struct _SKernelDebugDataHeaderIGC
-{
- uint32_t KernelNameSize;
- uint32_t SizeVisaDbgInBytes;
- uint32_t SizeGenIsaDbgInBytes;
-} SKernelDebugDataHeaderIGC;
-
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
 void
-opencl_enable_profiling
+gtpin_enable_profiling
 (
  void
 );
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index d6cc192fba..555ad5b0cd 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -91,7 +91,7 @@ convert_kernel_launch
 {
   cl_kernel_callback_t *kernel_cb_data = (cl_kernel_callback_t*)user_data;
   memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
-  getTimingInfoFromClEvent(&ga->details.interval, event);
+  opencl_timing_info_get(&ga->details.interval, event);
   ga->kind = GPU_ACTIVITY_KERNEL;
   ga->details.kernel.correlation_id = kernel_cb_data->correlation_id;
 }
@@ -108,7 +108,7 @@ convert_memcpy
 {
   cl_memory_callback_t *memory_cb_data = (cl_memory_callback_t*)user_data;
   memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
-  getTimingInfoFromClEvent(&ga->details.interval, event);
+  opencl_timing_info_get(&ga->details.interval, event);
   getMemoryProfileInfo(&ga->details.memcpy, memory_cb_data);
   ga->kind = GPU_ACTIVITY_MEMCPY;
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 0479a514d3..ebd5edfdd0 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -88,112 +88,99 @@
 
 #define CPU_NANOTIME() (usec_time() * 1000)
 
-#define FORALL_OPENCL_ERRORS(macro)					\
-  macro(CL_SUCCESS)							\
-  macro(CL_DEVICE_NOT_FOUND)						\
-  macro(CL_DEVICE_NOT_AVAILABLE)					\
-  macro(CL_COMPILER_NOT_AVAILABLE)					\
-  macro(CL_MEM_OBJECT_ALLOCATION_FAILURE)				\
-  macro(CL_OUT_OF_RESOURCES)						\
-  macro(CL_OUT_OF_HOST_MEMORY)						\
-  macro(CL_PROFILING_INFO_NOT_AVAILABLE)				\
-  macro(CL_MEM_COPY_OVERLAP)						\
-  macro(CL_IMAGE_FORMAT_MISMATCH)					\
-  macro(CL_IMAGE_FORMAT_NOT_SUPPORTED)					\
-  macro(CL_BUILD_PROGRAM_FAILURE)					\
-  macro(CL_MAP_FAILURE)							\
-  macro(CL_MISALIGNED_SUB_BUFFER_OFFSET)				\
-  macro(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)			\
-  macro(CL_COMPILE_PROGRAM_FAILURE)					\
-  macro(CL_LINKER_NOT_AVAILABLE)					\
-  macro(CL_LINK_PROGRAM_FAILURE)					\
-  macro(CL_DEVICE_PARTITION_FAILED)					\
-  macro(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)				\
-  macro(CL_INVALID_VALUE)						\
-  macro(CL_INVALID_DEVICE_TYPE)						\
-  macro(CL_INVALID_PLATFORM)						\
-  macro(CL_INVALID_DEVICE)						\
-  macro(CL_INVALID_CONTEXT)						\
-  macro(CL_INVALID_QUEUE_PROPERTIES)					\
-  macro(CL_INVALID_COMMAND_QUEUE)					\
-  macro(CL_INVALID_HOST_PTR)						\
-  macro(CL_INVALID_MEM_OBJECT)						\
-  macro(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)				\
-  macro(CL_INVALID_IMAGE_SIZE)						\
-  macro(CL_INVALID_SAMPLER)						\
-  macro(CL_INVALID_BINARY)						\
-  macro(CL_INVALID_BUILD_OPTIONS)					\
-  macro(CL_INVALID_PROGRAM)						\
-  macro(CL_INVALID_PROGRAM_EXECUTABLE)					\
-  macro(CL_INVALID_KERNEL_NAME)						\
-  macro(CL_INVALID_KERNEL_DEFINITION)					\
-  macro(CL_INVALID_KERNEL)						\
-  macro(CL_INVALID_ARG_INDEX)						\
-  macro(CL_INVALID_ARG_VALUE)						\
-  macro(CL_INVALID_ARG_SIZE)						\
-  macro(CL_INVALID_KERNEL_ARGS)						\
-  macro(CL_INVALID_WORK_DIMENSION)					\
-  macro(CL_INVALID_WORK_GROUP_SIZE)					\
-  macro(CL_INVALID_WORK_ITEM_SIZE)					\
-  macro(CL_INVALID_GLOBAL_OFFSET)					\
-  macro(CL_INVALID_EVENT_WAIT_LIST)					\
-  macro(CL_INVALID_EVENT)						\
-  macro(CL_INVALID_OPERATION)						\
-  macro(CL_INVALID_GL_OBJECT)						\
-  macro(CL_INVALID_BUFFER_SIZE)						\
-  macro(CL_INVALID_MIP_LEVEL)						\
-  macro(CL_INVALID_GLOBAL_WORK_SIZE)					\
-  macro(CL_INVALID_PROPERTY)						\
-  macro(CL_INVALID_IMAGE_DESCRIPTOR)					\
-  macro(CL_INVALID_COMPILER_OPTIONS)					\
-  macro(CL_INVALID_LINKER_OPTIONS)					\
+#define FORALL_OPENCL_ERRORS(macro)          \
+  macro(CL_SUCCESS)              \
+  macro(CL_DEVICE_NOT_FOUND)            \
+  macro(CL_DEVICE_NOT_AVAILABLE)          \
+  macro(CL_COMPILER_NOT_AVAILABLE)          \
+  macro(CL_MEM_OBJECT_ALLOCATION_FAILURE)        \
+  macro(CL_OUT_OF_RESOURCES)            \
+  macro(CL_OUT_OF_HOST_MEMORY)            \
+  macro(CL_PROFILING_INFO_NOT_AVAILABLE)        \
+  macro(CL_MEM_COPY_OVERLAP)            \
+  macro(CL_IMAGE_FORMAT_MISMATCH)          \
+  macro(CL_IMAGE_FORMAT_NOT_SUPPORTED)          \
+  macro(CL_BUILD_PROGRAM_FAILURE)          \
+  macro(CL_MAP_FAILURE)              \
+  macro(CL_MISALIGNED_SUB_BUFFER_OFFSET)        \
+  macro(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)      \
+  macro(CL_COMPILE_PROGRAM_FAILURE)          \
+  macro(CL_LINKER_NOT_AVAILABLE)          \
+  macro(CL_LINK_PROGRAM_FAILURE)          \
+  macro(CL_DEVICE_PARTITION_FAILED)          \
+  macro(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)        \
+  macro(CL_INVALID_VALUE)            \
+  macro(CL_INVALID_DEVICE_TYPE)            \
+  macro(CL_INVALID_PLATFORM)            \
+  macro(CL_INVALID_DEVICE)            \
+  macro(CL_INVALID_CONTEXT)            \
+  macro(CL_INVALID_QUEUE_PROPERTIES)          \
+  macro(CL_INVALID_COMMAND_QUEUE)          \
+  macro(CL_INVALID_HOST_PTR)            \
+  macro(CL_INVALID_MEM_OBJECT)            \
+  macro(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)        \
+  macro(CL_INVALID_IMAGE_SIZE)            \
+  macro(CL_INVALID_SAMPLER)            \
+  macro(CL_INVALID_BINARY)            \
+  macro(CL_INVALID_BUILD_OPTIONS)          \
+  macro(CL_INVALID_PROGRAM)            \
+  macro(CL_INVALID_PROGRAM_EXECUTABLE)          \
+  macro(CL_INVALID_KERNEL_NAME)            \
+  macro(CL_INVALID_KERNEL_DEFINITION)          \
+  macro(CL_INVALID_KERNEL)            \
+  macro(CL_INVALID_ARG_INDEX)            \
+  macro(CL_INVALID_ARG_VALUE)            \
+  macro(CL_INVALID_ARG_SIZE)            \
+  macro(CL_INVALID_KERNEL_ARGS)            \
+  macro(CL_INVALID_WORK_DIMENSION)          \
+  macro(CL_INVALID_WORK_GROUP_SIZE)          \
+  macro(CL_INVALID_WORK_ITEM_SIZE)          \
+  macro(CL_INVALID_GLOBAL_OFFSET)          \
+  macro(CL_INVALID_EVENT_WAIT_LIST)          \
+  macro(CL_INVALID_EVENT)            \
+  macro(CL_INVALID_OPERATION)            \
+  macro(CL_INVALID_GL_OBJECT)            \
+  macro(CL_INVALID_BUFFER_SIZE)            \
+  macro(CL_INVALID_MIP_LEVEL)            \
+  macro(CL_INVALID_GLOBAL_WORK_SIZE)          \
+  macro(CL_INVALID_PROPERTY)            \
+  macro(CL_INVALID_IMAGE_DESCRIPTOR)          \
+  macro(CL_INVALID_COMPILER_OPTIONS)          \
+  macro(CL_INVALID_LINKER_OPTIONS)          \
   macro(CL_INVALID_DEVICE_PARTITION_COUNT)
 
-#define FORALL_OPENCL_CALLS(macro)					\
-  macro(memcpy_H2D)							\
-  macro(memcpy_D2H)							\
+#define FORALL_OPENCL_CALLS(macro)          \
+  macro(memcpy_H2D)              \
+  macro(memcpy_D2H)              \
   macro(kernel)
 
 #define CODE_TO_STRING(e) case e: return #e;
 
 #define opencl_path() "libOpenCL.so"
 
-#define FORALL_OPENCL_ROUTINES(macro)					\
-  macro(clBuildProgram)					\
-  macro(clCreateProgramWithSource)					\
-  macro(clCreateCommandQueue)					\
-  macro(clEnqueueNDRangeKernel)					\
-  macro(clEnqueueReadBuffer)					\
-  macro(clEnqueueWriteBuffer)					\
-  macro(clGetEventProfilingInfo)					\
-  macro(clReleaseEvent)							\
+#define FORALL_OPENCL_ROUTINES(macro)          \
+  macro(clBuildProgram)          \
+  macro(clCreateProgramWithSource)          \
+  macro(clCreateCommandQueue)          \
+  macro(clEnqueueNDRangeKernel)          \
+  macro(clEnqueueReadBuffer)          \
+  macro(clEnqueueWriteBuffer)          \
+  macro(clGetEventProfilingInfo)          \
+  macro(clReleaseEvent)              \
   macro(clSetEventCallback)
 
 #define OPENCL_FN_NAME(f) DYN_FN_NAME(f)
 
-#define OPENCL_FN(fn, args)			\
+#define OPENCL_FN(fn, args)      \
   static cl_int (*OPENCL_FN_NAME(fn)) args
 
-#define OPENCL_PROGRAM_FN(fn, args)			\
+#define OPENCL_PROGRAM_FN(fn, args)      \
   static cl_program (*OPENCL_FN_NAME(fn)) args
 
 #define HPCRUN_OPENCL_CALL(fn, args) (OPENCL_FN_NAME(fn) args)
 
 #define LINE_TABLE_FLAG " -gline-tables-only "
 
-
-/*
-#define HPCRUN_OPENCL_CALL(fn, args)								\
-  {																									\
-    cl_int status = OPENCL_FN_NAME(fn) args;				\
-    if (status != CL_SUCCESS) {											\
-      ETMSG(OPENCL, "opencl call failed: %s",				\
-	    opencl_error_report(status));									\
-    }																								\
-  }
-*/
-
-
 //******************************************************************************
 // local data
 //******************************************************************************
@@ -206,12 +193,12 @@ OPENCL_FN
 (
   clBuildProgram, 
   (
-	 cl_program program,
-	 cl_uint num_devices,
-	 const cl_device_id* device_list,
-	 const char* options,
-	 void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
-	 void* user_data
+   cl_program program,
+   cl_uint num_devices,
+   const cl_device_id* device_list,
+   const char* options,
+   void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+   void* user_data
   )
 );
 
@@ -220,11 +207,11 @@ OPENCL_PROGRAM_FN
 (
   clCreateProgramWithSource, 
   (
-	 cl_context context,
-	 cl_uint count,
-	 const char** strings,
-	 const size_t* lengths,
-	 cl_int* errcode_ret
+   cl_context context,
+   cl_uint count,
+   const char** strings,
+   const size_t* lengths,
+   cl_int* errcode_ret
   )
 );
 
@@ -232,12 +219,12 @@ OPENCL_PROGRAM_FN
 OPENCL_FN
 (
   clCreateCommandQueue, 
-	(
-	 cl_context,
-	 cl_device_id,
-	 cl_command_queue_properties,
-	 cl_int*
-	)
+  (
+   cl_context,
+   cl_device_id,
+   cl_command_queue_properties,
+   cl_int*
+  )
 );
 
 
@@ -245,15 +232,15 @@ OPENCL_PROGRAM_FN
 (
   clEnqueueNDRangeKernel, 
   (
-	 cl_command_queue,
-	 cl_kernel,
-	 cl_uint,
-	 const size_t *, 
-	 const size_t *,
-	 const size_t *,
-	 cl_uint,
-	 const cl_event *,
-	 cl_event *
+   cl_command_queue,
+   cl_kernel,
+   cl_uint,
+   const size_t *, 
+   const size_t *,
+   const size_t *,
+   cl_uint,
+   const cl_event *,
+   cl_event *
   )
 );
 
@@ -262,15 +249,15 @@ OPENCL_FN
 (
   clEnqueueReadBuffer, 
   (
-	 cl_command_queue,
-	 cl_mem,
-	 cl_bool,
-	 size_t,
-	 size_t,
-	 void *,
-	 cl_uint,
-	 const cl_event *,
-	 cl_event *
+   cl_command_queue,
+   cl_mem,
+   cl_bool,
+   size_t,
+   size_t,
+   void *,
+   cl_uint,
+   const cl_event *,
+   cl_event *
   )
 );
 
@@ -279,15 +266,15 @@ OPENCL_FN
 (
   clEnqueueWriteBuffer, 
   (
-	 cl_command_queue,
-	 cl_mem,
-	 cl_bool,
-	 size_t,
-	 size_t,
-	 const void *,
-	 cl_uint,
-	 const cl_event *,
-	 cl_event *
+   cl_command_queue,
+   cl_mem,
+   cl_bool,
+   size_t,
+   size_t,
+   const void *,
+   cl_uint,
+   const cl_event *,
+   cl_event *
   )
 );
 
@@ -296,11 +283,11 @@ OPENCL_FN
 (
   clGetEventProfilingInfo,
   (
-    cl_event event,
-    cl_profiling_info param_name,
-    size_t param_value_size,
-    void *param_value,
-    size_t *param_value_size_ret
+   cl_event event,
+   cl_profiling_info param_name,
+   size_t param_value_size,
+   void *param_value,
+   size_t *param_value_size_ret
   )
 );
 
@@ -309,7 +296,7 @@ OPENCL_FN
 (
   clReleaseEvent, 
   (
-    cl_event event
+   cl_event event
   )
 );
 
@@ -318,11 +305,11 @@ OPENCL_FN
 (
   clSetEventCallback,
   (
-    cl_event event,
-    cl_int command_exec_callback_type,
-    void (CL_CALLBACK *pfn_notify)
-    (cl_event event, cl_int event_command_status, void *user_data),
-    void *user_data
+   cl_event event,
+   cl_int command_exec_callback_type,
+   void (CL_CALLBACK *pfn_notify)
+   (cl_event event, cl_int event_command_status, void *user_data),
+   void *user_data
   )
 );
 
@@ -330,12 +317,9 @@ OPENCL_FN
 static atomic_ullong opencl_pending_operations;
 static atomic_long correlation_id;
 
-
 #define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
 #define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
-
-
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -343,7 +327,7 @@ static atomic_long correlation_id;
 static uint64_t
 getCorrelationId
 (
-  void
+ void
 )
 {
   return atomic_fetch_add(&correlation_id, 1);
@@ -353,8 +337,8 @@ getCorrelationId
 static void
 initializeKernelCallBackInfo
 (
-  cl_kernel_callback_t *kernel_cb,
-  uint64_t correlation_id
+ cl_kernel_callback_t *kernel_cb,
+ uint64_t correlation_id
 )
 {
   kernel_cb->correlation_id = correlation_id;
@@ -365,10 +349,10 @@ initializeKernelCallBackInfo
 static void
 initializeMemoryCallBackInfo
 (
-  cl_memory_callback_t *mem_transfer_cb,
-  uint64_t correlation_id,
-  size_t size,
-  bool fromHostToDevice
+ cl_memory_callback_t *mem_transfer_cb,
+ uint64_t correlation_id,
+ size_t size,
+ bool fromHostToDevice
 )
 {
   mem_transfer_cb->correlation_id = correlation_id;
@@ -379,92 +363,21 @@ initializeMemoryCallBackInfo
 }
 
 
-#if 0
-static char*
-getKernelNameFromSourceCode
-(
-	const char *kernelSourceCode
-)
-{
-	char *kernelCode_copy = (char*)hpcrun_malloc(sizeof(kernelSourceCode));
-	strcpy(kernelCode_copy, kernelSourceCode);
-	char *token = strtok(kernelCode_copy, " ");
-	while (token != NULL) {
-		if (strcmp(token, "void") == 0) { // not searching for kernel because "supported\n#endif\nkernel"
-			token = strtok(NULL, " ");
-			printf("kernel name: %s", token);
-			return token;
-		}
-		token = strtok(NULL, " ");
-	}
-	return NULL;
-}
-#endif
-
-
 // we are dumping the debuginfo since the binary does not have debugsection
 static void
 clBuildProgramCallback
 (
-	cl_program program,
-	void* user_data
+ cl_program program,
+ void* user_data
 )
 {
-#if 0
-  // TODO(Aaron): where do you get device_count?
-  int device_count = 1;
-  cl_int status = CL_SUCCESS;
-
-  // binary
-  size_t *binary_size = (size_t *)malloc(device_count * sizeof(size_t));
-  status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, device_count * sizeof(size_t), binary_size, NULL);
-  assert(status == CL_SUCCESS);
-
-  char **binary = (char **)malloc(device_count * sizeof(char *));
-  for (size_t i = 0; i < device_count; ++i) {
-    binary[i] = (char *)malloc(binary_size[i] * sizeof(char));
-  }
-
-  status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, device_count * sizeof(char *), binary, NULL);
-  assert(status == CL_SUCCESS);
-
-  // debug info
-  size_t *debug_info_size = (size_t *)malloc(device_count * sizeof(size_t));
-  status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_SIZES_INTEL, device_count * sizeof(size_t), debug_info_size, NULL);
-  assert(status == CL_SUCCESS);
-
-  char **debug_info = (char **)malloc(device_count * sizeof(char *));
-  for (size_t i = 0; i < device_count; ++i) {
-    debug_info[i] = (char *)malloc(debug_info_size[i] * sizeof(char));
-  }
-
-  status = clGetProgramInfo(program, CL_PROGRAM_DEBUG_INFO_INTEL, device_count * sizeof(char *), debug_info, NULL);
-  assert(status == CL_SUCCESS);
-
-  // TODO(Aaron): Is it ok to only write binary 0?
-  // write binaries and add them to load map
-  for (size_t i = 0; i < device_count; ++i) {
-    writeHashBinary(binary[i], binary_size[i], false);
-    writeHashBinary(debug_info[i], debug_info_size[i], true); 
-  }
-
-  // free memory
-  for (size_t i = 0; i < device_count; ++i) {
-    free(binary[i]);
-    free(debug_info[i]);
-  }
-  free(binary_size);
-  free(debug_info_size);
-
-  ETMSG(OPENCL, "Intel GPU files dumped successfully");
-#endif
 }
 
 
 static void
 opencl_pending_operations_adjust
 (
-  int value
+ int value
 )
 {
   atomic_fetch_add(&opencl_pending_operations, value);
@@ -474,7 +387,7 @@ opencl_pending_operations_adjust
 static void
 opencl_activity_completion_notify
 (
-  void
+ void
 )
 {
   gpu_monitoring_thread_activities_ready();
@@ -484,8 +397,8 @@ opencl_activity_completion_notify
 static void
 opencl_activity_process
 (
-  cl_event event,
-  void *user_data
+ cl_event event,
+ void *user_data
 )
 {
   gpu_activity_t gpu_activity;
@@ -501,7 +414,7 @@ opencl_wait_for_pending_operations
 )
 {
   ETMSG(OPENCL, "pending operations: %lu", 
-	atomic_load(&opencl_pending_operations));
+  atomic_load(&opencl_pending_operations));
   while (atomic_load(&opencl_pending_operations) != 0);
 }
 
@@ -541,8 +454,8 @@ opencl_error_report
 void
 opencl_subscriber_callback
 (
-  opencl_call_t type,
-  uint64_t correlation_id
+ opencl_call_t type,
+ uint64_t correlation_id
 )
 {
   opencl_pending_operations_adjust(1);
@@ -555,18 +468,18 @@ opencl_subscriber_callback
   switch (type) {
     case memcpy_H2D:
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_copyin);
+           gpu_placeholder_type_copyin);
       break;
     case memcpy_D2H:
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_copyout);
+           gpu_placeholder_type_copyout);
       break;
     case kernel:
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_kernel);
+           gpu_placeholder_type_kernel);
 
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_trace);
+           gpu_placeholder_type_trace);
       break;
     default:
       assert(0);
@@ -576,19 +489,19 @@ opencl_subscriber_callback
   gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
   hpcrun_safe_exit();
 
-  gpu_activity_channel_consume(gpu_metrics_attribute);	
+  gpu_activity_channel_consume(gpu_metrics_attribute);  
   uint64_t cpu_submit_time = CPU_NANOTIME();
   gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, 
-				  cpu_submit_time);
+          cpu_submit_time);
 }
 
 
 void
 opencl_activity_completion_callback
 (
-  cl_event event,
-  cl_int event_command_exec_status,
-  void *user_data
+ cl_event event,
+ cl_int event_command_exec_status,
+ void *user_data
 )
 {
   cl_int complete_flag = CL_COMPLETE;
@@ -607,11 +520,11 @@ opencl_activity_completion_callback
       gpu_correlation_id_map_lookup(correlation_id);
     if (cid_map_entry == NULL) {
       ETMSG(OPENCL, "completion callback was called before registration " 
-	    "callback. type: %d, correlation: %"PRIu64 "", type, 
-	    correlation_id);
+      "callback. type: %d, correlation: %"PRIu64 "", type, 
+      correlation_id);
     }
     ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", 
-	  opencl_call_to_string(type), correlation_id);
+    opencl_call_to_string(type), correlation_id);
     opencl_activity_completion_notify();
     opencl_activity_process(event, act_data);
   }
@@ -624,22 +537,22 @@ opencl_activity_completion_callback
 
 
 void
-getTimingInfoFromClEvent
+opencl_timing_info_get
 (
-  gpu_interval_t *interval,
-  cl_event event
+ gpu_interval_t *interval,
+ cl_event event
 )
 {
   cl_ulong commandStart = 0;
   cl_ulong commandEnd = 0;
 
   HPCRUN_OPENCL_CALL(clGetEventProfilingInfo, 
-		     (event, CL_PROFILING_COMMAND_START, 
-		      sizeof(commandStart), &commandStart, NULL));
+         (event, CL_PROFILING_COMMAND_START, 
+          sizeof(commandStart), &commandStart, NULL));
 
   HPCRUN_OPENCL_CALL(clGetEventProfilingInfo, 
-		     (event, CL_PROFILING_COMMAND_END, 
-		      sizeof(commandEnd), &commandEnd, NULL));
+         (event, CL_PROFILING_COMMAND_END, 
+          sizeof(commandEnd), &commandEnd, NULL));
 
   set_gpu_interval(interval, (uint64_t)commandStart, (uint64_t)commandEnd);
 }
@@ -648,22 +561,22 @@ getTimingInfoFromClEvent
 void
 clSetEventCallback_wrapper
 (
-  cl_event event,
-  cl_int event_command_status,
-  void (CL_CALLBACK *pfn_notify)
-  (cl_event event, cl_int event_command_status, void *user_data),
-  void *user_data
+ cl_event event,
+ cl_int event_command_status,
+ void (CL_CALLBACK *pfn_notify)
+ (cl_event event, cl_int event_command_status, void *user_data),
+ void *user_data
 )
 {
   HPCRUN_OPENCL_CALL(clSetEventCallback, 
-		     (event, event_command_status, pfn_notify, user_data));
+         (event, event_command_status, pfn_notify, user_data));
 }
 
 
 void
 opencl_api_initialize
 (
-  void
+ void
 )
 {
   opencl_intercept_setup();
@@ -684,7 +597,7 @@ opencl_bind
   CHK_DLOPEN(opencl, opencl_path(), RTLD_NOW | RTLD_GLOBAL);
   hpcrun_force_dlopen(false);
   
-#define OPENCL_BIND(fn)				\
+#define OPENCL_BIND(fn)        \
   CHK_DLSYM(opencl, fn);
   
   FORALL_OPENCL_ROUTINES(OPENCL_BIND)
@@ -708,22 +621,22 @@ clCreateProgramWithSource
  cl_int* errcode_ret
 )
 {
-	ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
-
-	FILE *f_ptr;
-	for (int i = 0; i < (int)count; i++) {
-		// what if a single file has multiple kernels?
-		// we need to add logic to get filenames by reading the strings contents
-		char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
-		// using malloc instead of hpcrun_malloc gives extra garbage characters in file name
-		char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
-		*filename = fileno + '\0';
-		f_ptr = fopen(filename, "w");
-		fwrite(strings[i], lengths[i], 1, f_ptr);
-	}
-	fclose(f_ptr);
-	
-	return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
+  ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
+
+  FILE *f_ptr;
+  for (int i = 0; i < (int)count; i++) {
+    // what if a single file has multiple kernels?
+    // we need to add logic to get filenames by reading the strings contents
+    char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
+    // using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+    char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
+    *filename = fileno + '\0';
+    f_ptr = fopen(filename, "w");
+    fwrite(strings[i], lengths[i], 1, f_ptr);
+  }
+  fclose(f_ptr);
+  
+  return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
 }
 
 
@@ -758,32 +671,32 @@ clBuildProgram
 cl_command_queue
 clCreateCommandQueue
 (
-  cl_context context,
-  cl_device_id device,
-  cl_command_queue_properties properties,
-  cl_int *errcode_ret
+ cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret
 )
 {
   // enabling profiling
   properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE; 
 
-	return HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
-				properties,errcode_ret));	
+  return HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
+        properties,errcode_ret));  
 }
 
 
 cl_int
 clEnqueueNDRangeKernel
 (
-  cl_command_queue command_queue,
-  cl_kernel ocl_kernel,
-  cl_uint work_dim,
-  const size_t *global_work_offset, 
-  const size_t *global_work_size,
-  const size_t *local_work_size,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
+ cl_command_queue command_queue,
+ cl_kernel ocl_kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset, 
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
 )
 {
   uint64_t correlation_id = getCorrelationId();
@@ -802,16 +715,16 @@ clEnqueueNDRangeKernel
   }
   cl_int return_status = 
     HPCRUN_OPENCL_CALL(clEnqueueNDRangeKernel, (command_queue, ocl_kernel, work_dim, 
-				   global_work_offset, global_work_size, 
-				   local_work_size, num_events_in_wait_list, 
-				   event_wait_list, eventp));
+           global_work_offset, global_work_size, 
+           local_work_size, num_events_in_wait_list, 
+           event_wait_list, eventp));
 
   ETMSG(OPENCL, "registering callback for type: kernel. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+  "Correlation id: %"PRIu64 "", correlation_id);
 
   opencl_subscriber_callback(kernel_cb->type, kernel_cb->correlation_id);
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, kernel_info);
+           &opencl_activity_completion_callback, kernel_info);
   return return_status;
 }
 
@@ -819,15 +732,15 @@ clEnqueueNDRangeKernel
 cl_int
 clEnqueueReadBuffer
 (
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_read,
-  size_t offset,
-  size_t cb,
-  void *ptr,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
 )
 {
   uint64_t correlation_id = getCorrelationId();
@@ -846,19 +759,19 @@ clEnqueueReadBuffer
   }
   cl_int return_status = 
     HPCRUN_OPENCL_CALL(clEnqueueReadBuffer, (command_queue, buffer, blocking_read, offset, 
-				cb, ptr, num_events_in_wait_list, 
-				event_wait_list, eventp));
+        cb, ptr, num_events_in_wait_list, 
+        event_wait_list, eventp));
 
   ETMSG(OPENCL, "registering callback for type: D2H. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+  "Correlation id: %"PRIu64 "", correlation_id);
   ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", 
-	(long)cb);
+  (long)cb);
 
   opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
+           mem_transfer_cb->correlation_id);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, mem_info);
+           &opencl_activity_completion_callback, mem_info);
 
   return return_status;
 }
@@ -867,15 +780,15 @@ clEnqueueReadBuffer
 cl_int
 clEnqueueWriteBuffer
 (
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_write,
-  size_t offset,
-  size_t cb,
-  const void *ptr,
-  cl_uint num_events_in_wait_list,
-  const cl_event *event_wait_list,
-  cl_event *event
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t cb,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
 )
 {
   uint64_t correlation_id = getCorrelationId();
@@ -894,21 +807,19 @@ clEnqueueWriteBuffer
   }
   cl_int return_status = 
     HPCRUN_OPENCL_CALL(clEnqueueWriteBuffer, (command_queue, buffer, blocking_write, offset,
-				 cb, ptr, num_events_in_wait_list, 
-				 event_wait_list, eventp));
+         cb, ptr, num_events_in_wait_list, 
+         event_wait_list, eventp));
 
   ETMSG(OPENCL, "registering callback for type: H2D. " 
-	"Correlation id: %"PRIu64 "", correlation_id);
+  "Correlation id: %"PRIu64 "", correlation_id);
 
   ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", 
-	(long)cb);
+  (long)cb);
 
   opencl_subscriber_callback(mem_transfer_cb->type, 
-			     mem_transfer_cb->correlation_id);
+           mem_transfer_cb->correlation_id);
 
-  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, 
-			     &opencl_activity_completion_callback, 
-			     (void*) mem_info);
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE, &opencl_activity_completion_callback, (void *)mem_info);
 
   return return_status;
 }
@@ -917,7 +828,7 @@ clEnqueueWriteBuffer
 void
 opencl_api_finalize
 (
-  void *args
+ void *args
 )
 {
   opencl_wait_for_pending_operations();
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index fa2d373f12..df46d51d51 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -44,8 +44,6 @@
 #ifndef _OPENCL_API_H_
 #define _OPENCL_API_H_
 
-
-
 //******************************************************************************
 // local includes
 //******************************************************************************
@@ -55,72 +53,63 @@
 
 #include "opencl-intercept.h"
 
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
-char*
-getDebugInfoFullFileName
-(
-	void
-);
-
-
 void
 opencl_subscriber_callback
 (
-  opencl_call_t,
-  uint64_t
+ opencl_call_t,
+ uint64_t
 );
 
 
 void
 opencl_activity_completion_callback
 (
-  cl_event,
-  cl_int,
-  void *
+ cl_event,
+ cl_int,
+ void *
 );
 
 
 void
-getTimingInfoFromClEvent
+opencl_timing_info_get
 (
-  gpu_interval_t *,
-  cl_event
+ gpu_interval_t *,
+ cl_event
 );
 
 
 void
 clSetEventCallback_wrapper
 (
-  cl_event,
-  cl_int,
-  void (CL_CALLBACK*)(cl_event, cl_int, void *),
-  void *
+ cl_event,
+ cl_int,
+ void (CL_CALLBACK*)(cl_event, cl_int, void *),
+ void *
 );
 
 
 void
 opencl_api_initialize
 (
-  void
+ void
 );
 
 
 int
 opencl_bind
 (
-  void
+ void
 );
 
 
 void
 opencl_api_finalize
 (
-  void *
+ void *
 );
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index 3fa3523e3f..ba9ac4a02c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -51,7 +51,7 @@
 // local includes
 //******************************************************************************
 
-#include <hpcrun/gpu/instrumentation/opencl-instrumentation.h>
+#include <hpcrun/gpu/instrumentation/gtpin-instrumentation.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/memory/hpcrun-malloc.h>
 #include <hpcrun/messages/messages.h>
@@ -74,8 +74,8 @@ opencl_intercept_setup
 {
 #ifndef HPCRUN_STATIC_LINK
   ETMSG(OPENCL, "setting up opencl intercepts");
-	gpu_metrics_KER_BLKINFO_enable();
-  opencl_enable_profiling();
+  gpu_metrics_KER_BLKINFO_enable();
+  gtpin_enable_profiling();
 #endif
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 5bb8be1106..b8cb2de3e5 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -52,16 +52,12 @@
 
 #include <stdbool.h>
 
-
-
 //******************************************************************************
 // local includes
 //******************************************************************************
 
 #include <lib/prof-lean/hpcrun-opencl.h>
 
-
-
 //******************************************************************************
 // type declarations
 //******************************************************************************
@@ -162,7 +158,6 @@ typedef struct cl_memory_callback_t {
 } cl_memory_callback_t;
 
 
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -181,5 +176,4 @@ opencl_intercept_teardown
 );
 
 
-
 #endif  //_OPENCL_INTERCEPT_H_

From 618daf71f031c215b049d85e97228213e2c40133 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sun, 27 Sep 2020 03:16:30 +0000
Subject: [PATCH 049/177] Further refine hpcstruct

---
 lib/dtd/hpc-structure.dtd | 10 +---------
 src/lib/banal/Struct.cpp  | 18 ++----------------
 2 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/lib/dtd/hpc-structure.dtd b/lib/dtd/hpc-structure.dtd
index a6263bc564..c69edce928 100644
--- a/lib/dtd/hpc-structure.dtd
+++ b/lib/dtd/hpc-structure.dtd
@@ -15,7 +15,7 @@
   <!--   (v)ma-range-set: "{[beg-end), [beg-end)...}" -->
   <!--   (t)arget: target function address -->
   <!--   (d)evice: device name -->
-  <!ELEMENT LM (F|P|B)*>
+  <!ELEMENT LM (F|P)*>
   <!ATTLIST LM
 	i CDATA #REQUIRED
 	n CDATA #REQUIRED
@@ -34,14 +34,6 @@
 	l  CDATA #IMPLIED
 	s  CDATA #IMPLIED
 	v  CDATA #IMPLIED>
-  <!-- Basic Block: -->
-  <!ELEMENT B (I)*>
-  <!ATTLIST B
-	o CDATA #REQUIRED>
-  <!-- Instruction: -->
-  <!ELEMENT I EMPTY>
-  <!ATTLIST I
-	o CDATA #REQUIRED>
   <!-- Alien: (f)ilename -->
   <!ELEMENT A (A|L|S|C)*>
   <!ATTLIST A
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 712de20fe5..5a7bd17436 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -142,7 +142,7 @@ using namespace std;
 #endif
 
 #define DEBUG_CFG_SOURCE  0
-#define DEBUG_MAKE_SKEL   0 //1
+#define DEBUG_MAKE_SKEL   0
 #define DEBUG_SHOW_GAPS   0
 #define DEBUG_SKEL_SUMMARY  0
 
@@ -695,8 +695,7 @@ makeStructure(string filename,
 
     makeWorkList(fileMap, wlPrint, wlLaunch);
 		
-		char *elfFileRealPath = realpath(elfFile->getFileName().c_str(), NULL);
-    Output::printLoadModuleBegin(outFile, elfFileRealPath);
+    Output::printLoadModuleBegin(outFile, elfFile->getFileName());
 
 #pragma omp parallel  default(none)				\
     shared(wlPrint, wlLaunch, num_done, output_mtx)		\
@@ -1801,13 +1800,6 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
   LineMapCache lmcache (ginfo->sym_func, env.realPath);
 
   // iterate through the instructions in this block
-#if 0
-// no longer support this path
-#ifdef DYNINST_INSTRUCTION_PTR
-  map <Offset, Instruction::Ptr> imap;
-#else
-#endif
-#endif
   map <Offset, Instruction> imap;
   block->getInsns(imap);
 
@@ -1826,12 +1818,6 @@ doBlock(WorkEnv & env, GroupInfo * ginfo, ParseAPI::Function * func,
     string filenm = "";
     uint line = 0;
 
-#if 0
-#ifdef DYNINST_INSTRUCTION_PTR
-      len = iit->second->size();
-#else
-#endif
-#endif
     len = iit->second.size();
 
     lmcache.getLineInfo(vma, filenm, line);

From ef9c061f8a9a2ebc65df6e199c1a5aacd5744c6c Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris18.ftm.alcf.anl.gov>
Date: Mon, 28 Sep 2020 03:15:44 +0000
Subject: [PATCH 050/177] changing the cct_node creation for opencl
 instructions. ccts will be created from onKernelRun instead of onKernelBuild

---
 src/lib/prof-lean/hpcrun-fmt.h                |   6 +-
 src/tool/hpcrun/Makefile.am                   |   1 +
 src/tool/hpcrun/Makefile.in                   |  14 ++
 src/tool/hpcrun/gpu/gpu-activity.h            |   1 -
 src/tool/hpcrun/gpu/gpu-metrics.c             |  15 +--
 src/tool/hpcrun/gpu/gpu-metrics.h             |   2 -
 .../gtpin-instrumentation-kernel-data-map.h   |  12 +-
 .../gtpin-instrumentation-kernel-memory-map.h |   1 +
 .../kernel_runs_correlation_offset_map.c      | 125 ++++++++++++++++++
 .../kernel_runs_correlation_offset_map.h      |  59 +++++++++
 .../instrumentation/opencl-instrumentation.c  | 111 ++++++++++------
 .../instrumentation/opencl-instrumentation.h  |   2 +
 src/tool/hpcrun/metrics.c                     |   4 +-
 src/tool/hpcrun/metrics.h                     |   2 +-
 14 files changed, 290 insertions(+), 65 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h

diff --git a/src/lib/prof-lean/hpcrun-fmt.h b/src/lib/prof-lean/hpcrun-fmt.h
index 6034fa497d..3796282c33 100644
--- a/src/lib/prof-lean/hpcrun-fmt.h
+++ b/src/lib/prof-lean/hpcrun-fmt.h
@@ -168,10 +168,10 @@ hpcrun_fmt_hdr_free(hpcrun_fmt_hdr_t* hdr, hpcfmt_free_fn dealloc);
 #define HPCRUN_FMT_METRIC_SHOW_EXCLUSIVE  3
 #define HPCRUN_FMT_METRIC_INVISIBLE       4
 
-// changing the scope of metrics
+// relocating the metrics
 #define HPCRUN_FMT_METRIC_MOVE_TO_ENCLOSING_PROCEDURE       5
 #define HPCRUN_FMT_METRIC_KEEP_HERE													6
-#define HPCRUN_FMT_METRIC_PROPOGATE_TO_BASIC_BLOCKS					7
+#define HPCRUN_FMT_METRIC_COPY_TO_INST_SIBLINGS_IN_BBLOCK		7
 
 
 
@@ -277,7 +277,7 @@ typedef struct hpcrun_metricFlags_fields {
   uint16_t             partner;
   uint8_t /*bool*/     show;
   uint8_t /*bool*/     showPercent;
-  uint8_t 						 scope;
+  uint8_t 						 relocation_type;
 
   uint64_t unused1;
 } hpcrun_metricFlags_fields;
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index d1f08bf483..dd3b0649c3 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -517,6 +517,7 @@ if OPT_ENABLE_GTPIN
 MY_GTPIN_FILES = \
 	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+	gpu/instrumentation/kernel_runs_correlation_offset_map.c \
 	gpu/instrumentation/opencl-instrumentation.c
 endif
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index e553ba16d5..e127165cf7 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -544,6 +544,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+	gpu/instrumentation/kernel_runs_correlation_offset_map.c \
 	gpu/instrumentation/opencl-instrumentation.c \
 	unwind/common/backtrace.c unwind/common/unw-throw.c \
 	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
@@ -753,6 +754,7 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo
 @OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
 am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
@@ -1885,6 +1887,7 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel_runs_correlation_offset_map.c \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/opencl-instrumentation.c
 
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
@@ -2846,6 +2849,9 @@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo:  \
 gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo:  \
 	gpu/instrumentation/$(am__dirstamp) \
 	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo:  \
 	gpu/instrumentation/$(am__dirstamp) \
 	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
@@ -3745,6 +3751,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-list-map.Plo@am__quote@
@@ -5445,6 +5452,13 @@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo: gpu
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
 
+gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo: gpu/instrumentation/kernel_runs_correlation_offset_map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo `test -f 'gpu/instrumentation/kernel_runs_correlation_offset_map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel_runs_correlation_offset_map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/kernel_runs_correlation_offset_map.c' object='gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo `test -f 'gpu/instrumentation/kernel_runs_correlation_offset_map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel_runs_correlation_offset_map.c
+
 gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo: gpu/instrumentation/opencl-instrumentation.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-opencl-instrumentation.lo `test -f 'gpu/instrumentation/opencl-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/opencl-instrumentation.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-opencl-instrumentation.Plo
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 2c5d219ec5..968bf62d9f 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -261,7 +261,6 @@ typedef struct gpu_kernel_t {
 
 typedef struct gpu_kernel_block_t {
 	uint32_t correlation_id;
-	uint64_t offset;
 	uint64_t execution_count;
 	ip_normalized_t pc;
 } gpu_kernel_block_t;
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index 604d6abed8..28cf605b5a 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -199,8 +199,8 @@ name ## _metric_kind
   reg_metric->format  = FORMAT_DISPLAY_PERCENTAGE
 
 
-#define SET_SCOPE_FOR_BASIC_BLOCKS(name) \
-  hpcrun_set_scope(METRIC_ID(name), HPCRUN_FMT_METRIC_MOVE_TO_ENCLOSING_PROCEDURE); \
+#define COPY_METRIC_TO_SIBLINGS_IN_BBLOCK(name) \
+  hpcrun_set_relocation_type(METRIC_ID(name), HPCRUN_FMT_METRIC_COPY_TO_INST_SIBLINGS_IN_BBLOCK); \
 
 
 
@@ -471,10 +471,7 @@ gpu_metrics_attribute_kernel_block
   cct_node_t *cct_node = activity->cct_node;
 
 	metric_data_list_t *metrics = 
-		hpcrun_reify_metric_set(cct_node, METRIC_ID(KER_BLK_OFFSET));
-
-	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(KER_BLK_OFFSET), 
-					 b->offset);
+		hpcrun_reify_metric_set(cct_node, METRIC_ID(KER_BLK_EXECUTION_COUNT));
 
 	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(KER_BLK_EXECUTION_COUNT),	// need to increment execution count for existing ccts
 					 b->execution_count);
@@ -754,13 +751,13 @@ gpu_metrics_KER_BLKINFO_enable
 {
 // kernel block characteristics metrics
 #undef CURRENT_METRIC 
-#define CURRENT_METRIC KER_BLKINFO
+#define CURRENT_METRIC GPU_INST // we are copying from gpu_metrics_GPU_INST_enable(). confirm 
 
   INITIALIZE_METRIC_KIND();
 
-	SET_SCOPE_FOR_BASIC_BLOCKS(KER_BLK_EXECUTION_COUNT);
+	COPY_METRIC_TO_SIBLINGS_IN_BBLOCK(GPU_INST_ALL);
 
-  FORALL_KER_BLKINFO(INITIALIZE_SCALAR_METRIC_INT)
+  FORALL_GPU_INST(INITIALIZE_SCALAR_METRIC_INT)
 
   FINALIZE_METRIC_KIND();
 }
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 843a7f2288..d466cbeeba 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -382,8 +382,6 @@ typedef enum {
   FORALL_GSAMP_REAL(macro)				
 
 #define FORALL_KER_BLKINFO(macro)		\
-  macro("KER:BLK_OFST (B)",            KER_BLK_OFFSET,		\
-	"block offset with respect to kernel binary")		\
   macro("KER:BLK_EXEC_COUNT",            KER_BLK_EXECUTION_COUNT,		\
 	"count of number of dynamic executions of block")
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
index 5324466751..8e4db7f0b8 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
@@ -4,7 +4,6 @@
 //******************************************************************************
 
 #include <stdint.h>
-//#include <gtpin.h>
 
 
 
@@ -12,12 +11,15 @@
 // type declarations
 //******************************************************************************
 
+typedef struct kernel_offset {	// node of a singly linked list of instruction offsets
+	uint32_t offset;	// NOTE(review): unsigned here, yet assigned from int32_t offsets in onKernelBuild; confirm intended
+	struct kernel_offset *next;	// following node in the list
+} kernel_offset;
+
+
 typedef struct KernelData {
-	uint64_t kernel_cct_correlation_id;
-  char *name;
-  uint32_t call_count;
 	uint32_t loadmap_module_id;
-	//block_map_t *block_map_root;
+	kernel_offset *offset_head;
 } KernelData;
 
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
index c9f0b89686..90446b6b9b 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
@@ -13,6 +13,7 @@
 
 typedef struct mem_pair_node {
 	int32_t offset;
+	int32_t endOffset;
 	GTPinMem mem;
 	struct mem_pair_node *next;
 } mem_pair_node;
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c b/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c
new file mode 100644
index 0000000000..d7160eeab4
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c
@@ -0,0 +1,125 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <string.h>
+#include <assert.h>
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <lib/prof-lean/splay-uint64.h>
+
+#include "kernel_runs_correlation_offset_map.h"
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+// typed splay insert for kernel_correlation_offset_map nodes
+#define kco_insert typed_splay_insert(kernel_correlation_offset_map)
+
+// typed splay lookup for kernel_correlation_offset_map nodes
+#define kco_lookup typed_splay_lookup(kernel_correlation_offset_map)
+
+// typed splay delete for kernel_correlation_offset_map nodes
+#define kco_delete typed_splay_delete(kernel_correlation_offset_map)
+
+// typed splay forall for kernel_correlation_offset_map nodes (not referenced in this file)
+#define kco_forall typed_splay_forall(kernel_correlation_offset_map)
+
+// typed splay count for kernel_correlation_offset_map nodes (not referenced in this file)
+#define kco_count typed_splay_count(kernel_correlation_offset_map)
+
+// node allocation; free_list is forwarded to typed_splay_alloc together with the node type
+#define kco_alloc(free_list) typed_splay_alloc(free_list, kernel_correlation_offset_map_t)
+
+// node release; both arguments are forwarded unchanged to typed_splay_free
+#define kco_free(free_list, node) typed_splay_free(free_list, node)
+
+typed_splay_impl(kernel_correlation_offset_map);
+
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static kernel_correlation_offset_map_t *kernel_correlation_offset_map_root;	// splay tree root; static storage, so it starts out NULL
+static kernel_correlation_offset_map_t *kernel_correlation_offset_map_free_list;	// list handed to kco_alloc/kco_free; starts out NULL
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static kernel_correlation_offset_map_t *
+kernel_data_alloc(void)	// "(void)" gives a real prototype; "()" leaves the parameter list unspecified before C23
+{
+  return kco_alloc(&kernel_correlation_offset_map_free_list);	// node is obtained through the file-local free list
+}
+
+// Create a map node for key GTPinKernelExec_id whose list head is data.
+static kernel_correlation_offset_map_t *
+kernel_data_new
+(
+	uint64_t GTPinKernelExec_id,
+	kernel_runs_correlation_offset *data
+)
+{
+  kernel_correlation_offset_map_t *node = kernel_data_alloc();
+  memset(node, 0, sizeof *node);	// zero every field, the left/right links included
+  node->head = data;
+  node->GTPinKernelExec_id = GTPinKernelExec_id;
+  return node;
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Look up GTPinKernelExec_id in the map and return whatever kco_lookup yields for it.
+kernel_correlation_offset_map_t*
+kernel_correlation_offset_map_lookup1
+(
+	uint64_t GTPinKernelExec_id
+)
+{
+  return kco_lookup(&kernel_correlation_offset_map_root, GTPinKernelExec_id);
+}
+
+// Add a map entry that associates GTPinKernelExec_id with the list headed by data.
+void
+kernel_correlation_offset_map_insert1
+(
+	uint64_t GTPinKernelExec_id,
+	kernel_runs_correlation_offset *data
+)
+{
+	if (kco_lookup(&kernel_correlation_offset_map_root, GTPinKernelExec_id) != NULL) {
+		assert(0);	// entry for a given key should be inserted only once
+		return;	// with assertions compiled out, the duplicate is dropped
+	}
+	kernel_correlation_offset_map_t *node = kernel_data_new(GTPinKernelExec_id, data);
+	kco_insert(&kernel_correlation_offset_map_root, node);
+}
+
+// Remove the entry for GTPinKernelExec_id, if one exists, and give its node back to the free list.
+void
+kernel_correlation_offset_map_delete1
+(
+	uint64_t GTPinKernelExec_id
+)
+{
+	kernel_correlation_offset_map_t *node = kco_delete(&kernel_correlation_offset_map_root, GTPinKernelExec_id);
+	if (node != NULL) { kco_free(&kernel_correlation_offset_map_free_list, node); }	// presumably NULL when the key is absent; skip the free then
+}
+
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h b/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h
new file mode 100644
index 0000000000..025bd061b7
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h
@@ -0,0 +1,59 @@
+#ifndef KERNEL_RUNS_CORRELATION_OFFSET_MAP_H
+#define KERNEL_RUNS_CORRELATION_OFFSET_MAP_H
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <stdint.h>
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+// singly linked list node pairing an instruction offset with a correlation id
+typedef struct kernel_runs_correlation_offset {
+	int32_t offset;
+	uint32_t correlation_id;
+	struct kernel_runs_correlation_offset *next;
+} kernel_runs_correlation_offset;
+
+#undef typed_splay_node
+#define typed_splay_node(kernel_correlation_offset_map) kernel_correlation_offset_map_t
+
+// splay tree node keyed by GTPinKernelExec_id; head is the list stored under that key
+typedef struct typed_splay_node(kernel_correlation_offset_map) {
+  struct typed_splay_node(kernel_correlation_offset_map) *left;
+  struct typed_splay_node(kernel_correlation_offset_map) *right;
+  uint64_t GTPinKernelExec_id; // key
+
+	kernel_runs_correlation_offset *head;
+}typed_splay_node(kernel_correlation_offset_map);
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// look up the map entry stored under the given key
+kernel_correlation_offset_map_t*
+kernel_correlation_offset_map_lookup1
+(
+	uint64_t
+);
+
+// add an entry for the key with the given list head; a key must be inserted only once
+void
+kernel_correlation_offset_map_insert1
+(
+	uint64_t,
+	kernel_runs_correlation_offset *
+);
+
+// remove the entry stored under the key and hand its node to the free list
+void
+kernel_correlation_offset_map_delete1
+(
+	uint64_t
+);
+
+#endif  // KERNEL_RUNS_CORRELATION_OFFSET_MAP_H
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
index 06519445cf..83a881d947 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.c
@@ -291,7 +291,6 @@ kernelBlockActivityTranslate
   ga->details.kernel_block.correlation_id = correlation_id;
   ga->details.kernel_block.pc.lm_id = (uint16_t)loadmap_module_id;
   ga->details.kernel_block.pc.lm_ip = (uintptr_t)offset;
-  ga->details.kernel_block.offset = offset;
   ga->details.kernel_block.execution_count = execution_count;
   ga->kind = GPU_ACTIVITY_KERNEL_BLOCK;
 
@@ -327,59 +326,69 @@ onKernelBuild
   assert(kernel_data_map_lookup1((uint64_t)kernel) == 0);
 
   KernelData data;
-
-  uint32_t correlation_id = getCorrelationId();
-  data.kernel_cct_correlation_id = correlation_id;
-  createKernelNode(correlation_id);
-
-  data.call_count = 0;
   data.loadmap_module_id = findOrAddKernelModule(kernel);
-
-  kernel_data_map_insert1((uint64_t)kernel, data);
-
+	
+	kernel_offset *offset_head = NULL;
   mem_pair_node *h;
-  mem_pair_node *current;
+  mem_pair_node *m_current;
+  kernel_offset *k_current;
   bool isHeadNull = true;
 
   for (GTPinBBL block = GTPin_BBLHead(kernel); GTPin_BBLValid(block); block = GTPin_BBLNext(block)) {
     GTPinINS head = GTPin_InsHead(block);
+    GTPinINS tail = GTPin_InsTail(block);
     assert(GTPin_InsValid(head));
-
-    int32_t offset = GTPin_InsOffset(head);
+    int32_t head_offset = GTPin_InsOffset(head);
+    int32_t tail_offset = GTPin_InsOffset(tail);
 
     GTPinMem mem = NULL;
     status = GTPin_MemClaim(kernel, sizeof(uint32_t), &mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
-
     status = GTPin_OpcodeprofInstrument(head, mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
     // TODO(Aaron): when using hpcrun_malloc, find a way to recycle memory
     mem_pair_node *m = hpcrun_malloc(sizeof(mem_pair_node));
-    m->offset = offset;
+    m->offset = head_offset;
+    m->endOffset = tail_offset;
     m->mem = mem;
     m->next = NULL;
 
     if (isHeadNull == true) {
       h = m;
-      current = m;
+      m_current = m;
       isHeadNull = false;
     } else {
-      current->next = m;
-      current = current->next;
+      m_current->next = m;
+      m_current = m_current->next;
     }
+		
+		// while loop that iterates for each instruction in the block and adds an offset entry in map
+		int32_t offset = head_offset;
+		GTPinINS inst = GTPin_InsHead(block);
+		int count = 0;
+		while (offset <= tail_offset && offset != -1) {
+			kernel_offset *ko = hpcrun_malloc(sizeof(kernel_offset));
+			ko->offset = offset;
+			if (offset_head == NULL) {
+				offset_head = ko;	
+				k_current = ko;
+			} else {
+				k_current->next = ko;
+				k_current = k_current->next;
+			}
+			inst = GTPin_InsNext(inst);
+			offset = GTPin_InsOffset(inst);
+		}
   }
+	data.offset_head = offset_head;
   if (h != NULL) {
     // TODO(Aaron): naming insert1/insert2 is confusing
     kernel_memory_map_insert1((uint64_t)kernel, h);
+		kernel_data_map_insert1((uint64_t)kernel, data);
   }
-
-  gpu_activity_channel_consume(gpu_metrics_attribute);
-  // 
   // m->next = NULL;
   // add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
-  // XXX(Aaron): what is this for?
-  //data.name = kernel_name;
   ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
 }
 
@@ -394,6 +403,31 @@ onKernelRun
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPin_KernelProfilingActive(kernelExec, 1);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+  GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
+  kernel_offset *offset_head = kernel_data_map_lookup1((uint64_t)kernel)->data.offset_head;
+	kernel_offset *current = offset_head;
+
+	kernel_runs_correlation_offset *kco_head, *co_current;
+	kco_head = hpcrun_malloc(sizeof(kernel_runs_correlation_offset));
+	co_current = kco_head;
+	uint32_t correlation_id = getCorrelationId();
+	createKernelNode(correlation_id);
+	kco_head->correlation_id = correlation_id;
+	kco_head->offset = offset_head->offset;
+
+	while (current->next != NULL) {
+		current = current->next;
+		correlation_id = getCorrelationId();
+		createKernelNode(correlation_id);
+		// save id=GTPinKernelExec and value=correlation_id in another map
+		kernel_runs_correlation_offset *kco = hpcrun_malloc(sizeof(kernel_runs_correlation_offset));
+		kco->correlation_id = correlation_id;
+		kco->offset = current->offset;
+		co_current->next = kco;
+		co_current = co_current->next;
+	}
+	kernel_correlation_offset_map_insert1((uint64_t)kernelExec, kco_head);
 }
 
 
@@ -416,18 +450,12 @@ onKernelComplete
   kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
   mem_pair_node *block = kernel_memory_list->head;
 
-  // get kernel cct root node from correlation_id
-  uint32_t correlation_id = data.kernel_cct_correlation_id;
+	kernel_runs_correlation_offset *kco_head = kernel_correlation_offset_map_lookup1((uint64_t)kernelExec);
+	kernel_runs_correlation_offset *kco_curr = kco_head;
+  uint32_t correlation_id = kco_curr->correlation_id;
+	printf("correlation_id: %d. ptr: %p\n", correlation_id, kco_curr);
 
   while (block != NULL) {
-    /*!
-     * @return sampling size for mem handle
-     * @ingroup MEM
-     * @param[in]   mem     the memory handle
-     *
-     * @par Availability:
-     * - all callbacks
-     */
     uint32_t thread_count = GTPin_MemSampleLength(block->mem);
     assert(thread_count > 0);
 
@@ -437,20 +465,19 @@ onKernelComplete
       assert(status == GTPINTOOL_STATUS_SUCCESS);
       total += value;
     }
-
-    //block_map_t *bm = block_map_lookup1(data.block_map_root, block->offset);
-    //assert(bm != 0);
     uint64_t execution_count = total; // + bm->val 
-    //block_map_insert1(data.block_map_root, block->offset, execution_count);
 
-    activityNotify();  
-    gpu_activity_t gpu_activity;
-    kernelBlockActivityProcess(&gpu_activity, correlation_id, data.loadmap_module_id, block->offset, execution_count);
+    activityNotify();
+		printf("correlation_id: %d. ptr: %d\n", correlation_id, kco_curr->offset);
+		while(kco_curr->offset != block->endOffset) {
+			gpu_activity_t gpu_activity;
+	    kernelBlockActivityProcess(&gpu_activity, kco_curr->correlation_id,
+						data.loadmap_module_id, kco_curr->offset, execution_count);
+			kco_curr = kco_curr->next;
+		}
     block = block->next;
     //how to make offset the primary key within the cct and += the execution value for existing ccts?
   }
-
-  ++(data.call_count);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
index 53dee81047..1633b032f9 100644
--- a/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/opencl-instrumentation.h
@@ -47,6 +47,8 @@
 
 #include "gtpin-instrumentation-kernel-memory-map.h"
 #include "gtpin-instrumentation-kernel-data-map.h"
+//#include "kernel_offset_map.h"
+#include "kernel_runs_correlation_offset_map.h"
 
 
 
diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c
index e474ae9b66..a4f444e750 100644
--- a/src/tool/hpcrun/metrics.c
+++ b/src/tool/hpcrun/metrics.c
@@ -311,9 +311,9 @@ void hpcrun_set_percent(int metric_id, uint8_t show_percent) {
 }
 
 
-void hpcrun_set_scope(int metric_id, uint8_t scope) {
+void hpcrun_set_relocation_type(int metric_id, uint8_t relocation_type) {
   metric_desc_t* mdesc = hpcrun_id2metric_linked(metric_id);
-  mdesc->flags.fields.scope = scope;
+  mdesc->flags.fields.relocation_type = relocation_type;
 }
 
 
diff --git a/src/tool/hpcrun/metrics.h b/src/tool/hpcrun/metrics.h
index 3a45db3479..c30917aec4 100644
--- a/src/tool/hpcrun/metrics.h
+++ b/src/tool/hpcrun/metrics.h
@@ -128,7 +128,7 @@ void hpcrun_set_display(int metric_id, uint8_t show);
 
 void hpcrun_set_percent(int metric_id, uint8_t show_percent);
 
-void hpcrun_set_scope(int metric_id, uint8_t scope);
+void hpcrun_set_relocation_type(int metric_id, uint8_t relocation_type);
 
 metric_desc_p_tbl_t* hpcrun_get_metric_tbl(kind_info_t**);
 

From 0693153d15c3da15f4faa4ddcc096e838bd20501 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris16.ftm.alcf.anl.gov>
Date: Tue, 29 Sep 2020 15:47:03 +0000
Subject: [PATCH 051/177] refactoring opencl instrumentation code. instrumented
 metrics not visible in hpcviewer

---
 .../hpcrun/gpu/instrumentation/gtpin-instrumentation.c    | 8 ++------
 .../hpcrun/gpu/instrumentation/gtpin-instrumentation.h    | 1 -
 src/tool/hpcrun/gpu/opencl/opencl-api.c                   | 4 +++-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c             | 4 +++-
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 6d2cb784a9..01f3cf17af 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -74,8 +74,6 @@
 #include <hpcrun/gpu/gpu-op-placeholders.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
-#include <hpcrun/gpu/opencl/opencl-api.h>
-#include <hpcrun/gpu/opencl/opencl-intercept.h>
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 
 #include <lib/prof-lean/crypto-hash.h>
@@ -449,10 +447,9 @@ onKernelComplete
   kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
   mem_pair_node *block = kernel_memory_list->head;
 
-	kernel_runs_correlation_offset *kco_head = kernel_correlation_offset_map_lookup1((uint64_t)kernelExec);
+	kernel_runs_correlation_offset *kco_head = kernel_correlation_offset_map_lookup1((uint64_t)kernelExec)->head;
 	kernel_runs_correlation_offset *kco_curr = kco_head;
   uint32_t correlation_id = kco_curr->correlation_id;
-	printf("correlation_id: %d. ptr: %p\n", correlation_id, kco_curr);
 
   while (block != NULL) {
     uint32_t thread_count = GTPin_MemSampleLength(block->mem);
@@ -467,7 +464,7 @@ onKernelComplete
     uint64_t execution_count = total; // + bm->val 
 
     gpu_activity_t gpu_activity;
-		printf("correlation_id: %d. ptr: %d\n", correlation_id, kco_curr->offset);
+		activityNotify();  
 		while(kco_curr->offset != block->endOffset) {
 			gpu_activity_t gpu_activity;
 	    kernelBlockActivityProcess(&gpu_activity, kco_curr->correlation_id,
@@ -477,7 +474,6 @@ onKernelComplete
     block = block->next;
     //how to make offset the primary key within the cct and += the execution value for existing ccts?
   }
-  activityNotify();  
 }
 
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
index 93c24eb13b..00f3171fc3 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
@@ -47,7 +47,6 @@
 
 #include "gtpin-instrumentation-kernel-memory-map.h"
 #include "gtpin-instrumentation-kernel-data-map.h"
-//#include "kernel_offset_map.h"
 #include "kernel_runs_correlation_offset_map.h"
 
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index ebd5edfdd0..a2323bc255 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -69,6 +69,7 @@
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
 #include <hpcrun/gpu/gpu-op-placeholders.h>
+#include <hpcrun/gpu/instrumentation/gtpin-instrumentation.h>
 #include <hpcrun/messages/messages.h>
 #include <hpcrun/sample-sources/libdl.h>
 #include <hpcrun/files.h>
@@ -579,7 +580,8 @@ opencl_api_initialize
  void
 )
 {
-  opencl_intercept_setup();
+  gpu_metrics_GPU_INST_enable();
+  gtpin_enable_profiling();
   atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index ba9ac4a02c..0e2e10bfc8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -66,6 +66,8 @@
 #include "opencl-intercept.h"
 
 
+// TODO: This file is no longer needed. To be deleted
+
 void
 opencl_intercept_setup
 (
@@ -74,7 +76,7 @@ opencl_intercept_setup
 {
 #ifndef HPCRUN_STATIC_LINK
   ETMSG(OPENCL, "setting up opencl intercepts");
-  gpu_metrics_KER_BLKINFO_enable();
+  gpu_metrics_GPU_INST_enable();
   gtpin_enable_profiling();
 #endif
 }

From 4b0c48ae1ea69bff009a736ebacaf8cfab9abd4b Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Tue, 29 Sep 2020 23:39:48 +0000
Subject: [PATCH 052/177] Refactor GTPin code: (1) use structured blocks and
 instructions to store offsets; (2) simplify instruction attribution

---
 src/tool/hpcrun/Makefile.am                   |   4 +-
 src/tool/hpcrun/Makefile.in                   |  48 +----
 src/tool/hpcrun/gpu/gpu-activity-process.c    |  45 ++---
 src/tool/hpcrun/gpu/gpu-activity.h            |  10 +-
 src/tool/hpcrun/gpu/gpu-metrics.c             | 179 ++++++++----------
 src/tool/hpcrun/gpu/gpu-metrics.h             |  11 --
 .../gtpin-instrumentation-kernel-data-map.c   | 125 ------------
 .../gtpin-instrumentation-kernel-data-map.h   |  64 -------
 .../gtpin-instrumentation-kernel-memory-map.c | 126 ------------
 .../gtpin-instrumentation-kernel-memory-map.h |  61 ------
 .../instrumentation/gtpin-instrumentation.c   | 176 +++++++----------
 .../instrumentation/gtpin-instrumentation.h   |   9 +-
 .../gpu/instrumentation/kernel-data-map.c     | 142 ++++++++++++++
 .../gpu/instrumentation/kernel-data-map.h     |  45 +++++
 .../hpcrun/gpu/instrumentation/kernel-data.h  |  34 ++++
 .../kernel_runs_correlation_offset_map.c      | 125 ------------
 .../kernel_runs_correlation_offset_map.h      |  59 ------
 17 files changed, 410 insertions(+), 853 deletions(-)
 delete mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
 delete mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
 delete mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
 delete mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel-data-map.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel-data-map.h
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel-data.h
 delete mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c
 delete mode 100644 src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 105c0c9bd2..82345075f2 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -515,9 +515,7 @@ endif
 
 if OPT_ENABLE_GTPIN
 MY_GTPIN_FILES = \
-	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-	gpu/instrumentation/kernel_runs_correlation_offset_map.c \
+	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c
 endif
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index f5107f9f96..f0276123bc 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -542,9 +542,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
-	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-	gpu/instrumentation/kernel_runs_correlation_offset_map.c \
+	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
 	unwind/common/backtrace.c unwind/common/unw-throw.c \
 	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
@@ -752,9 +750,7 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
-@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo \
+@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo
 @OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
 am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
@@ -1885,9 +1881,7 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c 
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel_runs_correlation_offset_map.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation.c
 
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
@@ -2843,13 +2837,7 @@ gpu/instrumentation/$(am__dirstamp):
 gpu/instrumentation/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) gpu/instrumentation/$(DEPDIR)
 	@: > gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo:  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo:  \
-	gpu/instrumentation/$(am__dirstamp) \
-	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
-gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo:  \
+gpu/instrumentation/libhpcrun_la-kernel-data-map.lo:  \
 	gpu/instrumentation/$(am__dirstamp) \
 	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo:  \
@@ -3749,10 +3737,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-list-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-process.Plo@am__quote@
@@ -5438,26 +5424,12 @@ gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
 
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-data-map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-data-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
-
-gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo: gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation-kernel-memory-map.lo `test -f 'gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
-
-gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo: gpu/instrumentation/kernel_runs_correlation_offset_map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo `test -f 'gpu/instrumentation/kernel_runs_correlation_offset_map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel_runs_correlation_offset_map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel_runs_correlation_offset_map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/kernel_runs_correlation_offset_map.c' object='gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/instrumentation/libhpcrun_la-kernel-data-map.lo: gpu/instrumentation/kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/kernel-data-map.c' object='gpu/instrumentation/libhpcrun_la-kernel-data-map.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-kernel_runs_correlation_offset_map.lo `test -f 'gpu/instrumentation/kernel_runs_correlation_offset_map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel_runs_correlation_offset_map.c
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
 
 gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo: gpu/instrumentation/gtpin-instrumentation.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo `test -f 'gpu/instrumentation/gtpin-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation.c
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 0b67c62c92..552e5ee13e 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -427,37 +427,26 @@ gpu_kernel_block_process
  gpu_activity_t* activity
 )
 {
-  uint32_t correlation_id = activity->details.kernel_block.correlation_id;
+  uint64_t external_id = activity->details.kernel_block.external_id;
+  ip_normalized_t ip = activity->details.kernel_block.pc;
 
-  gpu_correlation_id_map_entry_t *cid_map_entry =
-    gpu_correlation_id_map_lookup(correlation_id);
-
-  if (cid_map_entry != NULL) {
-    uint64_t external_id =
-      gpu_correlation_id_map_entry_external_id_get(cid_map_entry);
+  gpu_host_correlation_map_entry_t *host_op_entry =
+    gpu_host_correlation_map_lookup(external_id);
 
-    ip_normalized_t ip = activity->details.kernel_block.pc;
-
-    gpu_host_correlation_map_entry_t *host_op_entry =
-      gpu_host_correlation_map_lookup(external_id);
-
-    if (host_op_entry != NULL) {
-      PRINT("external_id %lu\n", external_id);
+  if (host_op_entry != NULL) {
+    PRINT("external_id %lu\n", external_id);
 
-      cct_node_t *host_op_node =
-        gpu_host_correlation_map_entry_op_function_get(host_op_entry);
+    cct_node_t *host_op_node =
+      gpu_host_correlation_map_entry_op_function_get(host_op_entry);
 
-			// create a child cct node that contains 2 metrics: offset of block head wrt. original binary, dynamic execution count of block
-      cct_node_t *cct_child = hpcrun_cct_insert_ip_norm(host_op_node, ip); // how to set the ip_norm
-      if (cct_child) {
-        PRINT("cct_child %p\n", cct_child);
-        attribute_activity(host_op_entry, activity, cct_child);
-      }
-    } else {
-      PRINT("host_map_entry %lu not found\n", external_id);
+    // create a child cct node that contains 2 metrics: offset of block head wrt. original binary, dynamic execution count of block
+    cct_node_t *cct_child = hpcrun_cct_insert_ip_norm(host_op_node, ip); // how to set the ip_norm
+    if (cct_child) {
+      PRINT("cct_child %p\n", cct_child);
+      attribute_activity(host_op_entry, activity, cct_child);
     }
   } else {
-    PRINT("correlation_id_map_entry %u not found\n", correlation_id);
+    PRINT("host_map_entry %lu not found\n", external_id);
   }
 }
 
@@ -661,9 +650,9 @@ gpu_activity_process
     gpu_kernel_process(ga);
     break;
 
-	case GPU_ACTIVITY_KERNEL_BLOCK:
-		gpu_kernel_block_process(ga);
-		break;
+  case GPU_ACTIVITY_KERNEL_BLOCK:
+    gpu_kernel_block_process(ga);
+    break;
 
   case GPU_ACTIVITY_SYNCHRONIZATION:
     gpu_synchronization_process(ga);
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 968bf62d9f..4b6a5a22de 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -80,7 +80,7 @@ typedef struct gpu_activity_channel_t gpu_activity_channel_t;
 typedef enum {    
   GPU_ACTIVITY_UNKNOWN                 = 0,
   GPU_ACTIVITY_KERNEL                  = 1,
-	GPU_ACTIVITY_KERNEL_BLOCK						 = 2,	
+  GPU_ACTIVITY_KERNEL_BLOCK             = 2,  
   GPU_ACTIVITY_MEMCPY                  = 3,
   GPU_ACTIVITY_MEMCPY2                 = 4,
   GPU_ACTIVITY_MEMSET                  = 5,
@@ -260,9 +260,9 @@ typedef struct gpu_kernel_t {
 
 
 typedef struct gpu_kernel_block_t {
-	uint32_t correlation_id;
-	uint64_t execution_count;
-	ip_normalized_t pc;
+  uint64_t external_id;
+  uint64_t execution_count;
+  ip_normalized_t pc;
 } gpu_kernel_block_t;
 
 
@@ -359,7 +359,7 @@ typedef struct gpu_activity_details_t {
     gpu_memory_t memory;
     gpu_memset_t memset;
     gpu_kernel_t kernel;
-		gpu_kernel_block_t kernel_block;
+    gpu_kernel_block_t kernel_block;
     gpu_function_t function;
     gpu_cdpkernel_t cdpkernel;
     gpu_event_t event;
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index 28cf605b5a..9fffadcbce 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -68,28 +68,28 @@
 #define FORMAT_DISPLAY_PERCENTAGE     "%6.2f %%"
 #define FORMAT_DISPLAY_INT            "%6.0f"
 
-#define FORALL_INDEXED_METRIC_KINDS(macro)	\
-  macro(GMEM, 0)				\
-  macro(GMSET, 1)				\
-  macro(GPU_INST_STALL, 2)			\
-  macro(GXCOPY, 3)				\
-  macro(GSYNC, 4)				\
-  macro(GGMEM, 5)				\
+#define FORALL_INDEXED_METRIC_KINDS(macro)  \
+  macro(GMEM, 0)  \
+  macro(GMSET, 1)  \
+  macro(GPU_INST_STALL, 2)  \
+  macro(GXCOPY, 3)  \
+  macro(GSYNC, 4)  \
+  macro(GGMEM, 5)  \
   macro(GLMEM, 6)
 
 
-#define FORALL_SCALAR_METRIC_KINDS(macro)	\
-  macro(GBR, 7)					\
-  macro(GICOPY, 8)				\
-  macro(GPU_INST, 9)				\
-  macro(GTIMES, 10)				\
-  macro(KINFO, 12)				\
-  macro(GSAMP, 13)				\
-	macro(KER_BLKINFO, 14)
+#define FORALL_SCALAR_METRIC_KINDS(macro)  \
+  macro(GBR, 7)  \
+  macro(GICOPY, 8)  \
+  macro(GPU_INST, 9)  \
+  macro(GTIMES, 10)  \
+  macro(KINFO, 12)  \
+  macro(GSAMP, 13)  \
+  macro(KER_BLKINFO, 14)
 
 
-#define FORALL_METRIC_KINDS(macro)	\
-  FORALL_INDEXED_METRIC_KINDS(macro)	\
+#define FORALL_METRIC_KINDS(macro)  \
+  FORALL_INDEXED_METRIC_KINDS(macro)  \
   FORALL_SCALAR_METRIC_KINDS(macro)
 
 
@@ -102,22 +102,22 @@
 #define COUNT_FORALL_CLAUSE(a,b,c) + 1
 #define NUM_CLAUSES(forall_macro) 0 forall_macro(COUNT_FORALL_CLAUSE)
 
-#define METRIC_KIND(name)			\
+#define METRIC_KIND(name)      \
 name ## _metric_kind
 
-#define INITIALIZE_METRIC_KINDS(name, value)	\
+#define INITIALIZE_METRIC_KINDS(name, value)  \
   static kind_info_t * METRIC_KIND(name) = NULL;
 
 #define METRIC_ID(name) \
   name ## _metric_id
 
-#define INITIALIZE_INDEXED_METRIC(name, value)		\
+#define INITIALIZE_INDEXED_METRIC(name, value)    \
   static int METRIC_ID(name)[NUM_CLAUSES( FORALL_ ## name)];
 
-#define INITIALIZE_SCALAR_METRIC(string, name, desc)	\
+#define INITIALIZE_SCALAR_METRIC(string, name, desc)  \
   static int METRIC_ID(name);
 
-#define INITIALIZE_SCALAR_METRIC_KIND(kind, value)	\
+#define INITIALIZE_SCALAR_METRIC_KIND(kind, value)  \
   FORALL_ ## kind (INITIALIZE_SCALAR_METRIC)
 
 //------------------------------------------------------------------------------
@@ -126,55 +126,55 @@ name ## _metric_kind
 
 #define APPLY(f,n) f(n)
 
-#define INITIALIZE_METRIC_KIND()					\
+#define INITIALIZE_METRIC_KIND()          \
   APPLY(METRIC_KIND,CURRENT_METRIC) = hpcrun_metrics_new_kind()
 
 
-#define FINALIZE_METRIC_KIND()				\
+#define FINALIZE_METRIC_KIND()        \
   hpcrun_close_kind(APPLY(METRIC_KIND,CURRENT_METRIC))
 
 
-#define INITIALIZE_INDEXED_METRIC_INT(metric_name, index, metric_desc)	\
-   APPLY(METRIC_ID,CURRENT_METRIC)[index] =				\
-     hpcrun_set_new_metric_desc_and_period				\
-     (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,	\
+#define INITIALIZE_INDEXED_METRIC_INT(metric_name, index, metric_desc)  \
+   APPLY(METRIC_ID,CURRENT_METRIC)[index] =        \
+     hpcrun_set_new_metric_desc_and_period        \
+     (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,  \
      MetricFlags_ValFmt_Int, 1, metric_property_none);
 
 
-#define INITIALIZE_INDEXED_METRIC_REAL(metric_name, index, metric_desc)	\
-  APPLY(METRIC_ID,CURRENT_METRIC)[index] =				\
-    hpcrun_set_new_metric_desc_and_period				\
-    (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,	\
+#define INITIALIZE_INDEXED_METRIC_REAL(metric_name, index, metric_desc)  \
+  APPLY(METRIC_ID,CURRENT_METRIC)[index] =        \
+    hpcrun_set_new_metric_desc_and_period        \
+    (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,  \
      MetricFlags_ValFmt_Real, 1, metric_property_none);
 
 
 #define INITIALIZE_SCALAR_METRIC_INT(metric_name, metric_var, metric_desc) \
-  METRIC_ID(metric_var) =						\
-    hpcrun_set_new_metric_desc_and_period				\
-    (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,	\
+  METRIC_ID(metric_var) =            \
+    hpcrun_set_new_metric_desc_and_period        \
+    (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,  \
      MetricFlags_ValFmt_Int, 1, metric_property_none);
 
 
 #define INITIALIZE_SCALAR_METRIC_REAL(metric_name, metric_var, metric_desc) \
-  METRIC_ID(metric_var) =						\
-    hpcrun_set_new_metric_desc_and_period				\
-    (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,	\
+  METRIC_ID(metric_var) =            \
+    hpcrun_set_new_metric_desc_and_period        \
+    (APPLY(METRIC_KIND,CURRENT_METRIC), metric_name, metric_desc,  \
      MetricFlags_ValFmt_Real, 1, metric_property_none);
 
 
-#define SET_DISPLAY_INDEXED_METRIC(name, index, val)			\
+#define SET_DISPLAY_INDEXED_METRIC(name, index, val)      \
   hpcrun_set_display(APPLY(METRIC_ID,CURRENT_METRIC)[index], val);
 
 
-#define SET_DISPLAY_SCALAR_METRIC(name, val)			\
+#define SET_DISPLAY_SCALAR_METRIC(name, val)      \
   hpcrun_set_display(APPLY(METRIC_ID,name), val);
 
 
-#define HIDE_INDEXED_METRIC(string, name, desc)				\
+#define HIDE_INDEXED_METRIC(string, name, desc)        \
     SET_DISPLAY_INDEXED_METRIC(name,  name, HPCRUN_FMT_METRIC_HIDE);
 
 
-#define HIDE_SCALAR_METRIC(string, name, desc)				\
+#define HIDE_SCALAR_METRIC(string, name, desc)        \
     SET_DISPLAY_SCALAR_METRIC(name,  HPCRUN_FMT_METRIC_HIDE);
 
 
@@ -280,7 +280,7 @@ gpu_metrics_attribute_pc_sampling
 
   // instruction execution metric
   gpu_metrics_attribute_metric_int(inst_metric, METRIC_ID(GPU_INST_ALL), 
-				   inst_count);
+           inst_count);
 
   if (sinfo->stallReason != GPU_INST_STALL_INVALID) {
     int stall_summary_metric_index = 
@@ -296,12 +296,12 @@ gpu_metrics_attribute_pc_sampling
     if (sinfo->stallReason != GPU_INST_STALL_NONE) {
       // stall summary metric
       gpu_metrics_attribute_metric_int(stall_metrics, 
-				       stall_summary_metric_index, stall_count);
+               stall_summary_metric_index, stall_count);
     }
 
     // stall reason specific metric
     gpu_metrics_attribute_metric_int(stall_metrics, 
-				     stall_kind_metric_index, stall_count);
+             stall_kind_metric_index, stall_count);
   }
 }
 
@@ -321,16 +321,16 @@ gpu_metrics_attribute_pc_sampling_info
   
   // OK to use set here because sampling cycle is changed during execution
   hpcrun_metric_std_set(METRIC_ID(GPU_SAMPLE_PERIOD), metrics,
-			(cct_metric_data_t){.i = s->samplingPeriodInCycles});
+      (cct_metric_data_t){.i = s->samplingPeriodInCycles});
   
   gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_SAMPLE_TOTAL), 
-				   s->totalSamples);
+           s->totalSamples);
   
   gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_SAMPLE_EXPECTED), 
-				   s->fullSMSamples);
+           s->fullSMSamples);
 
   gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_SAMPLE_DROPPED), 
-				   s->droppedSamples);
+           s->droppedSamples);
 }
 
 
@@ -350,7 +350,7 @@ gpu_metrics_attribute_mem_op
   gpu_metrics_attribute_metric_int(bytes_metrics, bytes_metric_index, m->bytes);
 
   gpu_metrics_attribute_metric_time_interval(cct_node, time_metric_index, 
-					     (gpu_interval_t *) m);
+               (gpu_interval_t *) m);
 
   metric_data_list_t *count_metrics = 
     hpcrun_reify_metric_set(cct_node, count_metric_index);
@@ -374,7 +374,7 @@ gpu_metrics_attribute_memory
   int count_metric_index = METRIC_ID(GMEM)[GPU_MEM_COUNT];
 
   gpu_metrics_attribute_mem_op(cct_node, bytes_metric_index, 
-			       METRIC_ID(GPU_TIME_MEM), count_metric_index, (gpu_mem_t *) m);
+             METRIC_ID(GPU_TIME_MEM), count_metric_index, (gpu_mem_t *) m);
 }
 
 
@@ -392,7 +392,7 @@ gpu_metrics_attribute_memcpy
   int count_metric_index = METRIC_ID(GXCOPY)[GPU_MEMCPY_COUNT];
 
   gpu_metrics_attribute_mem_op(cct_node, bytes_metric_index, 
-			       METRIC_ID(GPU_TIME_XCOPY), count_metric_index, (gpu_mem_t *) m);
+             METRIC_ID(GPU_TIME_XCOPY), count_metric_index, (gpu_mem_t *) m);
 }
 
 
@@ -410,7 +410,7 @@ gpu_metrics_attribute_memset
   int count_metric_index = METRIC_ID(GMSET)[GPU_MEM_COUNT];
 
   gpu_metrics_attribute_mem_op(cct_node, bytes_metric_index, 
-			       METRIC_ID(GPU_TIME_MSET), count_metric_index, (gpu_mem_t *) m);
+             METRIC_ID(GPU_TIME_MSET), count_metric_index, (gpu_mem_t *) m);
 }
 
 
@@ -428,28 +428,28 @@ gpu_metrics_attribute_kernel
       hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_KINFO_STMEM_ACUMU));
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_STMEM_ACUMU), 
-				     k->staticSharedMemory);
+             k->staticSharedMemory);
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_DYMEM_ACUMU), 
-				     k->dynamicSharedMemory);
+             k->dynamicSharedMemory);
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_LMEM_ACUMU), 
-				     k->localMemoryTotal);
+             k->localMemoryTotal);
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_FGP_ACT_ACUMU), 
-				     k->activeWarpsPerSM);
+             k->activeWarpsPerSM);
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_FGP_MAX_ACUMU), 
-				     k->maxActiveWarpsPerSM);
+             k->maxActiveWarpsPerSM);
   
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_REGISTERS_ACUMU), 
-				     k->threadRegisters);
+             k->threadRegisters);
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_BLK_THREADS_ACUMU), 
-				     k->blockThreads);
+             k->blockThreads);
 
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_BLK_SMEM_ACUMU), 
-				     k->blockSharedMemory);
+             k->blockSharedMemory);
 
     // number of kernel launches
     gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_KINFO_COUNT), 1);
@@ -457,24 +457,23 @@ gpu_metrics_attribute_kernel
   
   // kernel execution time
   gpu_metrics_attribute_metric_time_interval(cct_node, METRIC_ID(GPU_TIME_KER), 
-				     (gpu_interval_t *) k);
+             (gpu_interval_t *) k);
 }
 
 
 static void
 gpu_metrics_attribute_kernel_block
 (
-	gpu_activity_t *activity
+ gpu_activity_t *activity
 )
 {
   gpu_kernel_block_t *b = &(activity->details.kernel_block);
   cct_node_t *cct_node = activity->cct_node;
 
-	metric_data_list_t *metrics = 
-		hpcrun_reify_metric_set(cct_node, METRIC_ID(KER_BLK_EXECUTION_COUNT));
+  metric_data_list_t *metrics = 
+    hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_INST_ALL));
 
-	gpu_metrics_attribute_metric_int(metrics, METRIC_ID(KER_BLK_EXECUTION_COUNT),	// need to increment execution count for existing ccts
-					 b->execution_count);
+  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_INST_ALL), b->execution_count);
 }
 
 
@@ -490,11 +489,11 @@ gpu_metrics_attribute_synchronization
   int sync_kind_metric_id = METRIC_ID(GSYNC)[s->syncKind];
 
   gpu_metrics_attribute_metric_time_interval(cct_node, sync_kind_metric_id, 
-				     (gpu_interval_t *) s);
+             (gpu_interval_t *) s);
 
   gpu_metrics_attribute_metric_time_interval(cct_node, 
-					     METRIC_ID(GPU_TIME_SYNC), 
-					     (gpu_interval_t *) s);
+               METRIC_ID(GPU_TIME_SYNC), 
+               (gpu_interval_t *) s);
 
   int count_metric_index = METRIC_ID(GSYNC)[GPU_SYNC_COUNT];
   
@@ -527,7 +526,7 @@ gpu_metrics_attribute_global_access
     METRIC_ID(GMEM)[GPU_GMEM_LD_CACHED_L2TRANS_THEOR + type];
 
   gpu_metrics_attribute_metric_int(metrics, l2t_theoretical_index, 
-				   g->theoreticalL2Transactions);
+           g->theoreticalL2Transactions);
 
   int bytes_index = METRIC_ID(GMEM)[GPU_GMEM_LD_CACHED_BYTES + type];
   gpu_metrics_attribute_metric_int(metrics, bytes_index, g->bytes);
@@ -551,11 +550,11 @@ gpu_metrics_attribute_local_access
     hpcrun_reify_metric_set(cct_node, lmem_trans_index);
 
   gpu_metrics_attribute_metric_int(metrics, lmem_trans_index, 
-				   l->sharedTransactions);
+           l->sharedTransactions);
   
   int lmem_trans_theor_index = METRIC_ID(GLMEM)[GPU_LMEM_LD_TRANS_THEOR + type];
   gpu_metrics_attribute_metric_int(metrics, lmem_trans_theor_index, 
-				   l->theoreticalSharedTransactions);
+           l->theoreticalSharedTransactions);
   
   int bytes_index = METRIC_ID(GLMEM)[GPU_LMEM_LD_BYTES + type];
   gpu_metrics_attribute_metric_int(metrics, bytes_index, l->bytes);
@@ -575,10 +574,10 @@ gpu_metrics_attribute_branch
     hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_BR_DIVERGED));
 
   gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_BR_DIVERGED), 
-				   b->diverged);
+           b->diverged);
 
   gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_BR_EXECUTED), 
-				   b->executed);
+           b->executed);
 }
 
 
@@ -617,9 +616,9 @@ gpu_metrics_attribute
     gpu_metrics_attribute_kernel(activity);
     break;
 
-	case GPU_ACTIVITY_KERNEL_BLOCK:
-		gpu_metrics_attribute_kernel_block(activity);
-		break;
+  case GPU_ACTIVITY_KERNEL_BLOCK:
+    gpu_metrics_attribute_kernel_block(activity);
+    break;
     
   case GPU_ACTIVITY_SYNCHRONIZATION:
     gpu_metrics_attribute_synchronization(activity);
@@ -743,26 +742,6 @@ gpu_metrics_KINFO_enable
 }
 
 
-void
-gpu_metrics_KER_BLKINFO_enable
-(
- void
-)
-{
-// kernel block characteristics metrics
-#undef CURRENT_METRIC 
-#define CURRENT_METRIC GPU_INST // we are copying from gpu_metrics_GPU_INST_enable(). confirm 
-
-  INITIALIZE_METRIC_KIND();
-
-	COPY_METRIC_TO_SIBLINGS_IN_BBLOCK(GPU_INST_ALL);
-
-  FORALL_GPU_INST(INITIALIZE_SCALAR_METRIC_INT)
-
-  FINALIZE_METRIC_KIND();
-}
-
-
 void
 gpu_metrics_GICOPY_enable
 (
@@ -861,7 +840,7 @@ gpu_metrics_GSAMP_enable
   char *util_formula = hpcrun_malloc_safe(sizeof(char) * MAX_CHAR_FORMULA);
 
   sprintf(util_formula, "min(100, max(0, 100*#%d/#%d))", METRIC_ID(GPU_SAMPLE_TOTAL), 
-	  METRIC_ID(GPU_SAMPLE_EXPECTED));
+    METRIC_ID(GPU_SAMPLE_EXPECTED));
 
   util_metric->formula = util_formula;
   util_metric->format  = FORMAT_DISPLAY_PERCENTAGE;
@@ -902,7 +881,7 @@ gpu_metrics_GPU_INST_STALL_enable
   FORALL_GPU_INST_STALL(HIDE_INDEXED_METRIC);
 
   SET_DISPLAY_INDEXED_METRIC(GPU_INST_STALL_ANY, GPU_INST_STALL_ANY, 
-			     HPCRUN_FMT_METRIC_SHOW);
+           HPCRUN_FMT_METRIC_SHOW);
 
   FINALIZE_METRIC_KIND();
 }
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index d466cbeeba..1205df0d40 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -413,17 +413,6 @@ gpu_metrics_KINFO_enable
 );
 
 
-//--------------------------------------------------
-// record INTEL GTPIN kernel instrumentation info
-//--------------------------------------------------
-
-void
-gpu_metrics_KER_BLKINFO_enable
-(
- void
-);
-
-
 //--------------------------------------------------
 // record implicit copy metrics for unified memory
 //--------------------------------------------------
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
deleted file mode 100644
index d0af48e6c2..0000000000
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.c
+++ /dev/null
@@ -1,125 +0,0 @@
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <string.h>
-#include <assert.h>
-
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <hpcrun/gpu/gpu-splay-allocator.h>
-#include <lib/prof-lean/splay-uint64.h>
-
-#include "gtpin-instrumentation-kernel-data-map.h"
-
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-#define kdm_insert																					\
-  typed_splay_insert(kernel_data_map)
-
-#define kdm_lookup																					\
-  typed_splay_lookup(kernel_data_map)
-
-#define kdm_delete																					\
-  typed_splay_delete(kernel_data_map)
-
-#define kdm_forall																					\
-  typed_splay_forall(kernel_data_map)
-
-#define kdm_count																						\
-  typed_splay_count(kernel_data_map)
-
-#define kdm_alloc(free_list)																\
-  typed_splay_alloc(free_list, kernel_data_map_t)
-
-#define kdm_free(free_list, node)														\
-  typed_splay_free(free_list, node)
-
-typed_splay_impl(kernel_data_map);
-
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-static kernel_data_map_t *kernel_data_map_root = NULL;
-static kernel_data_map_t *kernel_data_map_free_list = NULL;
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static kernel_data_map_t *
-kernel_data_alloc()
-{
-  return kdm_alloc(&kernel_data_map_free_list);
-}
-
-
-static kernel_data_map_t *
-kernel_data_new
-(
-	uint64_t GTPinKernel_id,
-	KernelData data
-)
-{
-  kernel_data_map_t *e = kernel_data_alloc();
-  memset(e, 0, sizeof(kernel_data_map_t)); 
-  e->GTPinKernel_id = GTPinKernel_id;
-  e->data = data;
-  return e;
-}
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-kernel_data_map_t*
-kernel_data_map_lookup1
-(
-	uint64_t GTPinKernel_id
-)
-{
-  kernel_data_map_t *result = kdm_lookup(&kernel_data_map_root, GTPinKernel_id);
-	return result;
-}
-
-
-void
-kernel_data_map_insert1
-(
-	uint64_t GTPinKernel_id,
-	KernelData data
-)
-{
-	if (kdm_lookup(&kernel_data_map_root, GTPinKernel_id)) {
-		assert(0);	// entry for a given key should be inserted only once
-	} else {
-		kernel_data_map_t *entry = kernel_data_new(GTPinKernel_id, data);
-		kdm_insert(&kernel_data_map_root, entry);	
-	}
-}
-
-
-void
-kernel_data_map_delete1
-(
-	uint64_t GTPinKernel_id
-)
-{
-	kernel_data_map_t *node = kdm_delete(&kernel_data_map_root, GTPinKernel_id);
-	kdm_free(&kernel_data_map_free_list, node);
-}
-
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
deleted file mode 100644
index 8e4db7f0b8..0000000000
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-data-map.h
+++ /dev/null
@@ -1,64 +0,0 @@
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <stdint.h>
-
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-typedef struct kernel_offset {
-	uint32_t offset;
-	struct kernel_offset *next;
-} kernel_offset;
-
-
-typedef struct KernelData {
-	uint32_t loadmap_module_id;
-	kernel_offset *offset_head;
-} KernelData;
-
-
-#undef typed_splay_node
-#define typed_splay_node(kernel_data_map) kernel_data_map_t
-
-
-typedef struct typed_splay_node(kernel_data_map) {
-  struct typed_splay_node(kernel_data_map) *left;
-  struct typed_splay_node(kernel_data_map) *right;
-  uint64_t GTPinKernel_id; // key
-
-	KernelData data;
-}typed_splay_node(kernel_data_map);
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-kernel_data_map_t*
-kernel_data_map_lookup1
-(
-	uint64_t
-);
-
-
-void
-kernel_data_map_insert1
-(
-	uint64_t,
-	KernelData
-);
-
-
-void
-kernel_data_map_delete1
-(
-	uint64_t
-);
-
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
deleted file mode 100644
index 003f087caa..0000000000
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.c
+++ /dev/null
@@ -1,126 +0,0 @@
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <string.h>
-
-
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <hpcrun/gpu/gpu-splay-allocator.h>
-#include <lib/prof-lean/splay-uint64.h>
-
-#include "gtpin-instrumentation-kernel-memory-map.h"
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-#define kmm_insert																					\
-  typed_splay_insert(kernel_memory_map)
-
-#define kmm_lookup																					\
-  typed_splay_lookup(kernel_memory_map)
-
-#define kmm_delete																					\
-  typed_splay_delete(kernel_memory_map)
-
-#define kmm_forall																					\
-  typed_splay_forall(kernel_memory_map)
-
-#define kmm_count																						\
-  typed_splay_count(kernel_memory_map)
-
-#define kmm_alloc(free_list)																\
-  typed_splay_alloc(free_list, kernel_memory_map_t)
-
-#define kmm_free(free_list, node)														\
-  typed_splay_free(free_list, node)
-
-typed_splay_impl(kernel_memory_map);
-
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-static kernel_memory_map_t *kernel_memory_map_root = NULL;
-static kernel_memory_map_t *kernel_memory_map_free_list = NULL;
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static kernel_memory_map_t *
-kernel_mem_alloc()
-{
-  return kmm_alloc(&kernel_memory_map_free_list);
-}
-
-
-static kernel_memory_map_t *
-kernel_mem_new
-(
-	uint64_t GTPinKernel_id,
-	mem_pair_node *head
-)
-{
-  kernel_memory_map_t *e = kernel_mem_alloc();
-  memset(e, 0, sizeof(kernel_memory_map_t)); 
-  e->GTPinKernel_id = GTPinKernel_id;
-  e->head = head;
-  return e;
-}
-
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-kernel_memory_map_t*
-kernel_memory_map_lookup1
-(
-	uint64_t GTPinKernel_id
-)
-{
-  kernel_memory_map_t *result = kmm_lookup(&kernel_memory_map_root, GTPinKernel_id);
-	return result;
-}
-
-
-void
-kernel_memory_map_insert1
-(
-	uint64_t GTPinKernel_id,
-	mem_pair_node *head
-)
-{
-	if (kmm_lookup(&kernel_memory_map_root, GTPinKernel_id)) {
-		assert(0);	// entry for a given key should be inserted only once
-	} else {
-		kernel_memory_map_t *entry = kernel_mem_new(GTPinKernel_id, head);
-		kmm_insert(&kernel_memory_map_root, entry);	
-	}
-}
-
-
-void
-kernel_memory_map_delete1
-(
-	uint64_t GTPinKernel_id
-)
-{
-	kernel_memory_map_t *node = kmm_delete(&kernel_memory_map_root, GTPinKernel_id);
-	kmm_free(&kernel_memory_map_free_list, node);
-}
-
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
deleted file mode 100644
index 90446b6b9b..0000000000
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation-kernel-memory-map.h
+++ /dev/null
@@ -1,61 +0,0 @@
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <gtpin.h>
-
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-typedef struct mem_pair_node {
-	int32_t offset;
-	int32_t endOffset;
-	GTPinMem mem;
-	struct mem_pair_node *next;
-} mem_pair_node;
-
-
-#undef typed_splay_node
-#define typed_splay_node(kernel_memory_map) kernel_memory_map_t
-
-
-typedef struct typed_splay_node(kernel_memory_map) {
-  struct typed_splay_node(kernel_memory_map) *left;
-  struct typed_splay_node(kernel_memory_map) *right;
-  uint64_t GTPinKernel_id; // key
-
-	mem_pair_node *head;
-
-} typed_splay_node(kernel_memory_map);
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-kernel_memory_map_t*
-kernel_memory_map_lookup1
-(
-	uint64_t
-);
-
-
-void
-kernel_memory_map_insert1
-(
-	uint64_t,
-	mem_pair_node *
-);
-
-
-void
-kernel_memory_map_delete1
-(
-	uint64_t
-);
-
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 01f3cf17af..0c3a31175d 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -80,7 +80,8 @@
 #include <lib/prof-lean/spinlock.h>
 
 #include "gtpin-instrumentation.h"
-
+#include "kernel-data.h"
+#include "kernel-data-map.h"
 
 //******************************************************************************
 // local data
@@ -89,7 +90,7 @@
 #define MAX_STR_SIZE 1024
 
 // TODO(Aaron): Why there are so many correlation ids
-static atomic_long correlation_id;
+static atomic_ullong correlation_id;
 
 static spinlock_t files_lock = SPINLOCK_UNLOCKED;
 
@@ -120,11 +121,11 @@ initializeInstrumentation
  void
 )
 {
-  atomic_store(&correlation_id, 5000);  // to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 5000 (TODO(Aaron):FIX)
+  atomic_store(&correlation_id, 100000000);  // to avoid conflict with opencl operation correlation ids, we start instrumentation ids with 100000000 (TODO(Aaron):FIX)
 }
 
 
-static uint32_t
+static uint64_t
 getCorrelationId
 (
  void
@@ -141,7 +142,6 @@ createKernelNode
 )
 {
   cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
-  gpu_correlation_id_map_insert(correlation_id, correlation_id);
 
   gpu_op_ccts_t gpu_op_ccts;
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
@@ -199,7 +199,7 @@ computeBinaryHash
 {
   // Compute hash for the binary
   unsigned char hash[HASH_LENGTH];
-  crypto_hash_compute(binary, binary_size, hash, HASH_LENGTH);
+  crypto_hash_compute((const unsigned char *)binary, binary_size, hash, HASH_LENGTH);
 
   size_t i;
   size_t used = 0;
@@ -278,14 +278,14 @@ static void
 kernelBlockActivityTranslate
 (
  gpu_activity_t *ga,
- uint32_t correlation_id,
+ uint64_t correlation_id,
  uint32_t loadmap_module_id,
  uint64_t offset,
  uint64_t execution_count
 )
 {
   memset(&ga->details.kernel_block, 0, sizeof(gpu_kernel_block_t));
-  ga->details.kernel_block.correlation_id = correlation_id;
+  ga->details.kernel_block.external_id = correlation_id;
   ga->details.kernel_block.pc.lm_id = (uint16_t)loadmap_module_id;
   ga->details.kernel_block.pc.lm_ip = (uintptr_t)offset;
   ga->details.kernel_block.execution_count = execution_count;
@@ -298,15 +298,15 @@ kernelBlockActivityTranslate
 static void
 kernelBlockActivityProcess
 (
- gpu_activity_t *ga,
- uint32_t correlation_id,
+ uint64_t correlation_id,
  uint32_t loadmap_module_id,
  uint64_t offset,
  uint64_t execution_count
 )
 {
-  kernelBlockActivityTranslate(ga, correlation_id, loadmap_module_id, offset, execution_count);
-  gpu_activity_process(ga);
+  gpu_activity_t ga;
+  kernelBlockActivityTranslate(&ga, correlation_id, loadmap_module_id, offset, execution_count);
+  gpu_activity_process(&ga);
 }
 
 
@@ -319,17 +319,14 @@ onKernelBuild
 {
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
 
-  assert(kernel_memory_map_lookup1((uint64_t)kernel) == 0);
-  assert(kernel_data_map_lookup1((uint64_t)kernel) == 0);
+  assert(kernel_data_map_lookup((uint64_t)kernel) == 0);
+
+  kernel_data_t kernel_data;
+  kernel_data.loadmap_module_id = findOrAddKernelModule(kernel);
+  kernel_data.kind = KERNEL_DATA_GTPIN;
 
-  KernelData data;
-  data.loadmap_module_id = findOrAddKernelModule(kernel);
-	
-	kernel_offset *offset_head = NULL;
-  mem_pair_node *h;
-  mem_pair_node *m_current;
-  kernel_offset *k_current;
-  bool isHeadNull = true;
+  kernel_data_gtpin_block_t *gtpin_block_head = NULL;
+  kernel_data_gtpin_block_t *gtpin_block_curr = NULL;
 
   for (GTPinBBL block = GTPin_BBLHead(kernel); GTPin_BBLValid(block); block = GTPin_BBLNext(block)) {
     GTPinINS head = GTPin_InsHead(block);
@@ -344,47 +341,45 @@ onKernelBuild
     status = GTPin_OpcodeprofInstrument(head, mem);
     assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-    // TODO(Aaron): when using hpcrun_malloc, find a way to recycle memory
-    mem_pair_node *m = hpcrun_malloc(sizeof(mem_pair_node));
-    m->offset = head_offset;
-    m->endOffset = tail_offset;
-    m->mem = mem;
-    m->next = NULL;
-
-    if (isHeadNull == true) {
-      h = m;
-      m_current = m;
-      isHeadNull = false;
+    kernel_data_gtpin_block_t *gtpin_block = (kernel_data_gtpin_block_t *)hpcrun_malloc(sizeof(kernel_data_gtpin_block_t));
+    gtpin_block->head_offset = head_offset;
+    gtpin_block->tail_offset = tail_offset;
+    gtpin_block->mem = mem;
+    gtpin_block->inst = NULL;  // stays NULL when the loop below adds no instruction
+    gtpin_block->next = NULL;
+    if (gtpin_block_head == NULL) {
+      gtpin_block_head = gtpin_block;
     } else {
-      m_current->next = m;
-      m_current = m_current->next;
+      gtpin_block_curr->next = gtpin_block;
+    }
+    gtpin_block_curr = gtpin_block;
+    // record the offset of each instruction in the block as a NULL-terminated list; onKernelComplete walks it until NULL
+    int32_t offset = head_offset;
+    GTPinINS inst = GTPin_InsHead(block);
+    kernel_data_gtpin_inst_t *gtpin_inst_curr = NULL;
+    while (offset <= tail_offset && offset != -1) {
+      kernel_data_gtpin_inst_t *gtpin_inst = (kernel_data_gtpin_inst_t *)hpcrun_malloc(sizeof(kernel_data_gtpin_inst_t));
+      gtpin_inst->offset = offset;
+      gtpin_inst->next = NULL;  // terminate the list; the next iteration relinks it if another instruction follows
+      if (gtpin_inst_curr == NULL) {
+        gtpin_block_curr->inst = gtpin_inst;
+      } else {
+        gtpin_inst_curr->next = gtpin_inst;
+      }
+      gtpin_inst_curr = gtpin_inst;
+      inst = GTPin_InsNext(inst);
+      offset = GTPin_InsOffset(inst);
     }
-		
-		// while loop that iterates for each instruction in the block and adds an offset entry in map
-		int32_t offset = head_offset;
-		GTPinINS inst = GTPin_InsHead(block);
-		int count = 0;
-		while (offset <= tail_offset && offset != -1) {
-			kernel_offset *ko = hpcrun_malloc(sizeof(kernel_offset));
-			ko->offset = offset;
-			if (offset_head == NULL) {
-				offset_head = ko;	
-				k_current = ko;
-			} else {
-				k_current->next = ko;
-				k_current = k_current->next;
-			}
-			inst = GTPin_InsNext(inst);
-			offset = GTPin_InsOffset(inst);
-		}
   }
-	data.offset_head = offset_head;
-  if (h != NULL) {
-    // TODO(Aaron): naming insert1/insert2 is confusing
-    kernel_memory_map_insert1((uint64_t)kernel, h);
-		kernel_data_map_insert1((uint64_t)kernel, data);
+
+  if (gtpin_block_head != NULL) {
+    kernel_data_gtpin_t *kernel_data_gtpin = (kernel_data_gtpin_t *)hpcrun_malloc(sizeof(kernel_data_gtpin_t));
+    kernel_data_gtpin->kernel_id = (uint64_t)kernel;
+    kernel_data_gtpin->block = gtpin_block_head;
+    kernel_data.data = kernel_data_gtpin; 
+    kernel_data_map_insert((uint64_t)kernel, kernel_data);
   }
-  // m->next = NULL;
+
   // add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
   ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
 }
@@ -401,30 +396,7 @@ onKernelRun
   GTPin_KernelProfilingActive(kernelExec, 1);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
 
-  GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
-  kernel_offset *offset_head = kernel_data_map_lookup1((uint64_t)kernel)->data.offset_head;
-	kernel_offset *current = offset_head;
-
-	kernel_runs_correlation_offset *kco_head, *co_current;
-	kco_head = hpcrun_malloc(sizeof(kernel_runs_correlation_offset));
-	co_current = kco_head;
-	uint32_t correlation_id = getCorrelationId();
-	createKernelNode(correlation_id);
-	kco_head->correlation_id = correlation_id;
-	kco_head->offset = offset_head->offset;
-
-	while (current->next != NULL) {
-		current = current->next;
-		correlation_id = getCorrelationId();
-		createKernelNode(correlation_id);
-		// save id=GTPinKernelExec and value=correlation_id in another map
-		kernel_runs_correlation_offset *kco = hpcrun_malloc(sizeof(kernel_runs_correlation_offset));
-		kco->correlation_id = correlation_id;
-		kco->offset = current->offset;
-		co_current->next = kco;
-		co_current = co_current->next;
-	}
-	kernel_correlation_offset_map_insert1((uint64_t)kernelExec, kco_head);
+  createKernelNode((uint64_t)kernelExec);
 }
 
 
@@ -435,21 +407,22 @@ onKernelComplete
  void *v
 )
 {
+  // Receive correlations from the host thread
+  activityNotify();  
+
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
   ETMSG(OPENCL, "onKernelComplete starting. Lookup: key: %"PRIu64 "",(uint64_t)kernel);
-  assert(kernel_data_map_lookup1((uint64_t)kernel) != 0);
-  assert(kernel_memory_map_lookup1((uint64_t)kernel) != 0);
+  assert(kernel_data_map_lookup((uint64_t)kernel) != 0);
 
-  // TODO(Aaron): rename lookup methods, do not use magic numbers
-  kernel_data_map_t *kernel_data_list = kernel_data_map_lookup1((uint64_t)kernel);
-  KernelData data = kernel_data_list->data;
-  kernel_memory_map_t *kernel_memory_list = kernel_memory_map_lookup1((uint64_t)kernel);
-  mem_pair_node *block = kernel_memory_list->head;
+  kernel_data_map_entry_t *kernel_data_map_entry = kernel_data_map_lookup((uint64_t)kernel);
+  assert(kernel_data_map_entry != NULL);
 
-	kernel_runs_correlation_offset *kco_head = kernel_correlation_offset_map_lookup1((uint64_t)kernelExec)->head;
-	kernel_runs_correlation_offset *kco_curr = kco_head;
-  uint32_t correlation_id = kco_curr->correlation_id;
+  kernel_data_t kernel_data = kernel_data_map_entry_kernel_data_get(kernel_data_map_entry);
+  assert(kernel_data.kind == KERNEL_DATA_GTPIN);
+
+  kernel_data_gtpin_t *kernel_data_gtpin = (kernel_data_gtpin_t *)kernel_data.data; 
+  kernel_data_gtpin_block_t *block = kernel_data_gtpin->block;
 
   while (block != NULL) {
     uint32_t thread_count = GTPin_MemSampleLength(block->mem);
@@ -463,26 +436,21 @@ onKernelComplete
     }
     uint64_t execution_count = total; // + bm->val 
 
-    gpu_activity_t gpu_activity;
-		activityNotify();  
-		while(kco_curr->offset != block->endOffset) {
-			gpu_activity_t gpu_activity;
-	    kernelBlockActivityProcess(&gpu_activity, kco_curr->correlation_id,
-						data.loadmap_module_id, kco_curr->offset, execution_count);
-			kco_curr = kco_curr->next;
-		}
+    kernel_data_gtpin_inst_t *inst = block->inst;
+    while (inst != NULL) {
+      kernelBlockActivityProcess((uint64_t)kernelExec, kernel_data.loadmap_module_id,
+        inst->offset, execution_count);
+      inst = inst->next;
+    }
     block = block->next;
     //how to make offset the primary key within the cct and += the execution value for existing ccts?
   }
 }
 
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
-
 void
 gtpin_enable_profiling
 (
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
index 00f3171fc3..8efbd6db3f 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
@@ -41,14 +41,13 @@
 //
 // ******************************************************* EndRiceCopyright *
 
+#ifndef gpu_instrumentation_gtpin_instrumentation_h
+#define gpu_instrumentation_gtpin_instrumentation_h
+
 //******************************************************************************
 // local includes
 //******************************************************************************
 
-#include "gtpin-instrumentation-kernel-memory-map.h"
-#include "gtpin-instrumentation-kernel-data-map.h"
-#include "kernel_runs_correlation_offset_map.h"
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -58,3 +57,5 @@ gtpin_enable_profiling
 (
  void
 );
+
+#endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel-data-map.c b/src/tool/hpcrun/gpu/instrumentation/kernel-data-map.c
new file mode 100644
index 0000000000..80d186f467
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/kernel-data-map.c
@@ -0,0 +1,159 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <string.h>
+#include <assert.h>
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+
+#include "kernel-data-map.h"
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+#define kd_insert \
+  typed_splay_insert(kernel_data)
+
+#define kd_lookup \
+  typed_splay_lookup(kernel_data)
+
+#define kd_delete \
+  typed_splay_delete(kernel_data)
+
+#define kd_forall \
+  typed_splay_forall(kernel_data)
+
+#define kd_count \
+  typed_splay_count(kernel_data)
+
+#define kd_alloc(free_list) \
+  typed_splay_alloc(free_list, kernel_data_map_entry_t)
+
+#define kd_free(free_list, node) \
+  typed_splay_free(free_list, node)
+
+#undef typed_splay_node
+#define typed_splay_node(kernel_data) kernel_data_map_entry_t
+
+// map entry: a splay tree node that carries one kernel's data
+typedef struct typed_splay_node(kernel_data) {
+  struct typed_splay_node(kernel_data) *left;
+  struct typed_splay_node(kernel_data) *right;
+  uint64_t kernel_id; // key
+
+  kernel_data_t kernel_data;
+} typed_splay_node(kernel_data);
+
+typed_splay_impl(kernel_data);
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static kernel_data_map_entry_t *kernel_data_map_root = NULL;
+static kernel_data_map_entry_t *kernel_data_map_free_list = NULL;
+
+// serializes every access to the tree root and the free list
+static spinlock_t kernel_data_map_lock = SPINLOCK_UNLOCKED;
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+// obtain a node through the free list; caller holds kernel_data_map_lock
+static kernel_data_map_entry_t *
+kernel_data_alloc
+(
+ void
+)
+{
+  return kd_alloc(&kernel_data_map_free_list);
+}
+
+
+// allocate an entry and fill in its key and payload; caller holds kernel_data_map_lock
+static kernel_data_map_entry_t *
+kernel_data_new
+(
+ uint64_t kernel_id,
+ kernel_data_t kernel_data
+)
+{
+  kernel_data_map_entry_t *e = kernel_data_alloc();
+  e->kernel_id = kernel_id;
+  e->kernel_data = kernel_data;
+  return e;
+}
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// return the entry stored under kernel_id; callers test the result against NULL
+kernel_data_map_entry_t*
+kernel_data_map_lookup
+(
+ uint64_t kernel_id
+)
+{
+  spinlock_lock(&kernel_data_map_lock);
+  kernel_data_map_entry_t *result = kd_lookup(&kernel_data_map_root, kernel_id);
+  spinlock_unlock(&kernel_data_map_lock);
+  return result;
+}
+
+
+// store kernel_data under kernel_id; a key may be inserted only once
+void
+kernel_data_map_insert
+(
+ uint64_t kernel_id,
+ kernel_data_t kernel_data
+)
+{
+  // the duplicate check goes through the shared root, so it runs under the
+  // lock just like the lookup in kernel_data_map_lookup
+  spinlock_lock(&kernel_data_map_lock);
+  if (kd_lookup(&kernel_data_map_root, kernel_id)) {
+    assert(0);  // entry for a given key should be inserted only once
+  } else {
+    kernel_data_map_entry_t *entry = kernel_data_new(kernel_id, kernel_data);
+    kd_insert(&kernel_data_map_root, entry);
+  }
+  spinlock_unlock(&kernel_data_map_lock);
+}
+
+
+// remove the entry stored under kernel_id, if any, and recycle its node
+void
+kernel_data_map_delete
+(
+ uint64_t kernel_id
+)
+{
+  spinlock_lock(&kernel_data_map_lock);
+  kernel_data_map_entry_t *node = kd_delete(&kernel_data_map_root, kernel_id);
+  if (node != NULL) {
+    kd_free(&kernel_data_map_free_list, node);
+  }
+  spinlock_unlock(&kernel_data_map_lock);
+}
+
+
+// return a copy of the kernel data held by entry
+kernel_data_t
+kernel_data_map_entry_kernel_data_get
+(
+ kernel_data_map_entry_t *entry
+)
+{
+  return entry->kernel_data;
+}
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel-data-map.h b/src/tool/hpcrun/gpu/instrumentation/kernel-data-map.h
new file mode 100644
index 0000000000..7c91f0fd73
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/kernel-data-map.h
@@ -0,0 +1,45 @@
+#ifndef gpu_instrumentation_kernel_data_map_h
+#define gpu_instrumentation_kernel_data_map_h
+
+//******************************************************************************
+// system and local includes
+//******************************************************************************
+
+#include <stdint.h>
+#include "kernel-data.h"
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+typedef struct kernel_data_map_entry_t kernel_data_map_entry_t;
+// Find the entry stored for kernel_id; presumably NULL when there is none (TODO confirm).
+kernel_data_map_entry_t*
+kernel_data_map_lookup
+(
+ uint64_t kernel_id
+);
+
+// Add an entry mapping kernel_id to a copy of kernel_data; asserts if the key is already present.
+void
+kernel_data_map_insert
+(
+ uint64_t kernel_id,
+ kernel_data_t kernel_data
+);
+
+// Remove the entry for kernel_id from the map and hand its node to the map's free list.
+void
+kernel_data_map_delete
+(
+ uint64_t kernel_id
+);
+
+// Return, by value, the kernel_data_t stored in entry (entry must not be NULL).
+kernel_data_t
+kernel_data_map_entry_kernel_data_get
+(
+ kernel_data_map_entry_t *entry
+);
+
+#endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel-data.h b/src/tool/hpcrun/gpu/instrumentation/kernel-data.h
new file mode 100644
index 0000000000..e7e63f9481
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/kernel-data.h
@@ -0,0 +1,34 @@
+#ifndef gpu_instrumentation_kernel_data_h
+#define gpu_instrumentation_kernel_data_h
+#include <stdint.h>  // int32_t, uint32_t, uint64_t used by the structs below
+#include <gtpin.h>   // GTPinMem
+
+typedef enum {
+  KERNEL_DATA_GTPIN
+} kernel_data_kind_t;  // tag stored in kernel_data_t.kind
+
+typedef struct kernel_data_gtpin_inst {
+  int32_t offset;
+  struct kernel_data_gtpin_inst *next;  // link to another inst record
+} kernel_data_gtpin_inst_t;
+
+typedef struct kernel_data_gtpin_block {
+  int32_t head_offset;
+  int32_t tail_offset;
+  GTPinMem mem;
+  struct kernel_data_gtpin_inst *inst;  // inst records attached to this block
+  struct kernel_data_gtpin_block *next;  // link to another block record
+} kernel_data_gtpin_block_t;
+
+typedef struct kernel_data_gtpin {
+  uint64_t kernel_id;
+  struct kernel_data_gtpin_block *block;  // block records attached to this kernel
+} kernel_data_gtpin_t;
+
+typedef struct kernel_data {
+  uint32_t loadmap_module_id;
+  kernel_data_kind_t kind;
+  void *data;  // presumably a kernel_data_gtpin_t * when kind is KERNEL_DATA_GTPIN -- TODO confirm
+} kernel_data_t;
+
+#endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c b/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c
deleted file mode 100644
index d7160eeab4..0000000000
--- a/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.c
+++ /dev/null
@@ -1,125 +0,0 @@
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <string.h>
-#include <assert.h>
-
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <hpcrun/gpu/gpu-splay-allocator.h>
-#include <lib/prof-lean/splay-uint64.h>
-
-#include "kernel_runs_correlation_offset_map.h"
-
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-#define kco_insert																					\
-  typed_splay_insert(kernel_correlation_offset_map)
-
-#define kco_lookup																					\
-  typed_splay_lookup(kernel_correlation_offset_map)
-
-#define kco_delete																					\
-  typed_splay_delete(kernel_correlation_offset_map)
-
-#define kco_forall																					\
-  typed_splay_forall(kernel_correlation_offset_map)
-
-#define kco_count																						\
-  typed_splay_count(kernel_correlation_offset_map)
-
-#define kco_alloc(free_list)																\
-  typed_splay_alloc(free_list, kernel_correlation_offset_map_t)
-
-#define kco_free(free_list, node)														\
-  typed_splay_free(free_list, node)
-
-typed_splay_impl(kernel_correlation_offset_map);
-
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-static kernel_correlation_offset_map_t *kernel_correlation_offset_map_root = NULL;
-static kernel_correlation_offset_map_t *kernel_correlation_offset_map_free_list = NULL;
-
-
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static kernel_correlation_offset_map_t *
-kernel_data_alloc()
-{
-  return kco_alloc(&kernel_correlation_offset_map_free_list);
-}
-
-
-static kernel_correlation_offset_map_t *
-kernel_data_new
-(
-	uint64_t GTPinKernelExec_id,
-	kernel_runs_correlation_offset *data
-)
-{
-  kernel_correlation_offset_map_t *e = kernel_data_alloc();
-  memset(e, 0, sizeof(kernel_correlation_offset_map_t)); 
-  e->GTPinKernelExec_id = GTPinKernelExec_id;
-  e->head = data;
-  return e;
-}
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-kernel_correlation_offset_map_t*
-kernel_correlation_offset_map_lookup1
-(
-	uint64_t GTPinKernelExec_id
-)
-{
-  kernel_correlation_offset_map_t *result = kco_lookup(&kernel_correlation_offset_map_root, GTPinKernelExec_id);
-	return result;
-}
-
-
-void
-kernel_correlation_offset_map_insert1
-(
-	uint64_t GTPinKernelExec_id,
-	kernel_runs_correlation_offset *data
-)
-{
-	if (kco_lookup(&kernel_correlation_offset_map_root, GTPinKernelExec_id)) {
-		assert(0);	// entry for a given key should be inserted only once
-	} else {
-		kernel_correlation_offset_map_t *entry = kernel_data_new(GTPinKernelExec_id, data);
-		kco_insert(&kernel_correlation_offset_map_root, entry);	
-	}
-}
-
-
-void
-kernel_correlation_offset_map_delete1
-(
-	uint64_t GTPinKernelExec_id
-)
-{
-	kernel_correlation_offset_map_t *node = kco_delete(&kernel_correlation_offset_map_root, GTPinKernelExec_id);
-	kco_free(&kernel_correlation_offset_map_free_list, node);
-}
-
diff --git a/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h b/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h
deleted file mode 100644
index 025bd061b7..0000000000
--- a/src/tool/hpcrun/gpu/instrumentation/kernel_runs_correlation_offset_map.h
+++ /dev/null
@@ -1,59 +0,0 @@
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <stdint.h>
-
-
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-typedef struct kernel_runs_correlation_offset {
-	int32_t offset;
-	uint32_t correlation_id;
-	struct kernel_runs_correlation_offset *next;
-} kernel_runs_correlation_offset;
-
-
-#undef typed_splay_node
-#define typed_splay_node(kernel_correlation_offset_map) kernel_correlation_offset_map_t
-
-
-typedef struct typed_splay_node(kernel_correlation_offset_map) {
-  struct typed_splay_node(kernel_correlation_offset_map) *left;
-  struct typed_splay_node(kernel_correlation_offset_map) *right;
-  uint64_t GTPinKernelExec_id; // key
-
-	kernel_runs_correlation_offset *head;
-}typed_splay_node(kernel_correlation_offset_map);
-
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-kernel_correlation_offset_map_t*
-kernel_correlation_offset_map_lookup1
-(
-	uint64_t
-);
-
-
-void
-kernel_correlation_offset_map_insert1
-(
-	uint64_t,
-	kernel_runs_correlation_offset *
-);
-
-
-void
-kernel_correlation_offset_map_delete1
-(
-	uint64_t
-);
-

From d15ab0c7a40c66c8de3a2a8b5418ecc977e7b12d Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 29 Sep 2020 19:38:41 -0500
Subject: [PATCH 053/177] Tracing is working: gpu_trace_fini is called from
 Monitoring thread, which is finalized from opencl_api_finalize

---
 src/tool/hpcrun/gpu/gpu-activity-multiplexer.c | 5 ++++-
 src/tool/hpcrun/gpu/gpu-operation-channel.c    | 9 +++++++--
 src/tool/hpcrun/gpu/gpu-trace-channel.c        | 6 +++---
 src/tool/hpcrun/gpu/gpu-trace.c                | 6 ++----
 src/tool/hpcrun/ompt/ompt-region-debug.h       | 2 +-
 src/tool/hpcrun/sample-sources/opencl.c        | 2 +-
 6 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index c4984d14d7..abc80f93ad 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -70,6 +70,7 @@ typedef void *(*pthread_start_routine_t)(void *);
 //******************************************************************************
 
 static _Atomic(bool) stop_activity_flag;
+static _Atomic(bool) gpu_trace_finished;
 
 static atomic_uint stream_id;
 static __thread uint32_t my_operation_set_id = -1;
@@ -116,6 +117,7 @@ void
   }
 
   gpu_trace_fini(NULL);
+  atomic_store(&gpu_trace_finished, true);
 
   return NULL;
 }
@@ -129,6 +131,7 @@ void
 {
   pthread_t thread;
   atomic_store(&stop_activity_flag, false);
+  atomic_store(&gpu_trace_finished, false);
   atomic_store(&stream_id, 0);
 
   gpu_operation_channel_stack_alloc(max_threads_consumers);
@@ -179,7 +182,7 @@ void
     gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, set_index);
   }
 
-
+  while (!atomic_load(&gpu_trace_finished));
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index d2c566c20d..bb760fa612 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -58,11 +58,16 @@
 #include "gpu-operation-item.h"
 #include "gpu-operation-item-process.h"
 
+#define DEBUG 0
+#include "gpu-print.h"
+
 
 //******************************************************************************
 // macros
 //******************************************************************************
 
+
+
 #define CHANNEL_FILL_COUNT 100
 
 
@@ -184,7 +189,7 @@ gpu_operation_channel_produce
   gpu_operation_item_t *new_item = gpu_operation_item_alloc(channel);
   *new_item = *it;
 
-  printf("\nOPERATION_PRODUCE: channel = %p || return_channel = %p -> activity = %p | corr = %u kind = %s, type = %s\n\n",
+  PRINT("\nOPERATION_PRODUCE: channel = %p || return_channel = %p -> activity = %p | corr = %u kind = %s, type = %s\n\n",
          channel, new_item->channel, &new_item->activity,
          (new_item->activity.kind == GPU_ACTIVITY_MEMCPY)?new_item->activity.details.memcpy.correlation_id:new_item->activity.details.kernel.correlation_id,
          gpu_kind_to_string(new_item->activity.kind),
@@ -216,7 +221,7 @@ gpu_operation_channel_consume
 
     if (!it) break;
 
-    printf("\nOPERATION_CONSUME: op_channel = %p || channel = %p , activity = %p | corr = %u, kind = %s, type = %s\n",
+    PRINT("\nOPERATION_CONSUME: op_channel = %p || channel = %p , activity = %p | corr = %u, kind = %s, type = %s\n",
            channel, it->channel, &it->activity,
            (it->activity.kind == GPU_ACTIVITY_MEMCPY)?it->activity.details.memcpy.correlation_id:it->activity.details.kernel.correlation_id,
            gpu_kind_to_string(it->activity.kind),
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index f5a8b5e4ef..d593735c0e 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -55,7 +55,7 @@
 
 #define SECONDS_UNTIL_WAKEUP 2
 
-
+#define DEBUG 0
 
 //******************************************************************************
 // local includes
@@ -190,7 +190,7 @@ gpu_trace_channel_produce
 
   *cti = *ti;
 
-  printf("\n===========TRACE_PRODUCE: ti = %p || submit = %lu, start = %lu, end = %lu, cct_node = %p\n\n",
+  PRINT("\n===========TRACE_PRODUCE: ti = %p || submit = %lu, start = %lu, end = %lu, cct_node = %p\n\n",
          ti,
          ti->cpu_submit_time,
          ti->start,
@@ -223,7 +223,7 @@ gpu_trace_channel_consume
     gpu_trace_item_t *ti = channel_pop(channel, bichannel_direction_forward);
     if (!ti) break;
 
-    printf("\n===========TRACE_CONSUME: ti = %p || submit = %lu, start = %lu, end = %lu, cct_node = %p\n\n",
+    PRINT("\n===========TRACE_CONSUME: ti = %p || submit = %lu, start = %lu, end = %lu, cct_node = %p\n\n",
            ti,
            ti->cpu_submit_time,
            ti->start,
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 5684577eb8..5a9edb299c 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -76,7 +76,6 @@
 #include "gpu-trace-item.h"
 #include "gpu-trace-channel-set.h"
 #include "gpu-trace.h"
-#include "gpu-print.h"
 
 
 
@@ -85,6 +84,7 @@
 //******************************************************************************
 
 #define DEBUG 0
+#include "gpu-print.h"
 
 
 
@@ -454,10 +454,8 @@ gpu_trace_fini
     gpu_trace_channel_set_notify(set_index);
   }
 
-//  gpu_context_stream_map_signal_all();
-
-
   while (atomic_load(&active_streams_counter));
+
 }
 
 static void
diff --git a/src/tool/hpcrun/ompt/ompt-region-debug.h b/src/tool/hpcrun/ompt/ompt-region-debug.h
index 0ba14125e9..1a02b10eb6 100644
--- a/src/tool/hpcrun/ompt/ompt-region-debug.h
+++ b/src/tool/hpcrun/ompt/ompt-region-debug.h
@@ -51,7 +51,7 @@
 // macros
 //*****************************************************************************
 
-#define REGION_DEBUG 1
+#define REGION_DEBUG 0
 
 //*****************************************************************************
 // macros
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index 4d526b3798..f4691c50fb 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -157,7 +157,7 @@ METHOD_FN(finalize_event_list)
   opencl_api_initialize();
 
 
-//  // Register shutdown functions to write trace files
+//  gpu_trace_fini - finalized from opencl_api_finalize -> gpu_activity_multiplexer_fini
 //  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
 //  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
 

From 1402825a2cf4e3ef4fe1607afdb0a161b2c60f7c Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Wed, 30 Sep 2020 01:21:24 +0000
Subject: [PATCH 054/177] Use opencl runtime stack for gtpin

---
 .../instrumentation/gtpin-instrumentation.c   | 57 +++++++++++++++----
 .../instrumentation/gtpin-instrumentation.h   | 10 ++++
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  5 ++
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  7 +++
 4 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 0c3a31175d..c22c3fd150 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -70,7 +70,7 @@
 #include <hpcrun/gpu/gpu-application-thread-api.h>
 #include <hpcrun/gpu/gpu-correlation.h>
 #include <hpcrun/gpu/gpu-correlation-channel.h>
-#include <hpcrun/gpu/gpu-correlation-id-map.h>
+#include <hpcrun/gpu/gpu-host-correlation-map.h>
 #include <hpcrun/gpu/gpu-op-placeholders.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
@@ -94,6 +94,11 @@ static atomic_ullong correlation_id;
 
 static spinlock_t files_lock = SPINLOCK_UNLOCKED;
 
+static bool gtpin_use_runtime_callstack = false;
+
+static __thread uint64_t gtpin_correlation_id = 0;
+static __thread uint64_t gtpin_cpu_submit_time = 0;
+
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -141,19 +146,26 @@ createKernelNode
  uint64_t correlation_id
 )
 {
-  cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
+  uint64_t cpu_submit_time = hpcrun_nanotime();
 
-  gpu_op_ccts_t gpu_op_ccts;
-  gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
-  gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, gpu_placeholder_type_kernel);
+  if (gtpin_use_runtime_callstack) {
+    // XXX(Keren): gtpin's call stack is a mess, better to use opencl's call path
+    // onKernelRun->clEnqueueNDRangeKernel_wrapper->opencl_subscriber_callback
+    gtpin_correlation_id = correlation_id;
+    gtpin_cpu_submit_time = cpu_submit_time;
+  } else {
+    cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
 
-  hpcrun_safe_enter();
-  gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
-  hpcrun_safe_exit();
+    gpu_op_ccts_t gpu_op_ccts;
+    gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
+    gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, gpu_placeholder_type_kernel);
 
-  gpu_activity_channel_consume(gpu_metrics_attribute);
-  uint64_t cpu_submit_time = hpcrun_nanotime();
-  gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+    hpcrun_safe_enter();
+    gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
+    hpcrun_safe_exit();
+
+    gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+  }
 }
 
 
@@ -270,6 +282,9 @@ activityNotify
  void
 )
 {
+  // Once you attribute this kernel, you want to update the host_correlation_id entry.
+  // Otherwise, the same memory might be reclaimed
+  // gpu_monitoring_thread_activities_ready(allow_update);
   gpu_monitoring_thread_activities_ready();
 }
 
@@ -392,6 +407,8 @@ onKernelRun
  void *v
 )
 {
+  gpu_activity_channel_consume(gpu_metrics_attribute);
+
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPin_KernelProfilingActive(kernelExec, 1);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
@@ -407,7 +424,9 @@ onKernelComplete
  void *v
 )
 {
-  // Receive correlations from the host thread
+  // Receive correlations from the host thread.
+  // XXX(Keren): This is usually done on the monitor thread, but that is not guaranteed.
+  // For safety, we need to adopt the multiplexer framework.
   activityNotify();  
 
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
@@ -467,6 +486,9 @@ gtpin_enable_profiling
   }
 #endif
 
+  // Use opencl/level zero runtime stack
+  gtpin_use_runtime_callstack = true;
+
   GTPin_OnKernelBuild(onKernelBuild, NULL);
   GTPin_OnKernelRun(onKernelRun, NULL);
   GTPin_OnKernelComplete(onKernelComplete, NULL);
@@ -474,3 +496,14 @@ gtpin_enable_profiling
   GTPIN_Start();
 }
 
+
+void
+gtpin_produce_runtime_callstack
+(
+ gpu_op_ccts_t *gpu_op_ccts
+)
+{
+  // nothing to produce unless gtpin is set to use the runtime's call stack
+  if (!gtpin_use_runtime_callstack) return;
+  gpu_correlation_channel_produce(gtpin_correlation_id, gpu_op_ccts, gtpin_cpu_submit_time);
+}
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
index 8efbd6db3f..1efd824d78 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
@@ -52,10 +52,20 @@
 // interface operations
 //******************************************************************************
 
+typedef struct gpu_op_ccts_t gpu_op_ccts_t;
+
 void
 gtpin_enable_profiling
 (
  void
 );
 
+// No-op unless gtpin uses the runtime call stack; otherwise produces the saved gtpin correlation.
+void
+gtpin_produce_runtime_callstack
+(
+ gpu_op_ccts_t *
+);
+
+
 #endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index a2323bc255..d0d6d40c1f 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -490,6 +490,11 @@ opencl_subscriber_callback
   gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
   hpcrun_safe_exit();
 
+  if (type == kernel) {
+    // Callback to produce gtpin correlation
+    gtpin_produce_runtime_callstack(&gpu_op_ccts);
+  }
+
   gpu_activity_channel_consume(gpu_metrics_attribute);  
   uint64_t cpu_submit_time = CPU_NANOTIME();
   gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index df46d51d51..dbd79d97eb 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -82,6 +82,13 @@ opencl_timing_info_get
 );
 
 
+cct_node_t *
+opencl_api_node_get
+(
+ void
+);
+
+
 void
 clSetEventCallback_wrapper
 (

From 5d0f747c5703b97d3a4bb43cefd644a40ec0aa78 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris10.ftm.alcf.anl.gov>
Date: Wed, 30 Sep 2020 22:36:51 +0000
Subject: [PATCH 055/177] removing opencl-intercept files and some unwanted
 functions related to gpu-metrics

---
 src/lib/prof-lean/hpcrun-fmt.h                |   6 -
 src/lib/xml/hpc-structure.dtd.h               |   2 +-
 src/tool/hpcrun/Makefile.am                   |   1 -
 src/tool/hpcrun/Makefile.in                   |  16 +-
 src/tool/hpcrun/gpu/gpu-metrics.c             |   7 +-
 src/tool/hpcrun/gpu/gpu-metrics.h             |   4 -
 .../gpu/opencl/opencl-activity-translate.c    |   1 -
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  35 +++-
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c |  91 ---------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h | 179 ------------------
 .../hpcrun/gpu/opencl/opencl-memory-manager.h |   2 +-
 src/tool/hpcrun/metrics.c                     |   6 -
 src/tool/hpcrun/metrics.h                     |   2 -
 13 files changed, 38 insertions(+), 314 deletions(-)
 delete mode 100644 src/tool/hpcrun/gpu/opencl/opencl-intercept.c
 delete mode 100644 src/tool/hpcrun/gpu/opencl/opencl-intercept.h

diff --git a/src/lib/prof-lean/hpcrun-fmt.h b/src/lib/prof-lean/hpcrun-fmt.h
index 3796282c33..f5e7af4bad 100644
--- a/src/lib/prof-lean/hpcrun-fmt.h
+++ b/src/lib/prof-lean/hpcrun-fmt.h
@@ -168,11 +168,6 @@ hpcrun_fmt_hdr_free(hpcrun_fmt_hdr_t* hdr, hpcfmt_free_fn dealloc);
 #define HPCRUN_FMT_METRIC_SHOW_EXCLUSIVE  3
 #define HPCRUN_FMT_METRIC_INVISIBLE       4
 
-// relocating the metrics
-#define HPCRUN_FMT_METRIC_MOVE_TO_ENCLOSING_PROCEDURE       5
-#define HPCRUN_FMT_METRIC_KEEP_HERE													6
-#define HPCRUN_FMT_METRIC_COPY_TO_INST_SIBLINGS_IN_BBLOCK		7
-
 
 
 //***************************************************************************
@@ -277,7 +272,6 @@ typedef struct hpcrun_metricFlags_fields {
   uint16_t             partner;
   uint8_t /*bool*/     show;
   uint8_t /*bool*/     showPercent;
-  uint8_t 						 relocation_type;
 
   uint64_t unused1;
 } hpcrun_metricFlags_fields;
diff --git a/src/lib/xml/hpc-structure.dtd.h b/src/lib/xml/hpc-structure.dtd.h
index 5b0583f06f..83c46584a6 100644
--- a/src/lib/xml/hpc-structure.dtd.h
+++ b/src/lib/xml/hpc-structure.dtd.h
@@ -1 +1 @@
-"<!-- ******************************************************************** -->\n<!-- HPCToolkit Structure DTD                                             -->\n<!-- Version 4.7                                                          -->\n<!-- ******************************************************************** -->\n\n<!ELEMENT HPCToolkitStructure (LM)*>\n<!ATTLIST HPCToolkitStructure\n	version CDATA #REQUIRED\n	i       CDATA #REQUIRED\n	n       CDATA #IMPLIED>\n  <!-- Load module: -->\n  <!--   (i)d: unique identifier for cross referencing -->\n  <!--   (n)ame -->\n  <!--   (l)ine range: \"beg-end\" (inclusive range) -->\n  <!--   (v)ma-range-set: \"{[beg-end), [beg-end)...}\" -->\n  <!--   (t)arget: target function address -->\n  <!--   (d)evice: device name -->\n  <!ELEMENT LM (F|P|B)*>\n  <!ATTLIST LM\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- File: -->\n  <!ELEMENT F (P|L|S)*>\n  <!ATTLIST F\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED>\n  <!-- Procedure: ln=link name (if different than name) -->\n  <!ELEMENT P (P|A|L|S|C)*>\n  <!ATTLIST P\n	i  CDATA #REQUIRED\n	n  CDATA #REQUIRED\n	ln CDATA #IMPLIED\n	l  CDATA #IMPLIED\n	s  CDATA #IMPLIED\n	v  CDATA #IMPLIED>\n  <!-- Basic Block: -->\n  <!ELEMENT B (I)*>\n  <!ATTLIST B\n	o CDATA #REQUIRED>\n  <!-- Instruction: -->\n  <!ELEMENT I EMPTY>\n  <!ATTLIST I\n	o CDATA #REQUIRED>\n  <!-- Alien: (f)ilename -->\n  <!ELEMENT A (A|L|S|C)*>\n  <!ATTLIST A\n	i CDATA #REQUIRED\n	f CDATA #IMPLIED\n	n CDATA #IMPLIED\n	ln CDATA #IMPLIED\n	l CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Loop -->\n  <!ELEMENT L (A|L|S|C)*>\n  <!ATTLIST L\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	f CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Statement -->\n  <!ELEMENT S EMPTY>\n  <!ATTLIST S\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- Callsite (a special Statement) -->\n  <!ELEMENT C (C)*>\n  <!ATTLIST C\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED\n	t CDATA #IMPLIED\n	d CDATA #IMPLIED>\n";
+"<!-- ******************************************************************** -->\n<!-- HPCToolkit Structure DTD                                             -->\n<!-- Version 4.7                                                          -->\n<!-- ******************************************************************** -->\n\n<!ELEMENT HPCToolkitStructure (LM)*>\n<!ATTLIST HPCToolkitStructure\n	version CDATA #REQUIRED\n	i       CDATA #REQUIRED\n	n       CDATA #IMPLIED>\n  <!-- Load module: -->\n  <!--   (i)d: unique identifier for cross referencing -->\n  <!--   (n)ame -->\n  <!--   (l)ine range: \"beg-end\" (inclusive range) -->\n  <!--   (v)ma-range-set: \"{[beg-end), [beg-end)...}\" -->\n  <!--   (t)arget: target function address -->\n  <!--   (d)evice: device name -->\n  <!ELEMENT LM (F|P)*>\n  <!ATTLIST LM\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- File: -->\n  <!ELEMENT F (P|L|S)*>\n  <!ATTLIST F\n	i CDATA #REQUIRED\n	n CDATA #REQUIRED>\n  <!-- Procedure: ln=link name (if different than name) -->\n  <!ELEMENT P (P|A|L|S|C)*>\n  <!ATTLIST P\n	i  CDATA #REQUIRED\n	n  CDATA #REQUIRED\n	ln CDATA #IMPLIED\n	l  CDATA #IMPLIED\n	s  CDATA #IMPLIED\n	v  CDATA #IMPLIED>\n  <!-- Alien: (f)ilename -->\n  <!ELEMENT A (A|L|S|C)*>\n  <!ATTLIST A\n	i CDATA #REQUIRED\n	f CDATA #IMPLIED\n	n CDATA #IMPLIED\n	ln CDATA #IMPLIED\n	l CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Loop -->\n  <!ELEMENT L (A|L|S|C)*>\n  <!ATTLIST L\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	f CDATA #IMPLIED\n	v CDATA #IMPLIED>\n  <!-- Statement -->\n  <!ELEMENT S EMPTY>\n  <!ATTLIST S\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED>\n  <!-- Callsite (a special Statement) -->\n  <!ELEMENT C (C)*>\n  <!ATTLIST C\n	i CDATA #REQUIRED\n	l CDATA #REQUIRED\n	v CDATA #IMPLIED\n	t CDATA #IMPLIED\n	d CDATA #IMPLIED>\n";
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 82345075f2..805511a936 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -508,7 +508,6 @@ endif
 if OPT_ENABLE_OPENCL
 MY_OPENCL_FILES = sample-sources/opencl.c \
 	gpu/opencl/opencl-api.c \
-	gpu/opencl/opencl-intercept.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c 
 endif
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index f0276123bc..0f758c8b9f 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -539,8 +539,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-command-process.c \
 	gpu/level0/level0-data-node.c gpu/level0/level0-event-map.c \
 	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
-	gpu/opencl/opencl-api.c gpu/opencl/opencl-intercept.c \
-	gpu/opencl/opencl-memory-manager.c \
+	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
@@ -746,7 +745,6 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@am__objects_40 =  \
 @OPT_ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
@@ -1876,7 +1874,6 @@ MY_AARCH64_FILES = \
 
 @OPT_ENABLE_OPENCL_TRUE@MY_OPENCL_FILES = sample-sources/opencl.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c 
 
@@ -2822,9 +2819,6 @@ gpu/opencl/$(DEPDIR)/$(am__dirstamp):
 	@: > gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-intercept.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
@@ -3765,7 +3759,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_cilk_la-agent-cilk.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_pthread_la-agent-pthread.Plo@am__quote@
@@ -5403,13 +5396,6 @@ gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
 
-gpu/opencl/libhpcrun_la-opencl-intercept.lo: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-intercept.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_la-opencl-intercept.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-
 gpu/opencl/libhpcrun_la-opencl-memory-manager.lo: gpu/opencl/opencl-memory-manager.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-memory-manager.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index 9fffadcbce..cff90e361c 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -84,8 +84,7 @@
   macro(GPU_INST, 9)  \
   macro(GTIMES, 10)  \
   macro(KINFO, 12)  \
-  macro(GSAMP, 13)  \
-  macro(KER_BLKINFO, 14)
+  macro(GSAMP, 13)
 
 
 #define FORALL_METRIC_KINDS(macro)  \
@@ -199,10 +198,6 @@ name ## _metric_kind
   reg_metric->format  = FORMAT_DISPLAY_PERCENTAGE
 
 
-#define COPY_METRIC_TO_SIBLINGS_IN_BBLOCK(name) \
-  hpcrun_set_relocation_type(METRIC_ID(name), HPCRUN_FMT_METRIC_COPY_TO_INST_SIBLINGS_IN_BBLOCK); \
-
-
 
 //*****************************************************************************
 // local variables 
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 1205df0d40..42d8eddc84 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -381,10 +381,6 @@ typedef enum {
   FORALL_GSAMP_INT(macro)			\
   FORALL_GSAMP_REAL(macro)				
 
-#define FORALL_KER_BLKINFO(macro)		\
-  macro("KER:BLK_EXEC_COUNT",            KER_BLK_EXECUTION_COUNT,		\
-	"count of number of dynamic executions of block")
-
 
 
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 555ad5b0cd..972914a3ea 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -58,7 +58,6 @@
 
 #include "opencl-activity-translate.h"
 #include "opencl-api.h"
-#include "opencl-intercept.h"
 
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index dbd79d97eb..ca847eacc5 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -51,7 +51,40 @@
 #include <hpcrun/gpu/gpu-activity.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
 
-#include "opencl-intercept.h"
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef enum {
+  memcpy_H2D                      = 0,
+  memcpy_D2H                      = 1,
+  kernel                          = 2
+} opencl_call_t;
+
+
+typedef struct cl_generic_callback_t {
+  uint64_t correlation_id;
+  opencl_call_t type;
+} cl_generic_callback_t;
+
+
+typedef struct cl_kernel_callback_t {
+  uint64_t correlation_id;
+  opencl_call_t type;
+} cl_kernel_callback_t;
+
+
+typedef struct cl_memory_callback_t {
+  uint64_t correlation_id;
+  opencl_call_t type;
+  bool fromHostToDevice;
+  bool fromDeviceToHost;
+  size_t size;
+} cl_memory_callback_t;
+
+
 
 //******************************************************************************
 // interface operations
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
deleted file mode 100644
index 0e2e10bfc8..0000000000
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ /dev/null
@@ -1,91 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <inttypes.h>
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <hpcrun/gpu/instrumentation/gtpin-instrumentation.h>
-#include <hpcrun/gpu/gpu-metrics.h>
-#include <hpcrun/memory/hpcrun-malloc.h>
-#include <hpcrun/messages/messages.h>
-#include <hpcrun/files.h>
-#include <hpcrun/sample-sources/libdl.h>
-#include <lib/prof-lean/hpcrun-gotcha.h>
-#include <lib/prof-lean/hpcrun-opencl.h>
-#include <lib/prof-lean/stdatomic.h>
-#include <lib/prof-lean/spinlock.h>
-#include <lib/prof-lean/crypto-hash.h>
-
-#include "opencl-intercept.h"
-
-
-// TODO: This file is no longer needed. To be deleted
-
-void
-opencl_intercept_setup
-(
- void
-)
-{
-#ifndef HPCRUN_STATIC_LINK
-  ETMSG(OPENCL, "setting up opencl intercepts");
-  gpu_metrics_GPU_INST_enable();
-  gtpin_enable_profiling();
-#endif
-}
-
-
-void
-opencl_intercept_teardown
-(
- void
-)
-{
-}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
deleted file mode 100644
index b8cb2de3e5..0000000000
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ /dev/null
@@ -1,179 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-#ifndef _OPENCL_INTERCEPT_H_
-#define _OPENCL_INTERCEPT_H_
-
-
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include <stdbool.h>
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include <lib/prof-lean/hpcrun-opencl.h>
-
-//******************************************************************************
-// type declarations
-//******************************************************************************
-
-typedef cl_command_queue (*clqueue_t)(
-  cl_context,
-  cl_device_id,
-  cl_command_queue_properties,
-  cl_int *
-);
-
-
-typedef cl_int (*clkernel_t)(
-  cl_command_queue,
-  cl_kernel,
-  cl_uint,
-  const size_t *,
-  const size_t *,
-  const size_t *,
-  cl_uint,
-  const cl_event *,
-  cl_event *
-);
-
-
-typedef cl_int (*clreadbuffer_t)(
-  cl_command_queue,
-  cl_mem,
-  cl_bool,
-  size_t,
-  size_t,
-  void *,
-  cl_uint,
-  const cl_event *,
-  cl_event *
-);
-
-
-typedef cl_int (*clwritebuffer_t)(
-  cl_command_queue,
-  cl_mem,
-  cl_bool,
-  size_t,
-  size_t,
-  const void *,
-  cl_uint,
-  const cl_event *,
-  cl_event *
-);
-
-
-typedef cl_int (*clbuildprogram_t)
-(
- cl_program program,
- cl_uint num_devices,
- const cl_device_id* device_list,
- const char* options,
- void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
- void* user_data
-);
-
-
-typedef cl_program (*clcreateprogramwithsource_t)
-(
- cl_context context,
- cl_uint count,
- const char** strings,
- const size_t* lengths,
- cl_int* errcode_ret
-);
-
-
-typedef enum {
-  memcpy_H2D                      = 0,
-  memcpy_D2H                      = 1,
-  kernel                          = 2
-} opencl_call_t;
-
-
-typedef struct cl_generic_callback_t {
-  uint64_t correlation_id;
-  opencl_call_t type;
-} cl_generic_callback_t;
-
-
-typedef struct cl_kernel_callback_t {
-  uint64_t correlation_id;
-  opencl_call_t type;
-} cl_kernel_callback_t;
-
-
-typedef struct cl_memory_callback_t {
-  uint64_t correlation_id;
-  opencl_call_t type;
-  bool fromHostToDevice;
-  bool fromDeviceToHost;
-  size_t size;
-} cl_memory_callback_t;
-
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-void
-opencl_intercept_setup
-(
-  void
-);
-
-
-void
-opencl_intercept_teardown
-(
-  void
-);
-
-
-#endif  //_OPENCL_INTERCEPT_H_
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index 33b994af78..3513700489 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -52,7 +52,7 @@
 
 #include <lib/prof-lean/bistack.h>
 
-#include "opencl-intercept.h"
+#include "opencl-api.h"
 
 
 
diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c
index a4f444e750..8495e58ebb 100644
--- a/src/tool/hpcrun/metrics.c
+++ b/src/tool/hpcrun/metrics.c
@@ -311,12 +311,6 @@ void hpcrun_set_percent(int metric_id, uint8_t show_percent) {
 }
 
 
-void hpcrun_set_relocation_type(int metric_id, uint8_t relocation_type) {
-  metric_desc_t* mdesc = hpcrun_id2metric_linked(metric_id);
-  mdesc->flags.fields.relocation_type = relocation_type;
-}
-
-
 metric_desc_p_tbl_t*
 hpcrun_get_metric_tbl(kind_info_t **curr)
 {
diff --git a/src/tool/hpcrun/metrics.h b/src/tool/hpcrun/metrics.h
index c30917aec4..be9a89f361 100644
--- a/src/tool/hpcrun/metrics.h
+++ b/src/tool/hpcrun/metrics.h
@@ -128,8 +128,6 @@ void hpcrun_set_display(int metric_id, uint8_t show);
 
 void hpcrun_set_percent(int metric_id, uint8_t show_percent);
 
-void hpcrun_set_relocation_type(int metric_id, uint8_t relocation_type);
-
 metric_desc_p_tbl_t* hpcrun_get_metric_tbl(kind_info_t**);
 
 metric_upd_proc_t* hpcrun_get_metric_proc(int metric_id);

From 3fcf58557afb5bbc972276e24f047dec0625fd2b Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Fri, 2 Oct 2020 01:20:57 +0000
Subject: [PATCH 056/177] Fix opencl api by adding a
 clCreateCommandQueueWithProperties wrapper

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 96 +++++++++++++++++++++----
 1 file changed, 83 insertions(+), 13 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index d0d6d40c1f..fbbc23dc94 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -53,8 +53,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-
-
 //******************************************************************************
 // local includes
 //******************************************************************************
@@ -159,15 +157,16 @@
 
 #define opencl_path() "libOpenCL.so"
 
-#define FORALL_OPENCL_ROUTINES(macro)          \
-  macro(clBuildProgram)          \
-  macro(clCreateProgramWithSource)          \
-  macro(clCreateCommandQueue)          \
-  macro(clEnqueueNDRangeKernel)          \
-  macro(clEnqueueReadBuffer)          \
-  macro(clEnqueueWriteBuffer)          \
-  macro(clGetEventProfilingInfo)          \
-  macro(clReleaseEvent)              \
+#define FORALL_OPENCL_ROUTINES(macro)  \
+  macro(clBuildProgram)  \
+  macro(clCreateProgramWithSource)  \
+  macro(clCreateCommandQueue)  \
+  macro(clCreateCommandQueueWithProperties)  \
+  macro(clEnqueueNDRangeKernel)  \
+  macro(clEnqueueReadBuffer)  \
+  macro(clEnqueueWriteBuffer)  \
+  macro(clGetEventProfilingInfo)  \
+  macro(clReleaseEvent)  \
   macro(clSetEventCallback)
 
 #define OPENCL_FN_NAME(f) DYN_FN_NAME(f)
@@ -178,6 +177,9 @@
 #define OPENCL_PROGRAM_FN(fn, args)      \
   static cl_program (*OPENCL_FN_NAME(fn)) args
 
+#define OPENCL_QUEUE_FN(fn, args)      \
+  static cl_command_queue (*OPENCL_FN_NAME(fn)) args
+
 #define HPCRUN_OPENCL_CALL(fn, args) (OPENCL_FN_NAME(fn) args)
 
 #define LINE_TABLE_FLAG " -gline-tables-only "
@@ -217,7 +219,7 @@ OPENCL_PROGRAM_FN
 );
 
 
-OPENCL_FN
+OPENCL_QUEUE_FN
 (
   clCreateCommandQueue, 
   (
@@ -229,7 +231,19 @@ OPENCL_FN
 );
 
 
-OPENCL_PROGRAM_FN
+OPENCL_QUEUE_FN
+(
+  clCreateCommandQueueWithProperties, 
+  (
+   cl_context,
+   cl_device_id,
+   const cl_bitfield *,
+   cl_int*
+  )
+);
+
+
+OPENCL_FN
 (
   clEnqueueNDRangeKernel, 
   (
@@ -434,6 +448,7 @@ opencl_call_to_string
 }
 
 
+__attribute__((unused))
 static const char*
 opencl_error_report
 (
@@ -560,6 +575,8 @@ opencl_timing_info_get
          (event, CL_PROFILING_COMMAND_END, 
           sizeof(commandEnd), &commandEnd, NULL));
 
+  ETMSG(OPENCL, "duration [%lu, %lu]", commandStart, commandEnd);
+
   set_gpu_interval(interval, (uint64_t)commandStart, (uint64_t)commandEnd);
 }
 
@@ -692,6 +709,59 @@ clCreateCommandQueue
 }
 
 
+cl_command_queue
+clCreateCommandQueueWithProperties
+(
+ cl_context context,
+ cl_device_id device,
+ const cl_bitfield* properties,
+ cl_int* errcode_ret
+)
+{
+  cl_bitfield *queue_properties = (cl_bitfield *)properties;
+  if (properties == NULL) {
+    queue_properties = (cl_bitfield *)malloc(sizeof(cl_bitfield) * 3);
+    queue_properties[0] = CL_QUEUE_PROPERTIES;
+    queue_properties[1] = CL_QUEUE_PROFILING_ENABLE;
+    queue_properties[2] = 0;
+  } else {
+    int queue_props_id = -1;
+    int props_count = 0;
+    while (properties[props_count] != 0) {
+      if (properties[props_count] == CL_QUEUE_PROPERTIES) {
+        queue_props_id = props_count;
+      }
+      ++props_count;
+    }
+
+    if (queue_props_id >= 0 && queue_props_id + 1 < props_count) {
+      queue_properties = (cl_bitfield *)malloc(sizeof(cl_bitfield) * (props_count + 1));
+      for (int i = 0; i < props_count; ++i) {
+        queue_properties[i] = properties[i];
+      }
+      // We do have a queue property entry, just enable profiling
+      queue_properties[queue_props_id + 1] |= CL_QUEUE_PROFILING_ENABLE;
+      queue_properties[props_count] = 0;
+    } else {
+      // We do not have a queue property entry, need to allocate a queue property entry and set up
+      queue_properties = (cl_bitfield *)malloc(sizeof(cl_bitfield) * (props_count + 3));
+      for (int i = 0; i < props_count; ++i) {
+        queue_properties[i] = properties[i];
+      }
+      queue_properties[props_count] = CL_QUEUE_PROPERTIES;
+      queue_properties[props_count + 1] = CL_QUEUE_PROFILING_ENABLE;
+      queue_properties[props_count + 2] = 0;
+    }
+  }
+  cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueueWithProperties, (context, device, queue_properties, errcode_ret));
+  if (queue_properties != NULL) {
+    // The property is created by us
+    free(queue_properties);
+  }
+  return queue;
+}
+
+
 cl_int
 clEnqueueNDRangeKernel
 (

From e5eeb395ad6ca2bdadddf55fdbc48a529dd1ff4c Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 3 Oct 2020 01:27:40 +0000
Subject: [PATCH 057/177] 1. Fix duplicate correlation id in gtpin; 2. Fix
 clCreateCommandQueueWithProperties

---
 src/tool/hpcrun/gpu/gpu-correlation.c         |  2 +-
 .../hpcrun/gpu/gpu-host-correlation-map.c     | 26 +++++++++++++++----
 .../hpcrun/gpu/gpu-host-correlation-map.h     | 17 ++++++++++++
 .../instrumentation/gtpin-instrumentation.c   |  3 +++
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  4 +++
 5 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-correlation.c b/src/tool/hpcrun/gpu/gpu-correlation.c
index d6f78c4d85..31faf48f21 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation.c
@@ -120,7 +120,7 @@ gpu_correlation_consume
 #else
     PRINT("Consume correlation id %lu\n", c->host_correlation_id);
     gpu_host_correlation_map_insert(c->host_correlation_id, &(c->gpu_op_ccts), 
-				    c->cpu_submit_time, c->activity_channel);
+      c->cpu_submit_time, c->activity_channel);
 #endif
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
index 812d490b31..79d7c1da4e 100644
--- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
+++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
@@ -129,7 +129,7 @@ static gpu_host_correlation_map_entry_t *map_root = NULL;
 
 static gpu_host_correlation_map_entry_t *free_list = NULL;
 
-
+static bool allow_replace = false;
 
 //******************************************************************************
 // private operations
@@ -215,10 +215,17 @@ gpu_host_correlation_map_insert
  gpu_activity_channel_t *activity_channel
 )
 {
-  if (st_lookup(&map_root, host_correlation_id)) { 
-    // fatal error: host_correlation id already present; a
-    // correlation should be inserted only once.
-    assert(0);
+  gpu_host_correlation_map_entry_t *entry = st_lookup(&map_root, host_correlation_id);
+  if (entry) {
+    if (allow_replace) {
+      entry->gpu_op_ccts = *gpu_op_ccts;
+      entry->cpu_submit_time = cpu_submit_time;
+      entry->activity_channel = activity_channel;
+    } else {
+      // fatal error: host_correlation id already present; a
+      // correlation should be inserted only once.
+      assert(0);
+    }
   } else {
     gpu_host_correlation_map_entry_t *entry = 
       gpu_host_correlation_map_entry_new(host_correlation_id, gpu_op_ccts, 
@@ -331,6 +338,15 @@ gpu_host_correlation_map_entry_cpu_submit_time
 }
 
 
+void
+gpu_host_correlation_map_replace_set
+(
+ bool replace
+)
+{
+  allow_replace = replace;
+}
+
 
 //*****************************************************************************
 // debugging code
diff --git a/src/tool/hpcrun/gpu/gpu-host-correlation-map.h b/src/tool/hpcrun/gpu/gpu-host-correlation-map.h
index e32c99cb16..904485e091 100644
--- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.h
+++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.h
@@ -99,6 +99,23 @@ gpu_host_correlation_map_insert
 );
 
 
+void
+gpu_host_correlation_map_replace
+(
+ uint64_t host_correlation_id,
+ gpu_op_ccts_t *gpu_op_ccts,
+ uint64_t cpu_gpu_time_offset,
+ gpu_activity_channel_t *activity_channel
+);
+
+
+void
+gpu_host_correlation_map_replace_set
+(
+ bool replace
+);
+
+
 // samples == total_samples remove the node and return false
 bool
 gpu_host_correlation_map_samples_increase
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index c22c3fd150..d49193fc90 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -489,6 +489,9 @@ gtpin_enable_profiling
   // Use opencl/level zero runtime stack
   gtpin_use_runtime_callstack = true;
 
+  // Enable host correlation id replace
+  gpu_host_correlation_map_replace_set(true);
+
   GTPin_OnKernelBuild(onKernelBuild, NULL);
   GTPin_OnKernelRun(onKernelRun, NULL);
   GTPin_OnKernelComplete(onKernelComplete, NULL);
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index fbbc23dc94..41b9fd155c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -730,6 +730,10 @@ clCreateCommandQueueWithProperties
     while (properties[props_count] != 0) {
       if (properties[props_count] == CL_QUEUE_PROPERTIES) {
         queue_props_id = props_count;
+        ++props_count;
+      } else if (properties[props_count] == 0x1094) {
+        // TODO(Keren): A temporay hack
+        ++props_count;
       }
       ++props_count;
     }

From 6a1c23a6da3832f5f28c3ab35ca973e4cfdb3df1 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 3 Oct 2020 03:21:36 +0000
Subject: [PATCH 058/177] Enable instruction metrics only when gtpin is enabled

---
 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c | 2 ++
 src/tool/hpcrun/gpu/opencl/opencl-api.c                     | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index d49193fc90..79009cae5f 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -486,6 +486,8 @@ gtpin_enable_profiling
   }
 #endif
 
+  gpu_metrics_GPU_INST_enable();
+
   // Use opencl/level zero runtime stack
   gtpin_use_runtime_callstack = true;
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 41b9fd155c..e67daa1a82 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -602,7 +602,6 @@ opencl_api_initialize
  void
 )
 {
-  gpu_metrics_GPU_INST_enable();
   gtpin_enable_profiling();
   atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);

From b65bf7fd903f1eb3308fdf6ef982ba2585289a9a Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 3 Oct 2020 03:37:24 +0000
Subject: [PATCH 059/177] Modify gtpin log to include correlation

---
 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 79009cae5f..deb434c96f 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -407,6 +407,8 @@ onKernelRun
  void *v
 )
 {
+  ETMSG(OPENCL, "onKernelRun starting. Inserted: correlation %llu", (uint64_t)kernelExec);
+
   gpu_activity_channel_consume(gpu_metrics_attribute);
 
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
@@ -431,7 +433,7 @@ onKernelComplete
 
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
-  ETMSG(OPENCL, "onKernelComplete starting. Lookup: key: %"PRIu64 "",(uint64_t)kernel);
+  ETMSG(OPENCL, "onKernelComplete starting. Lookup: correlation %llu, kernel: %llu", (uint64_t)kernelExec, (uint64_t)kernel);
   assert(kernel_data_map_lookup((uint64_t)kernel) != 0);
 
   kernel_data_map_entry_t *kernel_data_map_entry = kernel_data_map_lookup((uint64_t)kernel);

From 19afe111f9bb4dbf9113bf0b4adaec44857bd6b1 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 3 Oct 2020 03:37:39 +0000
Subject: [PATCH 060/177] Fix initialization order warning

---
 src/lib/binutils/ElfHelper.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib/binutils/ElfHelper.hpp b/src/lib/binutils/ElfHelper.hpp
index 787a6df1f4..1d8d554629 100644
--- a/src/lib/binutils/ElfHelper.hpp
+++ b/src/lib/binutils/ElfHelper.hpp
@@ -86,7 +86,7 @@
 
 class ElfFile {
 public:
-  ElfFile() : origPtr(0), memPtr(0), elf(0), memLen(0), intelGPU(false) {}
+  ElfFile() : origPtr(0), memPtr(0), memLen(0), elf(0), intelGPU(false) {}
   bool open(char *_memPtr, size_t _memLen, const std::string &_fileName);
   ~ElfFile();
   int getArch() { return arch; }
@@ -103,8 +103,8 @@ class ElfFile {
   char *origPtr;
   char *memPtr;
   size_t memLen;
-  bool intelGPU;
   Elf *elf;
+  bool intelGPU;
   std::string fileName;
 };
 

From e0a90087402ad874ff87d6c37f08a487d8556b4c Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Fri, 2 Oct 2020 22:44:02 -0500
Subject: [PATCH 061/177] opencl: traceOrdered flag added

---
 src/lib/prof-lean/hpcrun-fmt.h            | 1 +
 src/tool/hpcrun/core_profile_trace_data.h | 1 +
 src/tool/hpcrun/gpu/gpu-trace.c           | 9 ++++++++-
 src/tool/hpcrun/thread_data.c             | 2 +-
 src/tool/hpcrun/write_data.c              | 1 +
 5 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/lib/prof-lean/hpcrun-fmt.h b/src/lib/prof-lean/hpcrun-fmt.h
index 85d724d65b..abdbe4f9d3 100644
--- a/src/lib/prof-lean/hpcrun-fmt.h
+++ b/src/lib/prof-lean/hpcrun-fmt.h
@@ -161,6 +161,7 @@ hpcrun_fmt_hdr_free(hpcrun_fmt_hdr_t* hdr, hpcfmt_free_fn dealloc);
 
 #define HPCRUN_FMT_NV_traceMinTime "trace-min-time"
 #define HPCRUN_FMT_NV_traceMaxTime "trace-max-time"
+#define HPCRUN_FMT_NV_traceOrdered "trace-time-ordered"
 
 #define HPCRUN_FMT_METRIC_HIDE            0
 #define HPCRUN_FMT_METRIC_SHOW            1
diff --git a/src/tool/hpcrun/core_profile_trace_data.h b/src/tool/hpcrun/core_profile_trace_data.h
index daaf61b9cd..d15cc7a47d 100644
--- a/src/tool/hpcrun/core_profile_trace_data.h
+++ b/src/tool/hpcrun/core_profile_trace_data.h
@@ -30,6 +30,7 @@ typedef struct core_profile_trace_data_t {
   // ----------------------------------------
   uint64_t trace_min_time_us;
   uint64_t trace_max_time_us;
+  bool traceOrdered;
 
   // ----------------------------------------
   // IO support
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 5a9edb299c..116471e0f6 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -242,7 +242,14 @@ gpu_trace_start_adjust
 )
 {
   uint64_t last_end = td->gpu_trace_prev_time;
-  if (start < last_end) {
+
+  if (end < last_end){
+    // If stream becomes unordered, mark it (it will be sorted in prof)
+    td->core_profile_trace_data.traceOrdered = false;
+    return start;
+  }
+
+  if(start < last_end) {
     // If we have a hardware measurement error (Power9),
     // set the offset as the end of the last activity
     start = last_end + 1;
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index 6de9070757..1e3b8910f8 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -236,7 +236,7 @@ core_profile_trace_data_init(core_profile_trace_data_t * cptd, int id, cct_ctxt_
   // ----------------------------------------
   cptd->trace_min_time_us = 0;
   cptd->trace_max_time_us = 0;
-
+  cptd->traceOrdered = true;
   // ----------------------------------------
   // IO support
   // ----------------------------------------
diff --git a/src/tool/hpcrun/write_data.c b/src/tool/hpcrun/write_data.c
index 6b2f279a3a..8d5a65d750 100644
--- a/src/tool/hpcrun/write_data.c
+++ b/src/tool/hpcrun/write_data.c
@@ -198,6 +198,7 @@ lazy_open_data_file(core_profile_trace_data_t * cptd)
                         HPCRUN_FMT_NV_pid, pidStr,
 			HPCRUN_FMT_NV_traceMinTime, traceMinTimeStr,
 			HPCRUN_FMT_NV_traceMaxTime, traceMaxTimeStr,
+                        HPCRUN_FMT_NV_traceOrdered, cptd->traceOrdered?"1":"0",
                         NULL);
   return fs;
 }

From fb0acea2481ccf5a9b9a2d19f1423ff92383d57a Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 03:38:38 -0500
Subject: [PATCH 062/177] improved multi-threaded tracing, requires testing

---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c |  15 ++-
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h |  15 ++-
 src/tool/hpcrun/gpu/gpu-trace-channel.c     |   2 +-
 src/tool/hpcrun/gpu/gpu-trace.c             | 110 ++++++++++++--------
 src/tool/hpcrun/gpu/gpu-trace.h             |   2 +-
 src/tool/hpcrun/sample-sources/nvidia.c     |   7 +-
 src/tool/hpcrun/thread_data.c               |   2 +-
 src/tool/hpcrun/write_data.c                |   7 +-
 8 files changed, 102 insertions(+), 58 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index 8c4f431de0..aaa7c8f416 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -102,7 +102,7 @@ typed_stack_declare_type(gpu_trace_channel_ptr_t);
 // local data
 //******************************************************************************
 
-static
+static __thread
 typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *gpu_trace_channel_stack;
 
 
@@ -149,8 +149,19 @@ gpu_trace_channel_set_forall
 // interface operations
 //******************************************************************************
 
-void gpu_trace_channel_stack_alloc(int size){
+void
+gpu_trace_channel_stack_init
+(
+ void *trace_channel_set_ptr
+)
+{
+  gpu_trace_channel_stack = trace_channel_set_ptr;
+}
+
+void *
+gpu_trace_channel_stack_alloc(int size){
 	gpu_trace_channel_stack = hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
+  return gpu_trace_channel_stack;
 }
 
 void
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index 23119621ac..a4136f6fd2 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -44,7 +44,7 @@
 #ifndef gpu_trace_channel_set_h
 #define gpu_trace_channel_set_h
 
-
+#include <lib/prof-lean/stacks.h>
 
 //******************************************************************************
 // forward type declarations
@@ -69,8 +69,19 @@ typedef void (*gpu_trace_channel_fn_t)
 // interface operations
 //******************************************************************************
 
+
 void
-gpu_trace_channel_stack_alloc(int size);
+gpu_trace_channel_stack_init
+(
+ void *trace_channel_set_ptr
+);
+
+
+void *
+gpu_trace_channel_stack_alloc
+(
+ int size
+);
 
 
 void
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index d593735c0e..7d7c5c7cd2 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -55,7 +55,7 @@
 
 #define SECONDS_UNTIL_WAKEUP 2
 
-#define DEBUG 0
+#define DEBUG 1
 
 //******************************************************************************
 // local includes
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 116471e0f6..50f02a1725 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -49,11 +49,11 @@
 #include <pthread.h>
 
 #include <hpcrun/cct/cct.h>
+#include <hpcrun/control-knob.h>
 #include <hpcrun/thread_data.h>
 #include <hpcrun/threadmgr.h>
 #include <hpcrun/trace.h>
 #include <hpcrun/write_data.h>
-#include <hpcrun/control-knob.h>
 
 #include <assert.h>
 
@@ -86,6 +86,7 @@
 #define DEBUG 0
 #include "gpu-print.h"
 
+#define MIN(a,b)  (((a)<=(b))?(a):(b))
 
 
 //******************************************************************************
@@ -97,6 +98,11 @@ typedef struct gpu_trace_t {
   gpu_trace_channel_t *trace_channel;
 } gpu_trace_t;
 
+typedef struct gpu_stream_set_t {
+  void *ptr;
+  int thread_id;
+} gpu_stream_set_t;
+
 
 
 typedef void *(*pthread_start_routine_t)(void *);
@@ -110,11 +116,13 @@ static _Atomic(bool) stop_trace_flag;
 
 static atomic_ullong active_streams_counter;
 
-static atomic_ullong stream_id;
-
+static atomic_ullong num_streams;
 
-static __thread uint32_t my_trace_set_id = -1;
+static int streams_per_thread;
+static int max_threads_consumers;
+static int num_threads = 0;
 
+static void **trace_channel_set_array;
 
 static __thread uint64_t stream_start = 0;
 
@@ -242,14 +250,7 @@ gpu_trace_start_adjust
 )
 {
   uint64_t last_end = td->gpu_trace_prev_time;
-
-  if (end < last_end){
-    // If stream becomes unordered, mark it (it will be sorted in prof)
-    td->core_profile_trace_data.traceOrdered = false;
-    return start;
-  }
-
-  if(start < last_end) {
+  if (start < last_end) {
     // If we have a hardware measurement error (Power9),
     // set the offset as the end of the last activity
     start = last_end + 1;
@@ -362,7 +363,7 @@ gpu_trace_stream_id
 )
 {
   // FIXME: this is a bad way to compute a stream id
-  int id = 500 + atomic_fetch_add(&stream_id, 1);
+  int id = 500 + atomic_fetch_add(&num_streams, 1);
 
   return id;
 }
@@ -412,31 +413,55 @@ gpu_trace_init
 {
   atomic_store(&stop_trace_flag, false);
   atomic_store(&active_streams_counter, 0);
-  atomic_store(&stream_id, 0);
+  atomic_store(&num_streams, 0);
+
+  control_knob_value_get_int("STREAMS_PER_THREAD", &streams_per_thread);
+  control_knob_value_get_int("MAX_THREADS_CONSUMERS", &max_threads_consumers);
+
+  printf("streams_per_thread, max_threads_consumers = %d %d\n", streams_per_thread, max_threads_consumers);
+
+  trace_channel_set_array = hpcrun_malloc(sizeof(void *) * max_threads_consumers);
+
+}
+
+static int
+get_my_streams(int thread_id){
+  int stream_count = atomic_load(&num_streams);
+  if (num_threads - 1 == thread_id){
+    return stream_count % streams_per_thread;
+  } else{
+    return streams_per_thread;
+  }
 }
 
 
 void *
 gpu_trace_record
 (
- void
+// void *trace_channel_set_ptr
+void * args
 )
 {
+  gpu_stream_set_t *stream_set = (gpu_stream_set_t *) args;
+  gpu_trace_channel_stack_init(stream_set->ptr);
+  int my_streams;
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
 
-    int current_stream_id = atomic_load(&stream_id);
+    my_streams = get_my_streams(stream_set->thread_id);
+
 
-    for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+    for (int set_index = 0; set_index < my_streams; ++set_index) {
       gpu_trace_activities_process(set_index);
       gpu_trace_channel_set_await(set_index);
     }
 
   }
 
-  int current_stream_id = atomic_load(&stream_id);
-  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+  my_streams = get_my_streams(stream_set->thread_id);
+
+  for (int set_index = 0; set_index < my_streams; ++set_index) {
     gpu_trace_activities_process(set_index);
     gpu_trace_channel_set_await(set_index);
     gpu_trace_channel_set_release(set_index);
@@ -456,11 +481,15 @@ gpu_trace_fini
 
   atomic_store(&stop_trace_flag, true);
 
-  int current_stream_id = atomic_load(&stream_id);
-  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
-    gpu_trace_channel_set_notify(set_index);
+  for (int t = 0; t < num_threads; ++t) {
+    gpu_trace_channel_stack_init(trace_channel_set_array[t]);
+    int my_streams = get_my_streams(t);
+    for (int set_index = 0; set_index < my_streams; ++set_index) {
+      gpu_trace_channel_set_notify(set_index);
+    }
   }
 
+
   while (atomic_load(&active_streams_counter));
 
 }
@@ -471,37 +500,28 @@ gpu_trace_channel_set_append
  gpu_trace_t *trace
 )
 {
-  int streams_per_thread;
-  control_knob_value_get_int("STREAMS_PER_THREAD", &streams_per_thread);
-  int max_threads_consumers;
-  control_knob_value_get_int("MAX_THREADS_CONSUMERS", &max_threads_consumers);
-  static int num_threads = 0;
-  static int num_streams = 0;
-  volatile bool new_thread = false;
+  static int stream_id = 0;
 
-  num_streams++;
-  atomic_fetch_add(&active_streams_counter, 1);
+  if (stream_id == 0) {
+    trace_channel_set_array[num_threads] = gpu_trace_channel_stack_alloc(max_threads_consumers);
 
-  if (num_streams >= (streams_per_thread * num_threads)) {
-    num_threads++;
-    new_thread = true;
-    gpu_trace_channel_stack_alloc(max_threads_consumers);
-  }
+    gpu_stream_set_t *stream_set = hpcrun_malloc(sizeof(gpu_stream_set_t));
+    stream_set->ptr=trace_channel_set_array[num_threads];
+    stream_set->thread_id=num_threads;
 
-  assert(streams_per_thread > 0);
-  assert(num_threads < max_threads_consumers);
-
-  if (new_thread) {
-    pthread_create(&trace->thread, NULL, (pthread_start_routine_t) gpu_trace_record, NULL);
+    pthread_create(&trace->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
+                   stream_set);
+    num_threads++;
+    assert(num_threads < max_threads_consumers);
   }
 
-  my_trace_set_id = num_threads - 1;
+  gpu_trace_channel_set_insert(trace->trace_channel, stream_id);
 
-  my_trace_set_id = gpu_trace_channel_get_stream_id(trace->trace_channel) - 500;
+  PRINT("set_index = %d -> stream = %u\n", num_threads, stream_id);
 
-  gpu_trace_channel_set_insert(trace->trace_channel, my_trace_set_id);
+  atomic_fetch_add(&active_streams_counter, 1);
+  stream_id = (stream_id+1) % streams_per_thread;
 
-  PRINT("set_index = %d -> stream = %u\n", num_threads, num_streams);
 
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace.h b/src/tool/hpcrun/gpu/gpu-trace.h
index b86720f56f..e3fb91291b 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.h
+++ b/src/tool/hpcrun/gpu/gpu-trace.h
@@ -103,7 +103,7 @@ gpu_trace_create
 void *
 gpu_trace_record
 (
- void
+void *args
 );
 
 
diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c
index 5d41b347e6..aeb3093768 100644
--- a/src/tool/hpcrun/sample-sources/nvidia.c
+++ b/src/tool/hpcrun/sample-sources/nvidia.c
@@ -268,8 +268,8 @@ METHOD_FN(init)
   // Reset cupti flags
   cupti_device_init();
 
-  // Init records
-  gpu_trace_init();
+//  // Init records
+//  gpu_trace_init();
 }
 
 static void
@@ -414,6 +414,9 @@ METHOD_FN(process_event_list, int lush_metrics)
 static void
 METHOD_FN(finalize_event_list)
 {
+  // Init records
+  gpu_trace_init();
+
   cupti_enable_activities();
 }
 
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index 1e3b8910f8..6de9070757 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -236,7 +236,7 @@ core_profile_trace_data_init(core_profile_trace_data_t * cptd, int id, cct_ctxt_
   // ----------------------------------------
   cptd->trace_min_time_us = 0;
   cptd->trace_max_time_us = 0;
-  cptd->traceOrdered = true;
+
   // ----------------------------------------
   // IO support
   // ----------------------------------------
diff --git a/src/tool/hpcrun/write_data.c b/src/tool/hpcrun/write_data.c
index 8d5a65d750..74b68c64b4 100644
--- a/src/tool/hpcrun/write_data.c
+++ b/src/tool/hpcrun/write_data.c
@@ -190,15 +190,14 @@ lazy_open_data_file(core_profile_trace_data_t * cptd)
   hpcrun_fmt_hdr_fwrite(fs,
                         HPCRUN_FMT_NV_prog, hpcrun_files_executable_name(),
                         HPCRUN_FMT_NV_progPath, hpcrun_files_executable_pathname(),
-			HPCRUN_FMT_NV_envPath, getenv("PATH"),
+			                  HPCRUN_FMT_NV_envPath, getenv("PATH"),
                         HPCRUN_FMT_NV_jobId, jobIdStr,
                         HPCRUN_FMT_NV_mpiRank, mpiRankStr,
                         HPCRUN_FMT_NV_tid, tidStr,
                         HPCRUN_FMT_NV_hostid, hostidStr,
                         HPCRUN_FMT_NV_pid, pidStr,
-			HPCRUN_FMT_NV_traceMinTime, traceMinTimeStr,
-			HPCRUN_FMT_NV_traceMaxTime, traceMaxTimeStr,
-                        HPCRUN_FMT_NV_traceOrdered, cptd->traceOrdered?"1":"0",
+                        HPCRUN_FMT_NV_traceMinTime, traceMinTimeStr,
+                        HPCRUN_FMT_NV_traceMaxTime, traceMaxTimeStr,
                         NULL);
   return fs;
 }

From 51b26b65e8b771cc483fbd5fad29469e60477ffe Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 09:25:31 -0500
Subject: [PATCH 063/177] added atomicity for num_threads

---
 src/tool/hpcrun/gpu/gpu-trace.c | 48 ++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 50f02a1725..d4383aa12b 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -83,7 +83,7 @@
 // macros
 //******************************************************************************
 
-#define DEBUG 0
+#define DEBUG 1
 #include "gpu-print.h"
 
 #define MIN(a,b)  (((a)<=(b))?(a):(b))
@@ -120,7 +120,7 @@ static atomic_ullong num_streams;
 
 static int streams_per_thread;
 static int max_threads_consumers;
-static int num_threads = 0;
+static atomic_uint num_threads;
 
 static void **trace_channel_set_array;
 
@@ -412,6 +412,7 @@ gpu_trace_init
 )
 {
   atomic_store(&stop_trace_flag, false);
+  atomic_store(&num_threads, 0);
   atomic_store(&active_streams_counter, 0);
   atomic_store(&num_streams, 0);
 
@@ -425,10 +426,11 @@ gpu_trace_init
 }
 
 static int
-get_my_streams(int thread_id){
-  int stream_count = atomic_load(&num_streams);
-  if (num_threads - 1 == thread_id){
-    return stream_count % streams_per_thread;
+get_num_my_streams(int thread_id){
+  int num_streams_loc = atomic_load(&num_streams);
+  int num_threads_loc = atomic_load(&num_threads);
+  if (num_threads_loc - 1 == thread_id){
+    return num_streams_loc % streams_per_thread;
   } else{
     return streams_per_thread;
   }
@@ -444,24 +446,23 @@ void * args
 {
   gpu_stream_set_t *stream_set = (gpu_stream_set_t *) args;
   gpu_trace_channel_stack_init(stream_set->ptr);
-  int my_streams;
+  int num_my_streams;
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
 
-    my_streams = get_my_streams(stream_set->thread_id);
+    num_my_streams = get_num_my_streams(stream_set->thread_id);
 
-
-    for (int set_index = 0; set_index < my_streams; ++set_index) {
+    for (int set_index = 0; set_index < num_my_streams; ++set_index) {
       gpu_trace_activities_process(set_index);
       gpu_trace_channel_set_await(set_index);
     }
 
   }
 
-  my_streams = get_my_streams(stream_set->thread_id);
+  num_my_streams = get_num_my_streams(stream_set->thread_id);
 
-  for (int set_index = 0; set_index < my_streams; ++set_index) {
+  for (int set_index = 0; set_index < num_my_streams; ++set_index) {
     gpu_trace_activities_process(set_index);
     gpu_trace_channel_set_await(set_index);
     gpu_trace_channel_set_release(set_index);
@@ -481,9 +482,10 @@ gpu_trace_fini
 
   atomic_store(&stop_trace_flag, true);
 
-  for (int t = 0; t < num_threads; ++t) {
+  int num_threads_loc = atomic_load(&num_threads);
+  for (int t = 0; t < num_threads_loc; ++t) {
     gpu_trace_channel_stack_init(trace_channel_set_array[t]);
-    int my_streams = get_my_streams(t);
+    int my_streams = get_num_my_streams(t);
     for (int set_index = 0; set_index < my_streams; ++set_index) {
       gpu_trace_channel_set_notify(set_index);
     }
@@ -500,29 +502,31 @@ gpu_trace_channel_set_append
  gpu_trace_t *trace
 )
 {
+  int num_threads_loc;
   static int stream_id = 0;
 
   if (stream_id == 0) {
-    trace_channel_set_array[num_threads] = gpu_trace_channel_stack_alloc(max_threads_consumers);
+
+    num_threads_loc = atomic_fetch_add(&num_threads, 1);
+    PRINT("gpu-trace: Create new thread (num = %u)\n", num_threads_loc);
+    assert(num_threads_loc < max_threads_consumers);
+
+    trace_channel_set_array[num_threads_loc] = gpu_trace_channel_stack_alloc(max_threads_consumers);
 
     gpu_stream_set_t *stream_set = hpcrun_malloc(sizeof(gpu_stream_set_t));
-    stream_set->ptr=trace_channel_set_array[num_threads];
-    stream_set->thread_id=num_threads;
+    stream_set->ptr=trace_channel_set_array[num_threads_loc];
+    stream_set->thread_id=num_threads_loc;
 
     pthread_create(&trace->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
                    stream_set);
-    num_threads++;
-    assert(num_threads < max_threads_consumers);
   }
 
   gpu_trace_channel_set_insert(trace->trace_channel, stream_id);
 
-  PRINT("set_index = %d -> stream = %u\n", num_threads, stream_id);
+  PRINT("gpu-trace:Thread_id = %d -> stream = %u\n", num_threads_loc, stream_id);
 
   atomic_fetch_add(&active_streams_counter, 1);
   stream_id = (stream_id+1) % streams_per_thread;
-
-
 }
 
 

From 6e545e69304441e0b0543f653b14aa9406badc77 Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 10:08:56 -0500
Subject: [PATCH 064/177] main: control_knob_init must go before
 hpcrun_registered_sources_init

---
 src/tool/hpcrun/gpu/gpu-trace.c         | 2 +-
 src/tool/hpcrun/main.c                  | 4 ++--
 src/tool/hpcrun/sample-sources/nvidia.c | 7 ++-----
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index d4383aa12b..70234e7a85 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -502,7 +502,7 @@ gpu_trace_channel_set_append
  gpu_trace_t *trace
 )
 {
-  int num_threads_loc;
+  static int num_threads_loc;
   static int stream_id = 0;
 
   if (stream_id == 0) {
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index fd48cc0e38..82630a1fda 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -945,10 +945,10 @@ monitor_init_process(int *argc, char **argv, void* data)
   // fnbounds must be after module_ignore_map
   fnbounds_init();
 
-  hpcrun_registered_sources_init();
-
   control_knob_init();
 
+  hpcrun_registered_sources_init();
+
   hpcrun_do_custom_init();
 
   // for debugging, limit the life of the execution with an alarm.
diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c
index aeb3093768..5d41b347e6 100644
--- a/src/tool/hpcrun/sample-sources/nvidia.c
+++ b/src/tool/hpcrun/sample-sources/nvidia.c
@@ -268,8 +268,8 @@ METHOD_FN(init)
   // Reset cupti flags
   cupti_device_init();
 
-//  // Init records
-//  gpu_trace_init();
+  // Init records
+  gpu_trace_init();
 }
 
 static void
@@ -414,9 +414,6 @@ METHOD_FN(process_event_list, int lush_metrics)
 static void
 METHOD_FN(finalize_event_list)
 {
-  // Init records
-  gpu_trace_init();
-
   cupti_enable_activities();
 }
 

From 43b81e3b092acca41d2bcf551707e8c2f98c9f7c Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 12:56:05 -0500
Subject: [PATCH 065/177] working version: fix last-thread stream count, use
 thread-local stream id, disable monitor_begin_process_exit

---
 src/tool/hpcrun/gpu/gpu-trace-channel.c |  2 +-
 src/tool/hpcrun/gpu/gpu-trace.c         | 19 +++++++++++--------
 src/tool/hpcrun/main.c                  |  3 +++
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index 7d7c5c7cd2..d593735c0e 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -55,7 +55,7 @@
 
 #define SECONDS_UNTIL_WAKEUP 2
 
-#define DEBUG 1
+#define DEBUG 0
 
 //******************************************************************************
 // local includes
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 70234e7a85..4588d0e52d 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -83,7 +83,7 @@
 // macros
 //******************************************************************************
 
-#define DEBUG 1
+#define DEBUG 0
 #include "gpu-print.h"
 
 #define MIN(a,b)  (((a)<=(b))?(a):(b))
@@ -430,7 +430,8 @@ get_num_my_streams(int thread_id){
   int num_streams_loc = atomic_load(&num_streams);
   int num_threads_loc = atomic_load(&num_threads);
   if (num_threads_loc - 1 == thread_id){
-    return num_streams_loc % streams_per_thread;
+    int my_streams = num_streams_loc % streams_per_thread;
+    return my_streams?my_streams:streams_per_thread;
   } else{
     return streams_per_thread;
   }
@@ -494,6 +495,8 @@ gpu_trace_fini
 
   while (atomic_load(&active_streams_counter));
 
+  printf("NUM_TRACE_THREADS = %d\n", atomic_load(&num_threads));
+
 }
 
 static void
@@ -502,10 +505,10 @@ gpu_trace_channel_set_append
  gpu_trace_t *trace
 )
 {
-  static int num_threads_loc;
-  static int stream_id = 0;
+  int num_threads_loc;
+  static __thread int stream_id_loc = 0;
 
-  if (stream_id == 0) {
+  if (stream_id_loc == 0) {
 
     num_threads_loc = atomic_fetch_add(&num_threads, 1);
     PRINT("gpu-trace: Create new thread (num = %u)\n", num_threads_loc);
@@ -521,12 +524,12 @@ gpu_trace_channel_set_append
                    stream_set);
   }
 
-  gpu_trace_channel_set_insert(trace->trace_channel, stream_id);
+  gpu_trace_channel_set_insert(trace->trace_channel, stream_id_loc);
 
-  PRINT("gpu-trace:Thread_id = %d -> stream = %u\n", num_threads_loc, stream_id);
+  PRINT("gpu-trace:Thread_id = %d -> stream = %u\n", num_threads_loc, stream_id_loc);
 
   atomic_fetch_add(&active_streams_counter, 1);
-  stream_id = (stream_id+1) % streams_per_thread;
+  stream_id_loc = (stream_id_loc+1) % streams_per_thread;
 }
 
 
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 82630a1fda..f7b874196b 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -1008,6 +1008,8 @@ monitor_fini_process(int how, void* data)
 void
 monitor_begin_process_exit(int how)
 {
+//TODO:Check with John if we should delete this or adjust hpcrun_fini_internal
+#if 0
   if (hpcrun_get_disabled()) {
     return;
   }
@@ -1029,6 +1031,7 @@ monitor_begin_process_exit(int how)
 
 
   hpcrun_safe_exit();
+#endif
 }
 
 static fork_data_t from_fork;

From fe711815b002eae24820536584340ca4cf45c143 Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 21:17:42 -0500
Subject: [PATCH 066/177] gpu_trace_demultiplexer implemented

---
 src/tool/hpcrun/Makefile.am                   |   1 +
 src/tool/hpcrun/Makefile.in                   | 108 +++++---
 .../hpcrun/gpu/gpu-activity-multiplexer.c     |  10 +-
 .../hpcrun/gpu/gpu-activity-multiplexer.h     |   4 +-
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c   |  73 ++++--
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h   |  40 ++-
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c | 231 ++++++++++++++++++
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h | 126 ++++++++++
 src/tool/hpcrun/gpu/gpu-trace.c               | 137 ++---------
 9 files changed, 541 insertions(+), 189 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
 create mode 100644 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 4aba76313d..d1a1cd5c2f 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -384,6 +384,7 @@ MY_BASE_FILES =				\
 	gpu/gpu-trace-channel.c		\
 	gpu/gpu-trace-item.c		\
 	gpu/gpu-trace-channel-set.c	\
+	gpu/gpu-trace-demultiplexer.c	\
 	\
 	ompt/ompt-callstack.c           \
 	ompt/ompt-defer.c               \
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index fb18716199..73208f5c76 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -494,18 +494,19 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
 	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
-	gpu/gpu-trace-channel-set.c ompt/ompt-callstack.c \
-	ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
-	ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
-	ompt/ompt-region-debug.c ompt/ompt-placeholders.c \
-	ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
-	syscalls/poll.c syscalls/ppoll.c syscalls/select.c \
-	utilities/executable-path.h utilities/executable-path.c \
-	utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
-	utilities/ip-normalized.h utilities/ip-normalized.c \
-	utilities/line_wrapping.c utilities/timer.c \
-	utilities/tokenize.h utilities/tokenize.c utilities/unlink.h \
-	utilities/unlink.c trampoline/common/trampoline_eager.c \
+	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
+	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
+	ompt/ompt-defer-write.c ompt/ompt-interface.c \
+	ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
+	ompt/ompt-placeholders.c ompt/ompt-device-map.c \
+	ompt/ompt-task.c ompt/ompt-thread.c syscalls/poll.c \
+	syscalls/ppoll.c syscalls/select.c utilities/executable-path.h \
+	utilities/executable-path.c utilities/hpcrun-nanotime.h \
+	utilities/hpcrun-nanotime.c utilities/ip-normalized.h \
+	utilities/ip-normalized.c utilities/line_wrapping.c \
+	utilities/timer.c utilities/tokenize.h utilities/tokenize.c \
+	utilities/unlink.h utilities/unlink.c \
+	trampoline/common/trampoline_eager.c \
 	trampoline/common/trampoline_lazy.c \
 	sample-sources/perf/event_custom.c \
 	sample-sources/perf/linux_perf.c \
@@ -673,6 +674,7 @@ am__objects_15 = utilities/libhpcrun_la-first_func.lo \
 	gpu/libhpcrun_la-gpu-trace-channel.lo \
 	gpu/libhpcrun_la-gpu-trace-item.lo \
 	gpu/libhpcrun_la-gpu-trace-channel-set.lo \
+	gpu/libhpcrun_la-gpu-trace-demultiplexer.lo \
 	ompt/libhpcrun_la-ompt-callstack.lo \
 	ompt/libhpcrun_la-ompt-defer.lo \
 	ompt/libhpcrun_la-ompt-device.lo \
@@ -914,18 +916,19 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
 	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
-	gpu/gpu-trace-channel-set.c ompt/ompt-callstack.c \
-	ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
-	ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
-	ompt/ompt-region-debug.c ompt/ompt-placeholders.c \
-	ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
-	syscalls/poll.c syscalls/ppoll.c syscalls/select.c \
-	utilities/executable-path.h utilities/executable-path.c \
-	utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
-	utilities/ip-normalized.h utilities/ip-normalized.c \
-	utilities/line_wrapping.c utilities/timer.c \
-	utilities/tokenize.h utilities/tokenize.c utilities/unlink.h \
-	utilities/unlink.c trampoline/common/trampoline_eager.c \
+	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
+	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
+	ompt/ompt-defer-write.c ompt/ompt-interface.c \
+	ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
+	ompt/ompt-placeholders.c ompt/ompt-device-map.c \
+	ompt/ompt-task.c ompt/ompt-thread.c syscalls/poll.c \
+	syscalls/ppoll.c syscalls/select.c utilities/executable-path.h \
+	utilities/executable-path.c utilities/hpcrun-nanotime.h \
+	utilities/hpcrun-nanotime.c utilities/ip-normalized.h \
+	utilities/ip-normalized.c utilities/line_wrapping.c \
+	utilities/timer.c utilities/tokenize.h utilities/tokenize.c \
+	utilities/unlink.h utilities/unlink.c \
+	trampoline/common/trampoline_eager.c \
 	trampoline/common/trampoline_lazy.c \
 	sample-sources/perf/event_custom.c \
 	sample-sources/perf/linux_perf.c \
@@ -1095,6 +1098,7 @@ am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace-channel.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace-item.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT) \
 	ompt/libhpcrun_o-ompt-callstack.$(OBJEXT) \
 	ompt/libhpcrun_o-ompt-defer.$(OBJEXT) \
 	ompt/libhpcrun_o-ompt-device.$(OBJEXT) \
@@ -1821,20 +1825,21 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
 	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
 	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
-	gpu/gpu-trace-channel-set.c ompt/ompt-callstack.c \
-	ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
-	ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
-	ompt/ompt-region-debug.c ompt/ompt-placeholders.c \
-	ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
-	syscalls/poll.c syscalls/ppoll.c syscalls/select.c \
-	utilities/executable-path.h utilities/executable-path.c \
-	utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
-	utilities/ip-normalized.h utilities/ip-normalized.c \
-	utilities/line_wrapping.c utilities/timer.c \
-	utilities/tokenize.h utilities/tokenize.c utilities/unlink.h \
-	utilities/unlink.c $(am__append_8) $(am__append_9) \
-	$(am__append_10) $(am__append_12) $(am__append_13) \
-	$(am__append_14) $(am__append_15) $(am__append_17)
+	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
+	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
+	ompt/ompt-defer-write.c ompt/ompt-interface.c \
+	ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
+	ompt/ompt-placeholders.c ompt/ompt-device-map.c \
+	ompt/ompt-task.c ompt/ompt-thread.c syscalls/poll.c \
+	syscalls/ppoll.c syscalls/select.c utilities/executable-path.h \
+	utilities/executable-path.c utilities/hpcrun-nanotime.h \
+	utilities/hpcrun-nanotime.c utilities/ip-normalized.h \
+	utilities/ip-normalized.c utilities/line_wrapping.c \
+	utilities/timer.c utilities/tokenize.h utilities/tokenize.c \
+	utilities/unlink.h utilities/unlink.c $(am__append_8) \
+	$(am__append_9) $(am__append_10) $(am__append_12) \
+	$(am__append_13) $(am__append_14) $(am__append_15) \
+	$(am__append_17)
 MY_DYNAMIC_FILES = \
 	fnbounds/fnbounds_client.c	\
 	fnbounds/fnbounds_dynamic.c	\
@@ -2542,6 +2547,8 @@ gpu/libhpcrun_la-gpu-trace-item.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-trace-channel-set.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
 ompt/$(am__dirstamp):
 	@$(MKDIR_P) ompt
 	@: > ompt/$(am__dirstamp)
@@ -3204,6 +3211,8 @@ gpu/libhpcrun_o-gpu-trace-item.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT):  \
+	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
 ompt/libhpcrun_o-ompt-callstack.$(OBJEXT): ompt/$(am__dirstamp) \
 	ompt/$(DEPDIR)/$(am__dirstamp)
 ompt/libhpcrun_o-ompt-defer.$(OBJEXT): ompt/$(am__dirstamp) \
@@ -3733,6 +3742,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-stream-id-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-channel-set.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-channel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-demultiplexer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-item.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-channel.Po@am__quote@
@@ -3762,6 +3772,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-stream-id-map.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-channel-set.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-channel.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@
@@ -4947,6 +4958,13 @@ gpu/libhpcrun_la-gpu-trace-channel-set.lo: gpu/gpu-trace-channel-set.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-trace-channel-set.lo `test -f 'gpu/gpu-trace-channel-set.c' || echo '$(srcdir)/'`gpu/gpu-trace-channel-set.c
 
+gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/gpu-trace-demultiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-trace-demultiplexer.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-demultiplexer.Tpo -c -o gpu/libhpcrun_la-gpu-trace-demultiplexer.lo `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-demultiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-demultiplexer.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-trace-demultiplexer.c' object='gpu/libhpcrun_la-gpu-trace-demultiplexer.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-trace-demultiplexer.lo `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c
+
 ompt/libhpcrun_la-ompt-callstack.lo: ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_la-ompt-callstack.lo -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo -c -o ompt/libhpcrun_la-ompt-callstack.lo `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Plo
@@ -7159,6 +7177,20 @@ gpu/libhpcrun_o-gpu-trace-channel-set.obj: gpu/gpu-trace-channel-set.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-trace-channel-set.obj `if test -f 'gpu/gpu-trace-channel-set.c'; then $(CYGPATH_W) 'gpu/gpu-trace-channel-set.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-trace-channel-set.c'; fi`
 
+gpu/libhpcrun_o-gpu-trace-demultiplexer.o: gpu/gpu-trace-demultiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-trace-demultiplexer.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.o `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-trace-demultiplexer.c' object='gpu/libhpcrun_o-gpu-trace-demultiplexer.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.o `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c
+
+gpu/libhpcrun_o-gpu-trace-demultiplexer.obj: gpu/gpu-trace-demultiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-trace-demultiplexer.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.obj `if test -f 'gpu/gpu-trace-demultiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-trace-demultiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-trace-demultiplexer.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-trace-demultiplexer.c' object='gpu/libhpcrun_o-gpu-trace-demultiplexer.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.obj `if test -f 'gpu/gpu-trace-demultiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-trace-demultiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-trace-demultiplexer.c'; fi`
+
 ompt/libhpcrun_o-ompt-callstack.o: ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_o-ompt-callstack.o -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo -c -o ompt/libhpcrun_o-ompt-callstack.o `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Po
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index abc80f93ad..8ba5fd09f1 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -41,21 +41,25 @@
 //
 // ******************************************************* EndRiceCopyright *
 
+
 #include <pthread.h>
 
 #include <lib/prof-lean/stdatomic.h>
 
 
+#define DEBUG 0
+
 #include "gpu-activity.h"
 #include "gpu-activity-channel.h"
-#include "gpu-operation-channel-set.h"
 #include "gpu-activity-multiplexer.h"
-#include "gpu-monitoring-thread-api.h"
 #include "gpu-activity-process.h"
+#include "gpu-monitoring-thread-api.h"
+#include "gpu-operation-channel-set.h"
 #include "gpu-trace.h"
-
 #include "gpu-print.h"
 
+
+
 //TODO: Figure out how to get max number of application threads
 #define max_threads_consumers 1000
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
index dc3832bf3f..62f68c8dc3 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
@@ -42,8 +42,8 @@
 // ******************************************************* EndRiceCopyright *
 
 
-#ifndef gpu_channel_multiplexer_h
-#define gpu_channel_multiplexer_h
+#ifndef gpu_activity_multiplexer_h
+#define gpu_activity_multiplexer_h
 
 #include <hpcrun/thread_data.h>
 #include "gpu-operation-channel.h"
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index aaa7c8f416..c4d48e8bfd 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -102,7 +102,7 @@ typed_stack_declare_type(gpu_trace_channel_ptr_t);
 // local data
 //******************************************************************************
 
-static __thread
+//static __thread
 typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *gpu_trace_channel_stack;
 
 
@@ -135,10 +135,13 @@ static void
 gpu_trace_channel_set_forall
 (
  gpu_trace_channel_fn_t channel_fn,
+ void *gpu_trace_channel_stack_ptr,
  int set_index
 
 )
 {
+  gpu_trace_channel_stack = gpu_trace_channel_stack_ptr;
+
   channel_stack_forall(&gpu_trace_channel_stack[set_index], channel_forone,
     channel_fn);
 }
@@ -149,25 +152,17 @@ gpu_trace_channel_set_forall
 // interface operations
 //******************************************************************************
 
-void
-gpu_trace_channel_stack_init
-(
- void *trace_channel_set_ptr
-)
-{
-  gpu_trace_channel_stack = trace_channel_set_ptr;
-}
-
 void *
 gpu_trace_channel_stack_alloc(int size){
-	gpu_trace_channel_stack = hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
-  return gpu_trace_channel_stack;
+//	gpu_trace_channel_stack = hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
+  return hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
 }
 
 void
 gpu_trace_channel_set_insert
 (
  gpu_trace_channel_t *channel,
+ void *gpu_trace_channel_stack_ptr,
  int set_index
 )
 {
@@ -183,16 +178,62 @@ gpu_trace_channel_set_insert
   channel_stack_elem_ptr_set(e, 0);
 
 	  // add the entry to the channel stack
+  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
   channel_stack_push(&gpu_trace_channel_stack[set_index], e);
 }
 
 
 void
-gpu_trace_channel_set_apply
+gpu_trace_channel_set_process
 (
- gpu_trace_channel_fn_t channel_fn,
- int set_index
+void *gpu_trace_channel_stack_ptr,
+int set_index
 )
 {
-  gpu_trace_channel_set_forall(channel_fn, set_index);
+  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
+  gpu_trace_channel_set_forall(gpu_trace_channel_consume,
+                               gpu_trace_channel_stack,
+                               set_index);
 }
+
+
+void
+gpu_trace_channel_set_release
+(
+void *gpu_trace_channel_stack_ptr,
+int set_index
+)
+{
+  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
+  gpu_trace_channel_set_forall(gpu_trace_stream_release,
+                               gpu_trace_channel_stack,
+                               set_index);
+}
+
+
+void
+gpu_trace_channel_set_notify
+(
+void *gpu_trace_channel_stack_ptr,
+int set_index
+)
+{
+  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
+  gpu_trace_channel_set_forall(gpu_trace_channel_signal_consumer,
+                               gpu_trace_channel_stack,
+                               set_index);
+}
+
+
+void
+gpu_trace_channel_set_await
+(
+void *gpu_trace_channel_stack_ptr,
+int set_index
+)
+{
+  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
+  gpu_trace_channel_set_forall(gpu_trace_channel_await,
+                               gpu_trace_channel_stack,
+                               set_index);
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index a4136f6fd2..662745302e 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -69,14 +69,6 @@ typedef void (*gpu_trace_channel_fn_t)
 // interface operations
 //******************************************************************************
 
-
-void
-gpu_trace_channel_stack_init
-(
- void *trace_channel_set_ptr
-);
-
-
 void *
 gpu_trace_channel_stack_alloc
 (
@@ -88,17 +80,43 @@ void
 gpu_trace_channel_set_insert
 (
  gpu_trace_channel_t *channel,
+ void *gpu_trace_channel_stack,
  int set_index
 );
 
 
 void
-gpu_trace_channel_set_apply
+gpu_trace_channel_set_process
 (
- gpu_trace_channel_fn_t channel_fn,
- int set_index
+void *gpu_trace_channel_stack,
+int set_index
+);
+
+
+void
+gpu_trace_channel_set_release
+(
+void *gpu_trace_channel_stack,
+int set_index
 );
 
 
+void
+gpu_trace_channel_set_notify
+(
+void *gpu_trace_channel_stack,
+int set_index
+);
+
+void
+gpu_trace_channel_set_await
+(
+void *gpu_trace_channel_stack,
+int set_index
+);
+
+
+
+
 
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
new file mode 100644
index 0000000000..8c717e025b
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -0,0 +1,231 @@
+
+// * BeginRiceCopyright *****************************************************
+// -*-Mode: C++;-*- // technically C99
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#include <lib/prof-lean/stdatomic.h>
+#include <memory/hpcrun-malloc.h>
+#include <pthread.h>
+
+
+#define DEBUG 0
+
+
+#include "gpu-trace-demultiplexer.h"
+#include "gpu-trace.h"
+#include "gpu-print.h"
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef void *(*pthread_start_routine_t)(void *);
+
+typedef struct gpu_trace_channel_set_t{  // one node of the channel-set list
+void *next;  // following node in the list; NULL for the last node
+pthread_t thread;  // thread started in gpu_trace_channel_set_create to run gpu_trace_record
+void *channel_set_ptr;  // array returned by gpu_trace_channel_stack_alloc
+int channel_set_index;  // creation order, taken from channel_set_index_count
+atomic_uint channel_index;  // channels inserted so far; also the next slot to fill
+}gpu_trace_channel_set_t;
+
+
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+static gpu_trace_channel_set_t *trace_channel_set_list_head = NULL;  // first set; NULL until the first push
+static gpu_trace_channel_set_t *trace_channel_set_list_tail = NULL;  // last set; new channels are inserted here
+
+static int streams_per_thread;  // STREAMS_PER_THREAD knob: channels a set holds before a new set is appended
+static uint32_t channel_set_index_count;  // index handed to the next set that gets created
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static gpu_trace_channel_set_t *
+gpu_trace_channel_set_create
+(
+void
+)
+{
+  gpu_trace_channel_set_t *new_channel_set= hpcrun_malloc(sizeof(gpu_trace_channel_set_t));
+  new_channel_set->next = NULL;
+  // one slot per channel: gpu_trace_demultiplexer_push fills slots
+  // 0 .. streams_per_thread - 1 of this array, so a single slot is too small
+  new_channel_set->channel_set_ptr = gpu_trace_channel_stack_alloc(streams_per_thread);
+  new_channel_set->channel_set_index = channel_set_index_count++;
+  atomic_store(&new_channel_set->channel_index, 0);
+  pthread_create(&new_channel_set->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
+                 new_channel_set);
+  return new_channel_set;
+}
+
+
+static void
+gpu_trace_channel_set_init
+(
+void
+)
+{
+  // invoked by gpu_trace_demultiplexer_push while the list is still empty
+  control_knob_value_get_int("STREAMS_PER_THREAD", &streams_per_thread);
+  channel_set_index_count = 0;
+  // the first channel set is both head and tail of the list
+  trace_channel_set_list_head = gpu_trace_channel_set_create();
+  trace_channel_set_list_tail = trace_channel_set_list_head;
+  // exactly one conversion for the single value being printed
+  PRINT("streams_per_thread = %d\n", streams_per_thread);
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void *
+gpu_trace_channel_set_get_ptr
+(
+gpu_trace_channel_set_t *channel_set  // set whose channel array is requested
+)
+{
+  return channel_set->channel_set_ptr;  // pointer stored by gpu_trace_channel_set_create
+}
+
+
+int
+gpu_trace_channel_set_get_channel_num
+(
+gpu_trace_channel_set_t *channel_set  // set to query
+)
+{
+  return atomic_load(&channel_set->channel_index);  // atomic read of the count of channels inserted so far
+}
+
+void
+gpu_trace_demultiplexer_fini
+(
+void
+)
+{  // stub: the body is empty, so calling this performs no work
+}
+
+
+pthread_t
+gpu_trace_demultiplexer_push
+(
+gpu_trace_channel_t *trace_channel
+)
+{
+  // Adds trace_channel to the tail channel set and returns that set's thread.
+  if (trace_channel_set_list_head == NULL){
+    gpu_trace_channel_set_init();
+  }
+  // NOTE(review): this check and the append below are separate steps -- confirm callers are serialized
+  if (atomic_load(&trace_channel_set_list_tail->channel_index) == streams_per_thread){
+    // Create new channel_set
+    trace_channel_set_list_tail->next = gpu_trace_channel_set_create();
+    trace_channel_set_list_tail = trace_channel_set_list_tail->next;
+  }
+  // the value of channel_index before the increment is this channel's slot in the tail set
+  gpu_trace_channel_set_insert(trace_channel,
+                               trace_channel_set_list_tail->channel_set_ptr,
+                               atomic_fetch_add(&trace_channel_set_list_tail->channel_index,1));
+  // thread that gpu_trace_channel_set_create started for the tail set
+  return trace_channel_set_list_tail->thread;
+}
+
+
+void
+gpu_trace_demultiplexer_notify
+(
+ void
+)
+{
+  gpu_trace_channel_set_t *iter;
+
+  for (iter = trace_channel_set_list_head; iter != NULL; iter = iter->next){
+
+    int channel_num = atomic_load(&iter->channel_index);
+    for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
+      gpu_trace_channel_set_notify(iter->channel_set_ptr, channel_idx);
+    }
+  }
+}
+
+
+
+
+void
+gpu_trace_demultiplexer_release
+(
+ void
+)
+{  // stub: the body is empty, so calling this performs no work
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
new file mode 100644
index 0000000000..3be14d2cda
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
@@ -0,0 +1,126 @@
+
+// * BeginRiceCopyright *****************************************************
+// -*-Mode: C++;-*- // technically C99
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+#ifndef gpu_trace_demultiplexer_h
+#define gpu_trace_demultiplexer_h
+
+//#include <hpcrun/thread_data.h>
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+typedef struct gpu_trace_channel_set_t gpu_trace_channel_set_t;
+typedef struct gpu_trace_channel_t gpu_trace_channel_t;
+typedef struct gpu_trace_t gpu_trace_t;
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+
+void *
+gpu_trace_channel_set_get_ptr
+(
+gpu_trace_channel_set_t *channel_set
+);
+
+int
+gpu_trace_channel_set_get_channel_num
+(
+gpu_trace_channel_set_t *channel_set
+);
+
+void
+gpu_trace_demultiplexer_fini
+(
+void
+);
+
+
+pthread_t
+gpu_trace_demultiplexer_push
+(
+gpu_trace_channel_t *trace_channel
+);
+
+
+void
+gpu_trace_demultiplexer_notify
+(
+void
+);
+
+
+void
+gpu_trace_demultiplexer_release
+(
+void
+);
+
+
+
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 4588d0e52d..45f69d58d3 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -72,10 +72,12 @@
 
 #include "gpu-context-id-map.h"
 #include "gpu-monitoring.h"
+#include "gpu-trace.h"
 #include "gpu-trace-channel.h"
+#include "gpu-trace-demultiplexer.h"
 #include "gpu-trace-item.h"
 #include "gpu-trace-channel-set.h"
-#include "gpu-trace.h"
+
 
 
 
@@ -118,12 +120,6 @@ static atomic_ullong active_streams_counter;
 
 static atomic_ullong num_streams;
 
-static int streams_per_thread;
-static int max_threads_consumers;
-static atomic_uint num_threads;
-
-static void **trace_channel_set_array;
-
 static __thread uint64_t stream_start = 0;
 
 
@@ -316,46 +312,6 @@ consume_one_trace_item
 }
 
 
-static void
-gpu_trace_activities_process
-(
- int set_index
-)
-{
-  gpu_trace_channel_set_apply(gpu_trace_channel_consume, set_index);
-}
-
-
-static void
-gpu_trace_channel_set_release
-(
-int set_index
-)
-{
-  gpu_trace_channel_set_apply(gpu_trace_stream_release, set_index);
-}
-
-
-static void
-gpu_trace_channel_set_notify
-(
-int set_index
-)
-{
-  gpu_trace_channel_set_apply(gpu_trace_channel_signal_consumer, set_index);
-}
-
-
-static void
-gpu_trace_channel_set_await
-(
-int set_index
-)
-{
-  gpu_trace_channel_set_apply(gpu_trace_channel_await, set_index);
-}
-
-
 static int
 gpu_trace_stream_id
 (
@@ -412,32 +368,10 @@ gpu_trace_init
 )
 {
   atomic_store(&stop_trace_flag, false);
-  atomic_store(&num_threads, 0);
   atomic_store(&active_streams_counter, 0);
   atomic_store(&num_streams, 0);
-
-  control_knob_value_get_int("STREAMS_PER_THREAD", &streams_per_thread);
-  control_knob_value_get_int("MAX_THREADS_CONSUMERS", &max_threads_consumers);
-
-  printf("streams_per_thread, max_threads_consumers = %d %d\n", streams_per_thread, max_threads_consumers);
-
-  trace_channel_set_array = hpcrun_malloc(sizeof(void *) * max_threads_consumers);
-
 }
 
-static int
-get_num_my_streams(int thread_id){
-  int num_streams_loc = atomic_load(&num_streams);
-  int num_threads_loc = atomic_load(&num_threads);
-  if (num_threads_loc - 1 == thread_id){
-    int my_streams = num_streams_loc % streams_per_thread;
-    return my_streams?my_streams:streams_per_thread;
-  } else{
-    return streams_per_thread;
-  }
-}
-
-
 void *
 gpu_trace_record
 (
@@ -445,28 +379,26 @@ gpu_trace_record
 void * args
 )
 {
-  gpu_stream_set_t *stream_set = (gpu_stream_set_t *) args;
-  gpu_trace_channel_stack_init(stream_set->ptr);
-  int num_my_streams;
+  gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args;
+  void *channel_set_ptr = gpu_trace_channel_set_get_ptr(channel_set);
+  int channel_num;
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
 
-    num_my_streams = get_num_my_streams(stream_set->thread_id);
-
-    for (int set_index = 0; set_index < num_my_streams; ++set_index) {
-      gpu_trace_activities_process(set_index);
-      gpu_trace_channel_set_await(set_index);
+    channel_num = gpu_trace_channel_set_get_channel_num(channel_set);
+    for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
+      gpu_trace_channel_set_process(channel_set_ptr, channel_idx);
+      gpu_trace_channel_set_await(channel_set_ptr, channel_idx);
     }
 
   }
 
-  num_my_streams = get_num_my_streams(stream_set->thread_id);
-
-  for (int set_index = 0; set_index < num_my_streams; ++set_index) {
-    gpu_trace_activities_process(set_index);
-    gpu_trace_channel_set_await(set_index);
-    gpu_trace_channel_set_release(set_index);
+  channel_num = gpu_trace_channel_set_get_channel_num(channel_set);
+  for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
+    gpu_trace_channel_set_process(channel_set_ptr, channel_idx);
+    gpu_trace_channel_set_await(channel_set_ptr, channel_idx);
+    gpu_trace_channel_set_release(channel_set_ptr, channel_idx);
   }
 
   return NULL;
@@ -483,53 +415,20 @@ gpu_trace_fini
 
   atomic_store(&stop_trace_flag, true);
 
-  int num_threads_loc = atomic_load(&num_threads);
-  for (int t = 0; t < num_threads_loc; ++t) {
-    gpu_trace_channel_stack_init(trace_channel_set_array[t]);
-    int my_streams = get_num_my_streams(t);
-    for (int set_index = 0; set_index < my_streams; ++set_index) {
-      gpu_trace_channel_set_notify(set_index);
-    }
-  }
-
+  gpu_trace_demultiplexer_notify();
 
   while (atomic_load(&active_streams_counter));
-
-  printf("NUM_TRACE_THREADS = %d\n", atomic_load(&num_threads));
-
 }
 
+
 static void
 gpu_trace_channel_set_append
 (
  gpu_trace_t *trace
 )
 {
-  int num_threads_loc;
-  static __thread int stream_id_loc = 0;
-
-  if (stream_id_loc == 0) {
-
-    num_threads_loc = atomic_fetch_add(&num_threads, 1);
-    PRINT("gpu-trace: Create new thread (num = %u)\n", num_threads_loc);
-    assert(num_threads_loc < max_threads_consumers);
-
-    trace_channel_set_array[num_threads_loc] = gpu_trace_channel_stack_alloc(max_threads_consumers);
-
-    gpu_stream_set_t *stream_set = hpcrun_malloc(sizeof(gpu_stream_set_t));
-    stream_set->ptr=trace_channel_set_array[num_threads_loc];
-    stream_set->thread_id=num_threads_loc;
-
-    pthread_create(&trace->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
-                   stream_set);
-  }
-
-  gpu_trace_channel_set_insert(trace->trace_channel, stream_id_loc);
-
-  PRINT("gpu-trace:Thread_id = %d -> stream = %u\n", num_threads_loc, stream_id_loc);
-
+  trace->thread = gpu_trace_demultiplexer_push(trace->trace_channel);
   atomic_fetch_add(&active_streams_counter, 1);
-  stream_id_loc = (stream_id_loc+1) % streams_per_thread;
 }
 
 

From 586686eacc4b921003006f694c5d7044ea24f85e Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 22:30:34 -0500
Subject: [PATCH 067/177] gpu_trace_set_apply version -> making empty files

---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c   | 59 +++++++++----------
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h   | 22 +++----
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c | 10 +---
 src/tool/hpcrun/gpu/gpu-trace.c               | 21 ++-----
 4 files changed, 46 insertions(+), 66 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index c4d48e8bfd..8261673e18 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -52,10 +52,10 @@
 #include <hpcrun/control-knob.h>
 
 #include "gpu-print.h"
+#include "gpu-trace.h"
 #include "gpu-trace-channel.h"
 #include "gpu-trace-channel-set.h"
-
-#include "gpu-trace.h"
+#include "gpu-trace-demultiplexer.h"
 
 
 
@@ -87,7 +87,6 @@
 
 typedef gpu_trace_channel_t* gpu_trace_channel_ptr_t;
 
-
 typedef struct {
   s_element_ptr_t next;
   gpu_trace_channel_ptr_t channel;
@@ -182,58 +181,58 @@ gpu_trace_channel_set_insert
   channel_stack_push(&gpu_trace_channel_stack[set_index], e);
 }
 
+static void
+gpu_trace_channel_set_apply
+(
+gpu_trace_channel_fn_t channel_fn,  // operation handed to gpu_trace_channel_set_forall for each slot
+gpu_trace_channel_set_t *channel_set  // set whose channels are visited
+)
+{
+  gpu_trace_channel_stack =  gpu_trace_channel_set_get_ptr(channel_set);  // aim the file-scope pointer at this set's array
+  int num_streams = gpu_trace_channel_set_get_channel_num(channel_set);  // read once; the loop bound stays fixed
+  // visit slots 0 .. num_streams - 1 of the set
+  for (int channel_idx = 0; channel_idx < num_streams; ++channel_idx) {
+    gpu_trace_channel_set_forall(channel_fn,
+                                 gpu_trace_channel_stack,
+                                 channel_idx);
+  }
+}
 
 void
 gpu_trace_channel_set_process
 (
-void *gpu_trace_channel_stack_ptr,
-int set_index
+gpu_trace_channel_set_t *channel_set
 )
 {
-  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
-  gpu_trace_channel_set_forall(gpu_trace_channel_consume,
-                               gpu_trace_channel_stack,
-                               set_index);
+  gpu_trace_channel_set_apply(gpu_trace_channel_consume, channel_set);
 }
 
 
 void
-gpu_trace_channel_set_release
+gpu_trace_channel_set_await
 (
-void *gpu_trace_channel_stack_ptr,
-int set_index
+gpu_trace_channel_set_t *channel_set
 )
 {
-  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
-  gpu_trace_channel_set_forall(gpu_trace_stream_release,
-                               gpu_trace_channel_stack,
-                               set_index);
+  gpu_trace_channel_set_apply(gpu_trace_channel_await, channel_set);
 }
 
 
 void
-gpu_trace_channel_set_notify
+gpu_trace_channel_set_release
 (
-void *gpu_trace_channel_stack_ptr,
-int set_index
+gpu_trace_channel_set_t *channel_set
 )
 {
-  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
-  gpu_trace_channel_set_forall(gpu_trace_channel_signal_consumer,
-                               gpu_trace_channel_stack,
-                               set_index);
+  gpu_trace_channel_set_apply(gpu_trace_stream_release, channel_set);
 }
 
 
 void
-gpu_trace_channel_set_await
+gpu_trace_channel_set_notify
 (
-void *gpu_trace_channel_stack_ptr,
-int set_index
+gpu_trace_channel_set_t *channel_set
 )
 {
-  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
-  gpu_trace_channel_set_forall(gpu_trace_channel_await,
-                               gpu_trace_channel_stack,
-                               set_index);
+  gpu_trace_channel_set_apply(gpu_trace_channel_signal_consumer, channel_set);
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index 662745302e..8b77f4df95 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -51,7 +51,7 @@
 //******************************************************************************
 
 typedef struct gpu_trace_channel_t gpu_trace_channel_t;
-
+typedef struct gpu_trace_channel_set_t gpu_trace_channel_set_t;
 
 
 //******************************************************************************
@@ -88,35 +88,31 @@ gpu_trace_channel_set_insert
 void
 gpu_trace_channel_set_process
 (
-void *gpu_trace_channel_stack,
-int set_index
+gpu_trace_channel_set_t *channel_set
 );
 
 
 void
-gpu_trace_channel_set_release
+gpu_trace_channel_set_await
 (
-void *gpu_trace_channel_stack,
-int set_index
+gpu_trace_channel_set_t *channel_set
 );
 
 
 void
-gpu_trace_channel_set_notify
+gpu_trace_channel_set_release
 (
-void *gpu_trace_channel_stack,
-int set_index
+gpu_trace_channel_set_t *channel_set
 );
 
+
 void
-gpu_trace_channel_set_await
+gpu_trace_channel_set_notify
 (
-void *gpu_trace_channel_stack,
-int set_index
+gpu_trace_channel_set_t *channel_set
 );
 
 
 
 
-
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index 8c717e025b..876f21c75c 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -48,9 +48,9 @@
 
 #define DEBUG 0
 
-
-#include "gpu-trace-demultiplexer.h"
+#include "gpu-trace-channel-set.h"
 #include "gpu-trace.h"
+#include "gpu-trace-demultiplexer.h"
 #include "gpu-print.h"
 
 
@@ -188,11 +188,7 @@ gpu_trace_demultiplexer_notify
   gpu_trace_channel_set_t *iter;
 
   for (iter = trace_channel_set_list_head; iter != NULL; iter = iter->next){
-
-    int channel_num = atomic_load(&iter->channel_index);
-    for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
-      gpu_trace_channel_set_notify(iter->channel_set_ptr, channel_idx);
-    }
+    gpu_trace_channel_set_notify(iter);
   }
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 45f69d58d3..a2a7de3ced 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -375,32 +375,21 @@ gpu_trace_init
 void *
 gpu_trace_record
 (
-// void *trace_channel_set_ptr
 void * args
 )
 {
   gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args;
-  void *channel_set_ptr = gpu_trace_channel_set_get_ptr(channel_set);
-  int channel_num;
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
-
-    channel_num = gpu_trace_channel_set_get_channel_num(channel_set);
-    for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
-      gpu_trace_channel_set_process(channel_set_ptr, channel_idx);
-      gpu_trace_channel_set_await(channel_set_ptr, channel_idx);
-    }
-
+    gpu_trace_channel_set_process(channel_set);
+    gpu_trace_channel_set_await(channel_set);
   }
 
-  channel_num = gpu_trace_channel_set_get_channel_num(channel_set);
-  for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
-    gpu_trace_channel_set_process(channel_set_ptr, channel_idx);
-    gpu_trace_channel_set_await(channel_set_ptr, channel_idx);
-    gpu_trace_channel_set_release(channel_set_ptr, channel_idx);
-  }
+  gpu_trace_channel_set_process(channel_set);
+  gpu_trace_channel_set_await(channel_set);
 
+  gpu_trace_channel_set_release(channel_set);
   return NULL;
 }
 

From bb4aaaedb78864dd748f33d6308a4c39dc9e4317 Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sat, 3 Oct 2020 23:18:53 -0500
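Subject: [PATCH 068/177] working version 2

Restore the `static __thread` qualifier on gpu_trace_channel_stack in
gpu-trace-channel-set.c and drop the commented-out assignment in
gpu_trace_channel_stack_alloc(). Include <hpcrun/control-knob.h> in
gpu-trace-demultiplexer.c.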

---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c   | 3 +--
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index 8261673e18..c562608579 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -101,7 +101,7 @@ typed_stack_declare_type(gpu_trace_channel_ptr_t);
 // local data
 //******************************************************************************
 
-//static __thread
+static __thread
 typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *gpu_trace_channel_stack;
 
 
@@ -153,7 +153,6 @@ gpu_trace_channel_set_forall
 
 void *
 gpu_trace_channel_stack_alloc(int size){
-//	gpu_trace_channel_stack = hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
   return hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index 876f21c75c..8cefee21a8 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -40,7 +40,7 @@
 // if advised of the possibility of such damage.
 //
 // ******************************************************* EndRiceCopyright *
-
+#include <hpcrun/control-knob.h>
 #include <lib/prof-lean/stdatomic.h>
 #include <memory/hpcrun-malloc.h>
 #include <pthread.h>

From b5f00aa5d884da5de209a3c6739f343923b059ab Mon Sep 17 00:00:00 2001
From: Dejan XXX <dx4@llnl.cs.rice.edu>
Date: Sun, 4 Oct 2020 15:44:32 -0500
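Subject: [PATCH 069/177] new_channel_set->channel_set_ptr =
 gpu_trace_channel_stack_alloc(streams_per_thread);

Allocate streams_per_thread channel-stack slots per channel set instead
of a single slot.

Also in this patch:
 - add gpu_trace_channel_stack_init(), called at the start of
   gpu_trace_record(), to set the thread-local channel stack pointer
   from the channel set;
 - pass the typed channel stack pointer to
   gpu_trace_channel_set_forall() instead of a void pointer;
 - gpu_trace_demultiplexer_notify() now iterates up to, but not
   including, trace_channel_set_list_tail;
 - remove the empty gpu_trace_demultiplexer_fini() and inline
   gpu_trace_channel_set_append() into gpu_trace_create();
 - add and extend debug PRINT statements.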

---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c   | 24 +++++++++-----
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h   |  7 +++++
 src/tool/hpcrun/gpu/gpu-trace-channel.c       |  5 ++-
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c | 31 ++++++++-----------
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h |  6 ----
 src/tool/hpcrun/gpu/gpu-trace.c               | 15 ++-------
 6 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index c562608579..e2511107eb 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -51,6 +51,9 @@
 #include <hpcrun/thread_data.h>
 #include <hpcrun/control-knob.h>
 
+
+#define DEBUG 0
+
 #include "gpu-print.h"
 #include "gpu-trace.h"
 #include "gpu-trace-channel.h"
@@ -134,13 +137,11 @@ static void
 gpu_trace_channel_set_forall
 (
  gpu_trace_channel_fn_t channel_fn,
- void *gpu_trace_channel_stack_ptr,
+ typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *gpu_trace_channel_stack,
  int set_index
 
 )
 {
-  gpu_trace_channel_stack = gpu_trace_channel_stack_ptr;
-
   channel_stack_forall(&gpu_trace_channel_stack[set_index], channel_forone,
     channel_fn);
 }
@@ -151,6 +152,16 @@ gpu_trace_channel_set_forall
 // interface operations
 //******************************************************************************
 
+void
+gpu_trace_channel_stack_init
+(
+gpu_trace_channel_set_t *channel_set
+)
+{
+  gpu_trace_channel_stack = (typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *)gpu_trace_channel_set_get_ptr(channel_set);
+}
+
+
 void *
 gpu_trace_channel_stack_alloc(int size){
   return hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
@@ -171,7 +182,6 @@ gpu_trace_channel_set_insert
   // initialize the new entry
   e->channel = channel;
 
-
   // clear the entry's next ptr
   channel_stack_elem_ptr_set(e, 0);
 
@@ -187,10 +197,10 @@ gpu_trace_channel_fn_t channel_fn,
 gpu_trace_channel_set_t *channel_set
 )
 {
-  gpu_trace_channel_stack =  gpu_trace_channel_set_get_ptr(channel_set);
-  int num_streams = gpu_trace_channel_set_get_channel_num(channel_set);
+  int channel_count = gpu_trace_channel_set_get_channel_num(channel_set);
+  gpu_trace_channel_stack = (typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *) gpu_trace_channel_set_get_ptr(channel_set);
 
-  for (int channel_idx = 0; channel_idx < num_streams; ++channel_idx) {
+  for (int channel_idx = 0; channel_idx < channel_count; ++channel_idx) {
     gpu_trace_channel_set_forall(channel_fn,
                                  gpu_trace_channel_stack,
                                  channel_idx);
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index 8b77f4df95..441f889d1b 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -69,6 +69,13 @@ typedef void (*gpu_trace_channel_fn_t)
 // interface operations
 //******************************************************************************
 
+void
+gpu_trace_channel_stack_init
+(
+gpu_trace_channel_set_t *channel_set
+);
+
+
 void *
 gpu_trace_channel_stack_alloc
 (
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index d593735c0e..bfaa0e743d 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -209,7 +209,10 @@ gpu_trace_channel_consume
  gpu_trace_channel_t *channel
 )
 {
-  PRINT("gpu_trace_channel_consume:: channel_count = %u\n", channel->count);
+  PRINT("gpu_trace_channel_consume:: channel_count = %u, channel_td = %p, last_time = %lu\n", channel->count,
+        channel->td, channel->td->last_time_us);
+
+
   hpcrun_set_thread_data(channel->td);
 
   // steal elements previously pushed by the producer
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index 8cefee21a8..a661fafe9d 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -62,11 +62,11 @@
 typedef void *(*pthread_start_routine_t)(void *);
 
 typedef struct gpu_trace_channel_set_t{
-void *next;
-pthread_t thread;
-void *channel_set_ptr;
-int channel_set_index;
-atomic_uint channel_index;
+  void *channel_set_ptr;
+  gpu_trace_channel_set_t *next;
+  pthread_t thread;
+  int channel_set_index;
+  atomic_uint channel_index;
 }gpu_trace_channel_set_t;
 
 
@@ -93,7 +93,7 @@ void
 {
   gpu_trace_channel_set_t *new_channel_set= hpcrun_malloc(sizeof(gpu_trace_channel_set_t));
   new_channel_set->next = NULL;
-  new_channel_set->channel_set_ptr = gpu_trace_channel_stack_alloc(1);
+  new_channel_set->channel_set_ptr = gpu_trace_channel_stack_alloc(streams_per_thread);
   new_channel_set->channel_set_index = channel_set_index_count++;
   atomic_store(&new_channel_set->channel_index, 0);
 
@@ -115,9 +115,7 @@ void
   trace_channel_set_list_head = gpu_trace_channel_set_create();
   trace_channel_set_list_tail = trace_channel_set_list_head;
 
-  PRINT("streams_per_thread = %d %d\n", streams_per_thread);
-
-
+  PRINT("streams_per_thread = %d \n", streams_per_thread);
 }
 
 
@@ -145,14 +143,6 @@ gpu_trace_channel_set_t *channel_set
   return atomic_load(&channel_set->channel_index);
 }
 
-void
-gpu_trace_demultiplexer_fini
-(
-void
-)
-{
-}
-
 
 pthread_t
 gpu_trace_demultiplexer_push
@@ -175,6 +165,10 @@ gpu_trace_channel_t *trace_channel
                                trace_channel_set_list_tail->channel_set_ptr,
                                atomic_fetch_add(&trace_channel_set_list_tail->channel_index,1));
 
+  PRINT("gpu_trace_demultiplexer_push: channel_set_ptr = %p, channel_set_index = %d | channel = %p\n",
+        trace_channel_set_list_tail->channel_set_ptr, trace_channel_set_list_tail->channel_set_index,
+        trace_channel);
+
   return trace_channel_set_list_tail->thread;
 }
 
@@ -187,7 +181,7 @@ gpu_trace_demultiplexer_notify
 {
   gpu_trace_channel_set_t *iter;
 
-  for (iter = trace_channel_set_list_head; iter != NULL; iter = iter->next){
+  for (iter = trace_channel_set_list_head; iter != trace_channel_set_list_tail; iter = iter->next){
     gpu_trace_channel_set_notify(iter);
   }
 }
@@ -201,6 +195,7 @@ gpu_trace_demultiplexer_release
  void
 )
 {
+  PRINT("gpu_trace_demultiplexer_release: NOT IMPLEMENTED\n");
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
index 3be14d2cda..6d8e58afb2 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
@@ -80,12 +80,6 @@ gpu_trace_channel_set_get_channel_num
 gpu_trace_channel_set_t *channel_set
 );
 
-void
-gpu_trace_demultiplexer_fini
-(
-void
-);
-
 
 pthread_t
 gpu_trace_demultiplexer_push
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index a2a7de3ced..6a4dcfcf1a 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -379,6 +379,7 @@ void * args
 )
 {
   gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args;
+  gpu_trace_channel_stack_init(channel_set);
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
@@ -410,17 +411,6 @@ gpu_trace_fini
 }
 
 
-static void
-gpu_trace_channel_set_append
-(
- gpu_trace_t *trace
-)
-{
-  trace->thread = gpu_trace_demultiplexer_push(trace->trace_channel);
-  atomic_fetch_add(&active_streams_counter, 1);
-}
-
-
 gpu_trace_t *
 gpu_trace_create
 (
@@ -433,7 +423,8 @@ gpu_trace_create
   // Create a new thread for the stream without libmonitor watching
   monitor_disable_new_threads();
 
-  gpu_trace_channel_set_append(trace);
+  trace->thread = gpu_trace_demultiplexer_push(trace->trace_channel);
+  atomic_fetch_add(&active_streams_counter, 1);
 
   monitor_enable_new_threads();
 

From 5c7b3bac998f2b8a7a650681b2180c525e4ab7ab Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Sun, 4 Oct 2020 17:13:38 -0500
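Subject: [PATCH 070/177] gpu-trace-demultiplexer: Final version

Remove the thread-local gpu_trace_channel_stack together with
gpu_trace_channel_stack_init(); gpu_trace_channel_set_apply() and
gpu_trace_channel_set_insert() now use a local channel stack pointer.
Drop the channel_set_index field and its counter, and remove the MIN
macro from gpu-trace.c.

Move gpu_trace_fini() ahead of gpu_trace_record(), and move
consume_one_trace_item(), gpu_trace_stream_acquire() and
gpu_trace_stream_release() to the end of gpu-trace.c, with matching
reordering of the declarations in gpu-trace.h. The remaining changes
are whitespace and declaration formatting.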

---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c   |  57 ++---
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h   |  16 +-
 src/tool/hpcrun/gpu/gpu-trace-channel.c       |  11 +-
 src/tool/hpcrun/gpu/gpu-trace-channel.h       |  12 +-
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c |  23 +-
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h |  16 +-
 src/tool/hpcrun/gpu/gpu-trace.c               | 207 +++++++++---------
 src/tool/hpcrun/gpu/gpu-trace.h               |  29 +--
 8 files changed, 185 insertions(+), 186 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index e2511107eb..dca684c061 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -104,11 +104,6 @@ typed_stack_declare_type(gpu_trace_channel_ptr_t);
 // local data
 //******************************************************************************
 
-static __thread
-typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *gpu_trace_channel_stack;
-
-
-
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -139,7 +134,6 @@ gpu_trace_channel_set_forall
  gpu_trace_channel_fn_t channel_fn,
  typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *gpu_trace_channel_stack,
  int set_index
-
 )
 {
   channel_stack_forall(&gpu_trace_channel_stack[set_index], channel_forone,
@@ -147,26 +141,35 @@ gpu_trace_channel_set_forall
 }
 
 
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-void
-gpu_trace_channel_stack_init
+static void
+gpu_trace_channel_set_apply
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_fn_t channel_fn,
+ gpu_trace_channel_set_t *channel_set
 )
 {
-  gpu_trace_channel_stack = (typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *)gpu_trace_channel_set_get_ptr(channel_set);
+  int channel_count = gpu_trace_channel_set_get_channel_num(channel_set);
+  typed_stack_elem_ptr(gpu_trace_channel_ptr_t) * gpu_trace_channel_stack = gpu_trace_channel_set_get_ptr(channel_set);
+
+  for (int channel_idx = 0; channel_idx < channel_count; ++channel_idx) {
+    gpu_trace_channel_set_forall(channel_fn,
+                                 gpu_trace_channel_stack,
+                                 channel_idx);
+  }
 }
 
 
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
 void *
 gpu_trace_channel_stack_alloc(int size){
   return hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
 }
 
+
 void
 gpu_trace_channel_set_insert
 (
@@ -186,31 +189,15 @@ gpu_trace_channel_set_insert
   channel_stack_elem_ptr_set(e, 0);
 
 	  // add the entry to the channel stack
-  gpu_trace_channel_stack =  gpu_trace_channel_stack_ptr;
+  typed_stack_elem_ptr(gpu_trace_channel_ptr_t) * gpu_trace_channel_stack = gpu_trace_channel_stack_ptr;
   channel_stack_push(&gpu_trace_channel_stack[set_index], e);
 }
 
-static void
-gpu_trace_channel_set_apply
-(
-gpu_trace_channel_fn_t channel_fn,
-gpu_trace_channel_set_t *channel_set
-)
-{
-  int channel_count = gpu_trace_channel_set_get_channel_num(channel_set);
-  gpu_trace_channel_stack = (typed_stack_elem_ptr(gpu_trace_channel_ptr_t) *) gpu_trace_channel_set_get_ptr(channel_set);
-
-  for (int channel_idx = 0; channel_idx < channel_count; ++channel_idx) {
-    gpu_trace_channel_set_forall(channel_fn,
-                                 gpu_trace_channel_stack,
-                                 channel_idx);
-  }
-}
 
 void
 gpu_trace_channel_set_process
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 )
 {
   gpu_trace_channel_set_apply(gpu_trace_channel_consume, channel_set);
@@ -220,7 +207,7 @@ gpu_trace_channel_set_t *channel_set
 void
 gpu_trace_channel_set_await
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 )
 {
   gpu_trace_channel_set_apply(gpu_trace_channel_await, channel_set);
@@ -230,7 +217,7 @@ gpu_trace_channel_set_t *channel_set
 void
 gpu_trace_channel_set_release
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 )
 {
   gpu_trace_channel_set_apply(gpu_trace_stream_release, channel_set);
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index 441f889d1b..83b5b47644 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -54,6 +54,7 @@ typedef struct gpu_trace_channel_t gpu_trace_channel_t;
 typedef struct gpu_trace_channel_set_t gpu_trace_channel_set_t;
 
 
+
 //******************************************************************************
 // type declarations
 //******************************************************************************
@@ -69,13 +70,6 @@ typedef void (*gpu_trace_channel_fn_t)
 // interface operations
 //******************************************************************************
 
-void
-gpu_trace_channel_stack_init
-(
-gpu_trace_channel_set_t *channel_set
-);
-
-
 void *
 gpu_trace_channel_stack_alloc
 (
@@ -95,28 +89,28 @@ gpu_trace_channel_set_insert
 void
 gpu_trace_channel_set_process
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 );
 
 
 void
 gpu_trace_channel_set_await
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 );
 
 
 void
 gpu_trace_channel_set_release
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 );
 
 
 void
 gpu_trace_channel_set_notify
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 );
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index bfaa0e743d..39472bfc43 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -145,13 +145,20 @@ gpu_trace_channel_signal_consumer_when_full
 //******************************************************************************
 
 struct thread_data_t *
-gpu_trace_channel_get_td(gpu_trace_channel_t *ch)
+gpu_trace_channel_get_td
+(
+ gpu_trace_channel_t *ch
+)
 {
   return ch->td;
 }
 
+
 int
-gpu_trace_channel_get_stream_id(gpu_trace_channel_t *ch)
+gpu_trace_channel_get_stream_id
+(
+ gpu_trace_channel_t *ch
+)
 {
   return ch->td->core_profile_trace_data.id;
 }
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.h b/src/tool/hpcrun/gpu/gpu-trace-channel.h
index 82ce178a20..b743342554 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.h
@@ -66,10 +66,18 @@ typedef struct gpu_trace_channel_t gpu_trace_channel_t;
 //******************************************************************************
 
 thread_data_t *
-gpu_trace_channel_get_td(gpu_trace_channel_t *ch);
+gpu_trace_channel_get_td
+(
+ gpu_trace_channel_t *ch
+);
+
 
 int
-gpu_trace_channel_get_stream_id(gpu_trace_channel_t *ch);
+gpu_trace_channel_get_stream_id
+(
+ gpu_trace_channel_t *ch
+);
+
 
 gpu_trace_channel_t *
 gpu_trace_channel_alloc
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index a661fafe9d..6cb6fbb2c5 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -40,6 +40,7 @@
 // if advised of the possibility of such damage.
 //
 // ******************************************************* EndRiceCopyright *
+
 #include <hpcrun/control-knob.h>
 #include <lib/prof-lean/stdatomic.h>
 #include <memory/hpcrun-malloc.h>
@@ -65,11 +66,11 @@ typedef struct gpu_trace_channel_set_t{
   void *channel_set_ptr;
   gpu_trace_channel_set_t *next;
   pthread_t thread;
-  int channel_set_index;
   atomic_uint channel_index;
 }gpu_trace_channel_set_t;
 
 
+
 //******************************************************************************
 // local variables
 //******************************************************************************
@@ -78,7 +79,7 @@ static gpu_trace_channel_set_t *trace_channel_set_list_head = NULL;
 static gpu_trace_channel_set_t *trace_channel_set_list_tail = NULL;
 
 static int streams_per_thread;
-static uint32_t channel_set_index_count;
+
 
 
 //******************************************************************************
@@ -88,13 +89,12 @@ static uint32_t channel_set_index_count;
 static gpu_trace_channel_set_t *
 gpu_trace_channel_set_create
 (
-void
+ void
 )
 {
   gpu_trace_channel_set_t *new_channel_set= hpcrun_malloc(sizeof(gpu_trace_channel_set_t));
   new_channel_set->next = NULL;
   new_channel_set->channel_set_ptr = gpu_trace_channel_stack_alloc(streams_per_thread);
-  new_channel_set->channel_set_index = channel_set_index_count++;
   atomic_store(&new_channel_set->channel_index, 0);
 
   pthread_create(&new_channel_set->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
@@ -107,11 +107,10 @@ void
 static void
 gpu_trace_channel_set_init
 (
-void
+ void
 )
 {
   control_knob_value_get_int("STREAMS_PER_THREAD", &streams_per_thread);
-  channel_set_index_count = 0;
   trace_channel_set_list_head = gpu_trace_channel_set_create();
   trace_channel_set_list_tail = trace_channel_set_list_head;
 
@@ -127,7 +126,7 @@ void
 void *
 gpu_trace_channel_set_get_ptr
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 )
 {
   return channel_set->channel_set_ptr;
@@ -137,7 +136,7 @@ gpu_trace_channel_set_t *channel_set
 int
 gpu_trace_channel_set_get_channel_num
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 )
 {
   return atomic_load(&channel_set->channel_index);
@@ -147,7 +146,7 @@ gpu_trace_channel_set_t *channel_set
 pthread_t
 gpu_trace_demultiplexer_push
 (
-gpu_trace_channel_t *trace_channel
+ gpu_trace_channel_t *trace_channel
 )
 {
 
@@ -165,8 +164,8 @@ gpu_trace_channel_t *trace_channel
                                trace_channel_set_list_tail->channel_set_ptr,
                                atomic_fetch_add(&trace_channel_set_list_tail->channel_index,1));
 
-  PRINT("gpu_trace_demultiplexer_push: channel_set_ptr = %p, channel_set_index = %d | channel = %p\n",
-        trace_channel_set_list_tail->channel_set_ptr, trace_channel_set_list_tail->channel_set_index,
+  PRINT("gpu_trace_demultiplexer_push: channel_set_ptr = %p | channel = %p\n",
+        trace_channel_set_list_tail->channel_set_ptr,
         trace_channel);
 
   return trace_channel_set_list_tail->thread;
@@ -187,8 +186,6 @@ gpu_trace_demultiplexer_notify
 }
 
 
-
-
 void
 gpu_trace_demultiplexer_release
 (
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
index 6d8e58afb2..bf09153a9e 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
@@ -45,14 +45,18 @@
 #ifndef gpu_trace_demultiplexer_h
 #define gpu_trace_demultiplexer_h
 
-//#include <hpcrun/thread_data.h>
+
 
 //******************************************************************************
 // type declarations
 //******************************************************************************
+
 typedef struct gpu_trace_channel_set_t gpu_trace_channel_set_t;
 typedef struct gpu_trace_channel_t gpu_trace_channel_t;
 typedef struct gpu_trace_t gpu_trace_t;
+
+
+
 //******************************************************************************
 // local variables
 //******************************************************************************
@@ -71,34 +75,34 @@ typedef struct gpu_trace_t gpu_trace_t;
 void *
 gpu_trace_channel_set_get_ptr
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 );
 
 int
 gpu_trace_channel_set_get_channel_num
 (
-gpu_trace_channel_set_t *channel_set
+ gpu_trace_channel_set_t *channel_set
 );
 
 
 pthread_t
 gpu_trace_demultiplexer_push
 (
-gpu_trace_channel_t *trace_channel
+ gpu_trace_channel_t *trace_channel
 );
 
 
 void
 gpu_trace_demultiplexer_notify
 (
-void
+ void
 );
 
 
 void
 gpu_trace_demultiplexer_release
 (
-void
+ void
 );
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 6a4dcfcf1a..198480bdeb 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -88,7 +88,6 @@
 #define DEBUG 0
 #include "gpu-print.h"
 
-#define MIN(a,b)  (((a)<=(b))?(a):(b))
 
 
 //******************************************************************************
@@ -258,60 +257,6 @@ gpu_trace_start_adjust
 }
 
 
-void
-consume_one_trace_item
-(
- thread_data_t* td,
- cct_node_t *call_path,
- uint64_t start_time,
- uint64_t end_time
-)
-{
-
-  cct_node_t *leaf = gpu_trace_cct_insert_context(td, call_path);
-
-  cct_node_t *no_activity = gpu_trace_cct_no_activity(td);
-
-  uint64_t start = gpu_trace_time(start_time);
-  uint64_t end   = gpu_trace_time(end_time);
-
-  stream_start_set(start_time);
-
-  start = gpu_trace_start_adjust(td, start, end);
-
-  int frequency = gpu_monitoring_trace_sample_frequency_get();
-
-  bool append = false;
-
-  if (frequency != -1) {
-    uint64_t cur_start = start_time;
-    uint64_t cur_end = end_time;
-    uint64_t intervals = (cur_start - stream_start_get() - 1) / frequency + 1;
-    uint64_t pivot = intervals * frequency + stream_start;
-
-    if (pivot <= cur_end && pivot >= cur_start) {
-      // only trace when the pivot is within the range
-      PRINT("pivot %" PRIu64 " not in <%" PRIu64 ", %" PRIu64
-          "> with intervals %" PRIu64 ", frequency %" PRIu64 "\n",
-           pivot, cur_start, cur_end, intervals, frequency);
-      append = true;
-    }
-  } else {
-    append = true;
-  }
-
-  if (append) {
-    gpu_trace_first(td, no_activity, start);
-
-    gpu_trace_stream_append(td, leaf, start);
-
-    gpu_trace_stream_append(td, no_activity, end);
-
-    PRINT("%p Append trace activity [%lu, %lu]\n", td, start, end);
-  }
-}
-
-
 static int
 gpu_trace_stream_id
 (
@@ -325,37 +270,6 @@ gpu_trace_stream_id
 }
 
 
-thread_data_t *
-gpu_trace_stream_acquire
-(
- void
-)
-{
-  thread_data_t *td = NULL;
-
-  int id = gpu_trace_stream_id();
-
-  // XXX(Keren): This API calls allocate_and_init_thread_data to bind td with the current thread
-  hpcrun_threadMgr_data_get_safe(id, NULL, &td, true);
-
-  return td;
-}
-
-
-void
-gpu_trace_stream_release
-(
- gpu_trace_channel_t *channel
-)
-{
-  thread_data_t *td = gpu_trace_channel_get_td(channel);
-
-  hpcrun_write_profile_data(&td->core_profile_trace_data);
-  hpcrun_trace_close(&td->core_profile_trace_data);
-  atomic_fetch_add(&active_streams_counter, -1);
-
-}
-
 
 //******************************************************************************
 // interface operations
@@ -372,14 +286,30 @@ gpu_trace_init
   atomic_store(&num_streams, 0);
 }
 
+
+void
+gpu_trace_fini
+(
+ void *arg
+)
+{
+  PRINT("gpu_trace_fini called\n");
+
+  atomic_store(&stop_trace_flag, true);
+
+  gpu_trace_demultiplexer_notify();
+
+  while (atomic_load(&active_streams_counter));
+}
+
+
 void *
 gpu_trace_record
 (
-void * args
+ void * args
 )
 {
   gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args;
-  gpu_trace_channel_stack_init(channel_set);
 
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
@@ -395,22 +325,6 @@ void * args
 }
 
 
-void
-gpu_trace_fini
-(
- void *arg
-)
-{
-  PRINT("gpu_trace_fini called\n");
-
-  atomic_store(&stop_trace_flag, true);
-
-  gpu_trace_demultiplexer_notify();
-
-  while (atomic_load(&active_streams_counter));
-}
-
-
 gpu_trace_t *
 gpu_trace_create
 (
@@ -451,3 +365,88 @@ gpu_trace_signal_consumer
 {
   gpu_trace_channel_signal_consumer(t->trace_channel);
 }
+
+
+void
+consume_one_trace_item
+(
+ thread_data_t* td,
+ cct_node_t *call_path,
+ uint64_t start_time,
+ uint64_t end_time
+)
+{
+
+  cct_node_t *leaf = gpu_trace_cct_insert_context(td, call_path);
+
+  cct_node_t *no_activity = gpu_trace_cct_no_activity(td);
+
+  uint64_t start = gpu_trace_time(start_time);
+  uint64_t end   = gpu_trace_time(end_time);
+
+  stream_start_set(start_time);
+
+  start = gpu_trace_start_adjust(td, start, end);
+
+  int frequency = gpu_monitoring_trace_sample_frequency_get();
+
+  bool append = false;
+
+  if (frequency != -1) {
+    uint64_t cur_start = start_time;
+    uint64_t cur_end = end_time;
+    uint64_t intervals = (cur_start - stream_start_get() - 1) / frequency + 1;
+    uint64_t pivot = intervals * frequency + stream_start;
+
+    if (pivot <= cur_end && pivot >= cur_start) {
+      // only trace when the pivot is within the range
+      PRINT("pivot %" PRIu64 " not in <%" PRIu64 ", %" PRIu64
+            "> with intervals %" PRIu64 ", frequency %" PRIu64 "\n",
+            pivot, cur_start, cur_end, intervals, frequency);
+      append = true;
+    }
+  } else {
+    append = true;
+  }
+
+  if (append) {
+    gpu_trace_first(td, no_activity, start);
+
+    gpu_trace_stream_append(td, leaf, start);
+
+    gpu_trace_stream_append(td, no_activity, end);
+
+    PRINT("%p Append trace activity [%lu, %lu]\n", td, start, end);
+  }
+}
+
+
+thread_data_t *
+gpu_trace_stream_acquire
+(
+ void
+)
+{
+  thread_data_t *td = NULL;
+
+  int id = gpu_trace_stream_id();
+
+  // XXX(Keren): This API calls allocate_and_init_thread_data to bind td with the current thread
+  hpcrun_threadMgr_data_get_safe(id, NULL, &td, true);
+
+  return td;
+}
+
+
+void
+gpu_trace_stream_release
+(
+ gpu_trace_channel_t *channel
+)
+{
+  thread_data_t *td = gpu_trace_channel_get_td(channel);
+
+  hpcrun_write_profile_data(&td->core_profile_trace_data);
+  hpcrun_trace_close(&td->core_profile_trace_data);
+  atomic_fetch_add(&active_streams_counter, -1);
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-trace.h b/src/tool/hpcrun/gpu/gpu-trace.h
index e3fb91291b..62d57e72ef 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.h
+++ b/src/tool/hpcrun/gpu/gpu-trace.h
@@ -93,10 +93,10 @@ gpu_trace_init
 );
 
 
-gpu_trace_t *
-gpu_trace_create
+void
+gpu_trace_fini
 (
- void
+void *arg
 );
 
 
@@ -107,6 +107,13 @@ void *args
 );
 
 
+gpu_trace_t *
+gpu_trace_create
+(
+ void
+);
+
+
 void
 gpu_trace_produce
 (
@@ -123,9 +130,12 @@ gpu_trace_signal_consumer
 
 
 void
-gpu_trace_fini
+consume_one_trace_item
 (
- void *arg
+thread_data_t* td,
+cct_node_t *call_path,
+uint64_t start_time,
+uint64_t end_time
 );
 
 
@@ -143,13 +153,6 @@ gpu_trace_stream_release
 );
 
 
-void
-consume_one_trace_item
-(
- thread_data_t* td,
- cct_node_t *call_path,
- uint64_t start_time,
- uint64_t end_time
-);
+
 
 #endif 

From f4329b9f39847b602492c061d0ea5207e5f363e0 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Sun, 4 Oct 2020 17:37:09 -0500
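Subject: [PATCH 071/177] traceOrdered bit added in core_profile_data

core_profile_trace_data_init() sets traceOrdered to true.
gpu_trace_start_adjust() clears it, and returns the start time
unadjusted, when an activity's end time is earlier than
td->gpu_trace_prev_time. lazy_open_data_file() writes the flag as the
HPCRUN_FMT_NV_traceOrdered name/value pair ("1" or "0").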

---
 src/tool/hpcrun/gpu/gpu-trace.c | 10 ++++++++--
 src/tool/hpcrun/thread_data.c   |  1 +
 src/tool/hpcrun/write_data.c    |  1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 198480bdeb..bb39919410 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -245,8 +245,14 @@ gpu_trace_start_adjust
 )
 {
   uint64_t last_end = td->gpu_trace_prev_time;
-  if (start < last_end) {
-    // If we have a hardware measurement error (Power9),
+
+  if (end < last_end){
+    // If stream becomes unordered, mark it (it will be sorted in prof)
+    td->core_profile_trace_data.traceOrdered = false;
+    return start;
+  }
+
+  if(start < last_end) {    // If we have a hardware measurement error (Power9),
     // set the offset as the end of the last activity
     start = last_end + 1;
   }
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index 6de9070757..30e82be48e 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -236,6 +236,7 @@ core_profile_trace_data_init(core_profile_trace_data_t * cptd, int id, cct_ctxt_
   // ----------------------------------------
   cptd->trace_min_time_us = 0;
   cptd->trace_max_time_us = 0;
+  cptd->traceOrdered = true;
 
   // ----------------------------------------
   // IO support
diff --git a/src/tool/hpcrun/write_data.c b/src/tool/hpcrun/write_data.c
index 74b68c64b4..6b7f5ad91b 100644
--- a/src/tool/hpcrun/write_data.c
+++ b/src/tool/hpcrun/write_data.c
@@ -198,6 +198,7 @@ lazy_open_data_file(core_profile_trace_data_t * cptd)
                         HPCRUN_FMT_NV_pid, pidStr,
                         HPCRUN_FMT_NV_traceMinTime, traceMinTimeStr,
                         HPCRUN_FMT_NV_traceMaxTime, traceMaxTimeStr,
+                        HPCRUN_FMT_NV_traceOrdered, cptd->traceOrdered?"1":"0",
                         NULL);
   return fs;
 }

From 2c021faea46de3c62d1adc06885a190d596ba7d7 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Sun, 4 Oct 2020 21:56:04 -0500
Subject: [PATCH 072/177] Fix papi compilation error

---
 src/tool/hpcrun/sample-sources/papi.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/tool/hpcrun/sample-sources/papi.c b/src/tool/hpcrun/sample-sources/papi.c
index 0ff8238ea3..519fa3ecee 100644
--- a/src/tool/hpcrun/sample-sources/papi.c
+++ b/src/tool/hpcrun/sample-sources/papi.c
@@ -594,6 +594,12 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
+
+static void
+METHOD_FN(finalize_event_list)
+{
+}
+
 /***************************************************************************
  * object
  ***************************************************************************/

From 0add9f0b2d64efe8348486c884e544248bd1706d Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Sun, 4 Oct 2020 21:56:25 -0500
Subject: [PATCH 073/177] Fix opencl configuration

---
 configure    | 2 +-
 configure.ac | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index f8dafa1627..8ed07fae0f 100755
--- a/configure
+++ b/configure
@@ -23624,7 +23624,7 @@ case "$OPENCL" in
     if test ! -f "${OPENCL}/include/CL/cl.h" ; then
       as_fn_error $? "unable to find CL/cl.h in: $OPENCL" "$LINENO" 5
     else
-      OPT_OPENCL_IFLAGS="-I${OPENCL}"
+      OPT_OPENCL_IFLAGS="-I${OPENCL}/include"
       OPT_HAVE_OPENCL=yes
     fi
     ;;
diff --git a/configure.ac b/configure.ac
index 8c4a4d63fb..82c5ec27c8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4728,7 +4728,7 @@ case "$OPENCL" in
     if test ! -f "${OPENCL}/include/CL/cl.h" ; then
       AC_MSG_ERROR([unable to find CL/cl.h in: $OPENCL])
     else
-      OPT_OPENCL_IFLAGS="-I${OPENCL}"
+      OPT_OPENCL_IFLAGS="-I${OPENCL}/include"
       OPT_HAVE_OPENCL=yes
     fi
     ;;

From 74e54dde30f88bc80d93c6dc945c0bf9d68eda53 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Sun, 4 Oct 2020 22:13:29 -0500
Subject: [PATCH 074/177] Revert "Fix papi compilation error"

This reverts commit 2c021faea46de3c62d1adc06885a190d596ba7d7.
---
 src/tool/hpcrun/sample-sources/papi.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/tool/hpcrun/sample-sources/papi.c b/src/tool/hpcrun/sample-sources/papi.c
index 519fa3ecee..0ff8238ea3 100644
--- a/src/tool/hpcrun/sample-sources/papi.c
+++ b/src/tool/hpcrun/sample-sources/papi.c
@@ -594,12 +594,6 @@ METHOD_FN(display_events)
   printf("\n");
 }
 
-
-static void
-METHOD_FN(finalize_event_list)
-{
-}
-
 /***************************************************************************
  * object
  ***************************************************************************/

From 4e74316434930ebe63dc87bb716a7c4b3d8b0071 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 5 Oct 2020 11:51:06 -0500
Subject: [PATCH 075/177] Setting defaults for STREAMS_PER_THREAD=4,
 MAX_COMPLETION_CALLBACK_THREADS=1000, and opencl refactoring

---
 src/tool/hpcrun/control-knob.c                |  4 +-
 .../hpcrun/gpu/gpu-activity-multiplexer.c     | 69 +++++++------------
 .../hpcrun/gpu/gpu-activity-multiplexer.h     | 29 ++------
 .../hpcrun/gpu/gpu-operation-channel-set.c    | 11 ---
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  4 +-
 5 files changed, 36 insertions(+), 81 deletions(-)

diff --git a/src/tool/hpcrun/control-knob.c b/src/tool/hpcrun/control-knob.c
index 8172b95669..5eb0869d61 100644
--- a/src/tool/hpcrun/control-knob.c
+++ b/src/tool/hpcrun/control-knob.c
@@ -48,8 +48,8 @@ control_knob_register(char *name, char *value, control_knob_type type)
 
 static void
 control_knob_default_register(){
-  control_knob_register("STREAMS_PER_THREAD", "65536", ck_int);
-  control_knob_register("MAX_THREADS_CONSUMERS", "256", ck_int);
+  control_knob_register("STREAMS_PER_THREAD", "4", ck_int);
+  control_knob_register("MAX_COMPLETION_CALLBACK_THREADS", "1000", ck_int);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 8ba5fd09f1..051b846286 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -42,9 +42,9 @@
 // ******************************************************* EndRiceCopyright *
 
 
-#include <pthread.h>
-
+#include <hpcrun/control-knob.h>
 #include <lib/prof-lean/stdatomic.h>
+#include <pthread.h>
 
 
 #define DEBUG 0
@@ -60,9 +60,6 @@
 
 
 
-//TODO: Figure out how to get max number of application threads
-#define max_threads_consumers 1000
-
 //******************************************************************************
 // type declarations
 //******************************************************************************
@@ -76,8 +73,8 @@ typedef void *(*pthread_start_routine_t)(void *);
 static _Atomic(bool) stop_activity_flag;
 static _Atomic(bool) gpu_trace_finished;
 
-static atomic_uint stream_id;
-static __thread uint32_t my_operation_set_id = -1;
+static atomic_uint operation_channels_count;
+static __thread uint32_t my_operation_channel_id = -1;
 static __thread gpu_operation_channel_t *gpu_operation_channel = NULL;
 static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
 
@@ -90,23 +87,23 @@ static pthread_once_t is_initialized = PTHREAD_ONCE_INIT;
 static void
 gpu_init_operation_channel(){
   // Create operation channel
-  my_operation_set_id = atomic_fetch_add(&stream_id, 1);
+  my_operation_channel_id = atomic_fetch_add(&operation_channels_count, 1);
   gpu_operation_channel = gpu_operation_channel_get();
-  gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_set_id);
+  gpu_operation_channel_set_insert(gpu_operation_channel, my_operation_channel_id);
 }
 
 
 static void *
 gpu_activity_record
 (
-void
+ void
 )
 {
 
   while (!atomic_load(&stop_activity_flag)){
-    int current_stream_id = atomic_load(&stream_id);
+    int current_operation_channels_count = atomic_load(&operation_channels_count);
 
-    for (int set_index = 0; set_index < current_stream_id ; ++set_index) {
+    for (int set_index = 0; set_index < current_operation_channels_count ; ++set_index) {
       gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
 
       // TODO: change waiting policy to getting items when full
@@ -114,8 +111,8 @@ void
     }
   }
 
-  int current_stream_id = atomic_load(&stream_id);
-  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+  int current_operation_channels_count = atomic_load(&operation_channels_count);
+  for (int set_index = 0; set_index < current_operation_channels_count; ++set_index) {
     gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
     gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
   }
@@ -130,15 +127,18 @@ void
 static void
 gpu_activity_multiplexer_create
 (
-void
+ void
 )
 {
   pthread_t thread;
   atomic_store(&stop_activity_flag, false);
   atomic_store(&gpu_trace_finished, false);
-  atomic_store(&stream_id, 0);
+  atomic_store(&operation_channels_count, 0);
+
+  int max_completion_cb_threads;
+  control_knob_value_get_int("MAX_COMPLETION_CALLBACK_THREADS", &max_completion_cb_threads);
 
-  gpu_operation_channel_stack_alloc(max_threads_consumers);
+  gpu_operation_channel_stack_alloc(max_completion_cb_threads);
   // You are the first to create monitor thread
   pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_activity_record,
                  NULL);
@@ -151,17 +151,17 @@ void
 //******************************************************************************
 
 bool
-gpu_activity_is_multiplexer_initialized
+gpu_activity_multiplexer_my_channel_initialized
 (
  void
 )
 {
-  return (my_operation_set_id != -1);
+  return (my_operation_channel_id != -1);
 }
 
 
 void
-gpu_activity_multiplexer_init
+gpu_activity_multiplexer_my_channel_init
 (
  void
 )
@@ -174,15 +174,15 @@ gpu_activity_multiplexer_init
 void
 gpu_activity_multiplexer_fini
 (
-void
+ void
 )
 {
   PRINT("gpu_activity_multiplexer_fini called\n");
 
   atomic_store(&stop_activity_flag, true);
 
-  int current_stream_id = atomic_load(&stream_id);
-  for (int set_index = 0; set_index < current_stream_id; ++set_index) {
+  int current_operation_channels_count = atomic_load(&operation_channels_count);
+  for (int set_index = 0; set_index < current_operation_channels_count; ++set_index) {
     gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, set_index);
   }
 
@@ -193,31 +193,12 @@ void
 void
 gpu_activity_multiplexer_push
 (
-gpu_activity_channel_t *initiator_channel,
-gpu_activity_t *gpu_activity
+ gpu_activity_channel_t *initiator_channel,
+ gpu_activity_t *gpu_activity
 )
 {
   gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
   gpu_operation_channel_produce(gpu_operation_channel, &item);
-
-}
-
-
-void
-gpu_operation_release
-(
-gpu_operation_channel_t *channel
-)
-{
-}
-
-
-void
-gpu_activity_multiplexer_release
-(
- void
-)
-{
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
index 62f68c8dc3..cbd6db1136 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
@@ -70,49 +70,34 @@ typedef struct gpu_activity_t gpu_activity_t;
 
 
 bool
-gpu_activity_is_multiplexer_initialized
+gpu_activity_multiplexer_my_channel_initialized
 (
-void
+ void
 );
 
 
 void
-gpu_activity_multiplexer_init
+gpu_activity_multiplexer_my_channel_init
 (
-void
+ void
 );
 
 
 void
 gpu_activity_multiplexer_fini
 (
-void
+ void
 );
 
 
 void
 gpu_activity_multiplexer_push
 (
-gpu_activity_channel_t *initiator_channel,
-gpu_activity_t *gpu_activity
+ gpu_activity_channel_t *initiator_channel,
+ gpu_activity_t *gpu_activity
 );
 
 
-void
-gpu_activity_multiplexer_release
-(
-void
-);
-
-
-void
-gpu_operation_release
-(
-gpu_operation_channel_t *channel
-);
-
-
-
 
 #endif
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index 5b2d6f5283..9b73f684d5 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -187,14 +187,3 @@ gpu_operation_channel_set_apply
 }
 
 
-
-void
-gpu_operation_stream_release
-(
- int set_index
-)
-{
-  gpu_operation_channel_set_forall(gpu_operation_release, set_index);
-}
-
-
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index ba4272e45b..cf09ddff72 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -255,8 +255,8 @@ opencl_activity_process
   gpu_activity_t gpu_activity;
   opencl_activity_translate(&gpu_activity, event, cb_data);
 
-  if (gpu_activity_is_multiplexer_initialized() == false){
-    gpu_activity_multiplexer_init();
+  if (gpu_activity_multiplexer_my_channel_initialized() == false){
+    gpu_activity_multiplexer_my_channel_init();
   }
   gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
 //  gpu_activity_process(&gpu_activity);

From cc205b5842d298e66cf07d0c8c07e4a4a36d7e5d Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 5 Oct 2020 12:25:14 -0500
Subject: [PATCH 076/177] Refactoring of func names

---
 src/tool/hpcrun/gpu/gpu-activity-multiplexer.c  | 2 +-
 src/tool/hpcrun/gpu/gpu-operation-channel-set.c | 2 +-
 src/tool/hpcrun/gpu/gpu-operation-channel-set.h | 2 +-
 src/tool/hpcrun/gpu/gpu-trace-channel-set.c     | 2 +-
 src/tool/hpcrun/gpu/gpu-trace-channel-set.h     | 2 +-
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c   | 6 +++---
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 051b846286..51230238d3 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -138,7 +138,7 @@ gpu_activity_multiplexer_create
   int max_completion_cb_threads;
   control_knob_value_get_int("MAX_COMPLETION_CALLBACK_THREADS", &max_completion_cb_threads);
 
-  gpu_operation_channel_stack_alloc(max_completion_cb_threads);
+  gpu_operation_channel_set_alloc(max_completion_cb_threads);
   // You are the first to create monitor thread
   pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_activity_record,
                  NULL);
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index 9b73f684d5..b2bd5aad68 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -150,7 +150,7 @@ gpu_operation_channel_set_forall
 // interface operations
 //******************************************************************************
 
-void gpu_operation_channel_stack_alloc(int size){
+void gpu_operation_channel_set_alloc(int size){
   gpu_operation_channel_stack = hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_operation_channel_ptr_t)));
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
index 5faacc838c..add13e2847 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
@@ -71,7 +71,7 @@ typedef void (*gpu_operation_channel_fn_t)
 //******************************************************************************
 
 void
-gpu_operation_channel_stack_alloc(int size);
+gpu_operation_channel_set_alloc(int size);
 
 
 void
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
index dca684c061..a9c81790b2 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.c
@@ -165,7 +165,7 @@ gpu_trace_channel_set_apply
 //******************************************************************************
 
 void *
-gpu_trace_channel_stack_alloc(int size){
+gpu_trace_channel_set_alloc(int size){
   return hpcrun_malloc_safe( size * sizeof(typed_stack_elem_ptr(gpu_trace_channel_ptr_t)));
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
index 83b5b47644..cc45dda24d 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel-set.h
@@ -71,7 +71,7 @@ typedef void (*gpu_trace_channel_fn_t)
 //******************************************************************************
 
 void *
-gpu_trace_channel_stack_alloc
+gpu_trace_channel_set_alloc
 (
  int size
 );
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index 6cb6fbb2c5..ac04f3e586 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -94,7 +94,7 @@ gpu_trace_channel_set_create
 {
   gpu_trace_channel_set_t *new_channel_set= hpcrun_malloc(sizeof(gpu_trace_channel_set_t));
   new_channel_set->next = NULL;
-  new_channel_set->channel_set_ptr = gpu_trace_channel_stack_alloc(streams_per_thread);
+  new_channel_set->channel_set_ptr = gpu_trace_channel_set_alloc(streams_per_thread);
   atomic_store(&new_channel_set->channel_index, 0);
 
   pthread_create(&new_channel_set->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
@@ -105,7 +105,7 @@ gpu_trace_channel_set_create
 
 
 static void
-gpu_trace_channel_set_init
+gpu_trace_demultiplexer_init
 (
  void
 )
@@ -151,7 +151,7 @@ gpu_trace_demultiplexer_push
 {
 
   if (trace_channel_set_list_head == NULL){
-    gpu_trace_channel_set_init();
+    gpu_trace_demultiplexer_init();
   }
 
   if (atomic_load(&trace_channel_set_list_tail->channel_index) == streams_per_thread){

From 19c7f2e066559eb3c4239c0ce3f397b6abfc22c0 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <aarontcopal2@iris16.ftm.alcf.anl.gov>
Date: Mon, 5 Oct 2020 18:37:09 +0000
Subject: [PATCH 077/177] instrumentation mode for opencl can be enabled by
 passing -e gpu=opencl,inst. Pass -e gpu=opencl for default profiling mode

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 21 +++++++++++++++++---
 src/tool/hpcrun/gpu/opencl/opencl-api.h |  7 +++++++
 src/tool/hpcrun/sample-sources/opencl.c | 26 +++++++++++++++++++++++--
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index fbbc23dc94..a65c03b078 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -331,10 +331,13 @@ OPENCL_FN
 
 static atomic_ullong opencl_pending_operations;
 static atomic_long correlation_id;
+static bool instrumentation = false;
 
 #define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
 #define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
+
+
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -505,7 +508,7 @@ opencl_subscriber_callback
   gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
   hpcrun_safe_exit();
 
-  if (type == kernel) {
+  if (instrumentation && type == kernel) {
     // Callback to produce gtpin correlation
     gtpin_produce_runtime_callstack(&gpu_op_ccts);
   }
@@ -602,8 +605,10 @@ opencl_api_initialize
  void
 )
 {
-  gpu_metrics_GPU_INST_enable();
-  gtpin_enable_profiling();
+  if (instrumentation) { // GTPin metrics/profiling only when instrumentation was requested
+    gpu_metrics_GPU_INST_enable();
+    gtpin_enable_profiling();
+  }
   atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);
 }
@@ -902,6 +907,16 @@ clEnqueueWriteBuffer
 }
 
 
+void
+opencl_enable_instrumentation
+(
+ void
+)
+{
+  instrumentation = true; // read by opencl_api_initialize and opencl_subscriber_callback
+}
+
+
 void
 opencl_api_finalize
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index ca847eacc5..6adae0f498 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -146,6 +146,13 @@ opencl_bind
 );
 
 
+void
+opencl_enable_instrumentation
+(
+	void
+);
+
+
 void
 opencl_api_finalize
 (
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index d3b42d5fe6..17c7c7fc75 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -66,10 +66,21 @@
 //******************************************************************************
 
 #define GPU_STRING "gpu=opencl"
+#define ENABLE_INSTRUMENTATION "gpu=opencl,inst"
+#define NO_THRESHOLD  1L
 static device_finalizer_fn_entry_t device_finalizer_shutdown;
 static device_finalizer_fn_entry_t device_trace_finalizer_shutdown;
 
 
+
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+static char opencl_name[128];
+
+
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -129,7 +140,7 @@ static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
   #ifndef HPCRUN_STATIC_LINK
-  return hpcrun_ev_is(ev_str, GPU_STRING);
+  return (hpcrun_ev_is(ev_str, GPU_STRING) || hpcrun_ev_is(ev_str, ENABLE_INSTRUMENTATION));
   #else
   return false;
   #endif
@@ -140,8 +151,19 @@ static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
   int nevents = (self->evl).nevents;
-  gpu_metrics_default_enable();
   TMSG(OPENCL,"nevents = %d", nevents);
+  gpu_metrics_default_enable();
+
+  char* evlist = METHOD_CALL(self, get_event_str);
+  char* event = start_tok(evlist);
+  long th; // threshold is extracted but not used afterwards
+  hpcrun_extract_ev_thresh(event, sizeof(opencl_name), opencl_name, &th, NO_THRESHOLD);
+
+  if (hpcrun_ev_is(opencl_name, GPU_STRING)) {
+    // default profiling mode: nothing extra to enable
+  } else if (hpcrun_ev_is(opencl_name, ENABLE_INSTRUMENTATION)) {
+    opencl_enable_instrumentation();
+  }
 }
 
 

From c9f4cbcb8ab3bbe16fea9b7aac7330ac1d3b18be Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Mon, 5 Oct 2020 19:39:41 +0000
Subject: [PATCH 078/177] Use a global map to store correlation id; the gtpin
 callback thread cannot flush channel

---
 src/tool/hpcrun/Makefile.am                   |   3 +-
 src/tool/hpcrun/Makefile.in                   |  18 +-
 .../gtpin-correlation-id-map.c                | 264 ++++++++++++++++++
 .../gtpin-correlation-id-map.h                | 114 ++++++++
 .../instrumentation/gtpin-instrumentation.c   |  53 ++--
 5 files changed, 432 insertions(+), 20 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 805511a936..04aae9e695 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -515,7 +515,8 @@ endif
 if OPT_ENABLE_GTPIN
 MY_GTPIN_FILES = \
 	gpu/instrumentation/kernel-data-map.c \
-	gpu/instrumentation/gtpin-instrumentation.c
+	gpu/instrumentation/gtpin-instrumentation.c \
+	gpu/instrumentation/gtpin-correlation-id-map.c
 endif
 
 if OPT_ENABLE_ROCM
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 0f758c8b9f..297d143238 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -543,6 +543,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
+	gpu/instrumentation/gtpin-correlation-id-map.c \
 	unwind/common/backtrace.c unwind/common/unw-throw.c \
 	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
 	unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
@@ -749,7 +750,8 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo
 @OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
 am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
 	unwind/common/libhpcrun_la-unw-throw.lo
@@ -1879,7 +1881,8 @@ MY_AARCH64_FILES = \
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
-@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation.c
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-correlation-id-map.c
 
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
@@ -2837,6 +2840,9 @@ gpu/instrumentation/libhpcrun_la-kernel-data-map.lo:  \
 gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo:  \
 	gpu/instrumentation/$(am__dirstamp) \
 	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_la-backtrace.lo:  \
 	unwind/common/$(am__dirstamp) \
 	unwind/common/$(DEPDIR)/$(am__dirstamp)
@@ -3731,6 +3737,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-api.Plo@am__quote@
@@ -5424,6 +5431,13 @@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo: gpu/instrumentation/g
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo `test -f 'gpu/instrumentation/gtpin-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation.c
 
+gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo: gpu/instrumentation/gtpin-correlation-id-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo `test -f 'gpu/instrumentation/gtpin-correlation-id-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-correlation-id-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-correlation-id-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo `test -f 'gpu/instrumentation/gtpin-correlation-id-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-correlation-id-map.c
+
 unwind/common/libhpcrun_la-backtrace.lo: unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT unwind/common/libhpcrun_la-backtrace.lo -MD -MP -MF unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo -c -o unwind/common/libhpcrun_la-backtrace.lo `test -f 'unwind/common/backtrace.c' || echo '$(srcdir)/'`unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Plo
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.c
new file mode 100644
index 0000000000..7ec4c5a258
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.c
@@ -0,0 +1,264 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <assert.h>
+#include <string.h>
+
+
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+
+#include "gtpin-correlation-id-map.h"
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define DEBUG 0
+
+#include "../gpu-print.h"
+
+
+// Local shorthands for the typed_splay_* macros: tree operations are bound to
+// the "correlation_id" instantiation; st_alloc/st_free work on a free list.
+#define st_insert				\
+  typed_splay_insert(correlation_id)
+
+#define st_lookup				\
+  typed_splay_lookup(correlation_id)
+
+#define st_delete				\
+  typed_splay_delete(correlation_id)
+
+#define st_forall				\
+  typed_splay_forall(correlation_id)
+
+#define st_count				\
+  typed_splay_count(correlation_id)
+
+#define st_alloc(free_list)			\
+  typed_splay_alloc(free_list, gtpin_correlation_id_map_entry_t)
+
+#define st_free(free_list, node)		\
+  typed_splay_free(free_list, node)
+
+//*****************************************************************************
+// type declarations
+//*****************************************************************************
+
+#undef typed_splay_node
+#define typed_splay_node(correlation_id) gtpin_correlation_id_map_entry_t
+
+// Map entry: two child links, the uint64 key, then the values kept per key.
+typedef struct typed_splay_node(correlation_id) {
+  struct typed_splay_node(correlation_id) *left;
+  struct typed_splay_node(correlation_id) *right;
+  uint64_t gtpin_correlation_id; // key
+  // NOTE(review): left/right/key presumably mirror the generic splay-uint64 node layout; confirm before reordering
+  gpu_op_ccts_t op_ccts; // copied by value from the caller's struct on insert
+  gpu_activity_channel_t *activity_channel; // pointer stored exactly as passed to insert
+  uint64_t submit_time;
+} typed_splay_node(correlation_id); 
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static gtpin_correlation_id_map_entry_t *map_root = NULL; // tree root handed to st_lookup/st_insert/st_delete
+// entry pool handed to st_alloc and st_free
+static gtpin_correlation_id_map_entry_t *free_list = NULL;
+// held by lookup, insert and delete while they touch map_root / free_list
+static spinlock_t gtpin_correlation_id_map_lock = SPINLOCK_UNLOCKED;
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
+typed_splay_impl(correlation_id)
+
+
+// Obtain one map entry through st_alloc, using this file's free_list.
+static gtpin_correlation_id_map_entry_t *
+gtpin_correlation_id_map_entry_alloc(void)
+{
+  return st_alloc(&free_list);
+}
+
+// Build a map entry for gtpin_correlation_id; *op_ccts is copied by value.
+static gtpin_correlation_id_map_entry_t *
+gtpin_correlation_id_map_entry_new
+(
+ uint64_t gtpin_correlation_id,
+ gpu_op_ccts_t *op_ccts,
+ gpu_activity_channel_t *activity_channel,
+ uint64_t submit_time
+)
+{
+  gtpin_correlation_id_map_entry_t *fresh = gtpin_correlation_id_map_entry_alloc();
+
+  // stored values first, then the key the tree orders on
+  fresh->submit_time = submit_time;
+  fresh->activity_channel = activity_channel;
+  fresh->op_ccts = *op_ccts;
+  fresh->gtpin_correlation_id = gtpin_correlation_id;
+  return fresh;
+}
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+// Find the entry keyed by gtpin_correlation_id and return st_lookup's result
+// unchanged. The map lock is held only for the lookup itself; it does not
+// cover the caller's later use of the returned entry.
+gtpin_correlation_id_map_entry_t *
+gtpin_correlation_id_map_lookup
+(
+ uint64_t gtpin_correlation_id
+)
+{
+  gtpin_correlation_id_map_entry_t *found;
+  spinlock_lock(&gtpin_correlation_id_map_lock);
+  found = st_lookup(&map_root, gtpin_correlation_id);
+  spinlock_unlock(&gtpin_correlation_id_map_lock);
+  return found;
+}
+
+
+// Record (op_ccts, activity_channel, submit_time) under gtpin_correlation_id.
+// An existing entry for the key is overwritten in place; otherwise a new
+// entry is created and inserted. The whole update runs under the map lock.
+void
+gtpin_correlation_id_map_insert
+(
+ uint64_t gtpin_correlation_id,
+ gpu_op_ccts_t *op_ccts,
+ gpu_activity_channel_t *activity_channel,
+ uint64_t submit_time
+)
+{
+  spinlock_lock(&gtpin_correlation_id_map_lock);
+  gtpin_correlation_id_map_entry_t *entry = st_lookup(&map_root, gtpin_correlation_id);
+  if (entry) {
+    entry->op_ccts = *op_ccts;
+    entry->activity_channel = activity_channel;
+    entry->submit_time = submit_time;
+  } else {
+    // reuse the outer variable instead of shadowing it with a second declaration
+    entry = gtpin_correlation_id_map_entry_new(gtpin_correlation_id, op_ccts, activity_channel, submit_time);
+    st_insert(&map_root, entry);
+  }
+  spinlock_unlock(&gtpin_correlation_id_map_lock);
+}
+
+
+// Unlink the entry for gtpin_correlation_id and recycle it; a miss is a no-op.
+void
+gtpin_correlation_id_map_delete
+(
+ uint64_t gtpin_correlation_id
+)
+{
+  spinlock_lock(&gtpin_correlation_id_map_lock);
+  gtpin_correlation_id_map_entry_t *node = st_delete(&map_root, gtpin_correlation_id);
+  if (node) { // nothing to hand to st_free when st_delete found no entry
+    st_free(&free_list, node);
+  }
+  spinlock_unlock(&gtpin_correlation_id_map_lock);
+}
+
+gpu_op_ccts_t
+gtpin_correlation_id_map_entry_op_ccts_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+)
+{
+  gpu_op_ccts_t stored = entry->op_ccts; // struct copy; the entry is not modified
+  return stored;
+}
+
+gpu_activity_channel_t *
+gtpin_correlation_id_map_entry_activity_channel_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+)
+{
+  gpu_activity_channel_t *channel = entry->activity_channel; // pointer as stored by insert
+  return channel;
+}
+
+uint64_t
+gtpin_correlation_id_map_entry_submit_time_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+)
+{
+  uint64_t when = entry->submit_time; // value recorded by insert
+  return when;
+}
+
+//*****************************************************************************
+// debugging code
+//*****************************************************************************
+
+uint64_t
+gtpin_correlation_id_map_count
+(
+ void
+)
+{
+  return st_count(map_root); // NOTE(review): reads map_root without taking gtpin_correlation_id_map_lock, unlike lookup/insert/delete
+}
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h
new file mode 100644
index 0000000000..07a2d2c596
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h
@@ -0,0 +1,114 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gtpin_correlation_id_map_h
+#define gtpin_correlation_id_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct gtpin_correlation_id_map_entry_t gtpin_correlation_id_map_entry_t;
+
+typedef struct gpu_op_ccts_t gpu_op_ccts_t;
+
+typedef struct gpu_activity_channel_t gpu_activity_channel_t;
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+// Look up the entry stored under gtpin_correlation_id (callers check the result against NULL).
+gtpin_correlation_id_map_entry_t *
+gtpin_correlation_id_map_lookup
+(
+ uint64_t gtpin_correlation_id
+);
+
+// Store *op_ccts (by value), activity_channel and submit_time under the key, replacing any existing values.
+void
+gtpin_correlation_id_map_insert
+(
+ uint64_t gtpin_correlation_id,
+ gpu_op_ccts_t *op_ccts,
+ gpu_activity_channel_t *activity_channel,
+ uint64_t submit_time
+);
+
+// Remove the entry keyed by gtpin_correlation_id and return its node to the map's free list.
+void
+gtpin_correlation_id_map_delete
+(
+ uint64_t gtpin_correlation_id
+);
+
+// Return a by-value copy of the op_ccts held in entry; entry must be non-NULL (it is dereferenced).
+gpu_op_ccts_t
+gtpin_correlation_id_map_entry_op_ccts_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+);
+
+// Return the activity channel pointer held in entry; entry must be non-NULL (it is dereferenced).
+gpu_activity_channel_t *
+gtpin_correlation_id_map_entry_activity_channel_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+);
+
+// Return the submit_time held in entry; entry must be non-NULL (it is dereferenced).
+uint64_t 
+gtpin_correlation_id_map_entry_submit_time_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+);
+
+#endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index deb434c96f..37d539c807 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -79,6 +79,7 @@
 #include <lib/prof-lean/crypto-hash.h>
 #include <lib/prof-lean/spinlock.h>
 
+#include "gtpin-correlation-id-map.h"
 #include "gtpin-instrumentation.h"
 #include "kernel-data.h"
 #include "kernel-data-map.h"
@@ -164,7 +165,8 @@ createKernelNode
     gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
     hpcrun_safe_exit();
 
-    gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+    gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+    gtpin_correlation_id_map_insert(correlation_id, &gpu_op_ccts, activity_channel, cpu_submit_time);
   }
 }
 
@@ -282,9 +284,6 @@ activityNotify
  void
 )
 {
-  // Once you attribute this kernel, you want to update the host_correlation_id entry.
-  // Otherwise, the same memory might be reclaimed
-  // gpu_monitoring_thread_activities_ready(allow_update);
   gpu_monitoring_thread_activities_ready();
 }
 
@@ -316,12 +315,20 @@ kernelBlockActivityProcess
  uint64_t correlation_id,
  uint32_t loadmap_module_id,
  uint64_t offset,
- uint64_t execution_count
+ uint64_t execution_count,
+ gpu_activity_channel_t *activity_channel,
+ cct_node_t *host_op_node
 )
 {
   gpu_activity_t ga;
   kernelBlockActivityTranslate(&ga, correlation_id, loadmap_module_id, offset, execution_count);
-  gpu_activity_process(&ga);
+
+  ip_normalized_t ip = ga.details.kernel_block.pc;
+  cct_node_t *cct_child = hpcrun_cct_insert_ip_norm(host_op_node, ip); // how to set the ip_norm
+  if (cct_child) {
+    ga.cct_node = cct_child;
+    gpu_activity_channel_produce(activity_channel, &ga);
+  }
 }
 
 
@@ -407,9 +414,7 @@ onKernelRun
  void *v
 )
 {
-  ETMSG(OPENCL, "onKernelRun starting. Inserted: correlation %llu", (uint64_t)kernelExec);
-
-  gpu_activity_channel_consume(gpu_metrics_attribute);
+  ETMSG(OPENCL, "onKernelRun starting. Inserted: correlation %"PRIu64"", (uint64_t)kernelExec);
 
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPin_KernelProfilingActive(kernelExec, 1);
@@ -429,11 +434,27 @@ onKernelComplete
   // Receive correlations from the host thread.
   // XXX(Keren): This is done usually at the monitor thread, but not guaranteed.
   // For safety concern, we need to adopt the multiplexer framework.
-  activityNotify();  
+  //activityNotify();  
+  
+  uint64_t correlation_id = (uint64_t)kernelExec;
+
+  gtpin_correlation_id_map_entry_t *entry =
+    gtpin_correlation_id_map_lookup(correlation_id);
+
+  ETMSG(OPENCL, "onKernelComplete starting. Lookup: correlation %"PRIu64", result %p", correlation_id, entry);
+
+  if (entry == NULL) {
+    // XXX(Keren): the opencl/level zero api's kernel launch is not wrapped
+    return;
+  }
+
+  gpu_activity_channel_t *activity_channel = gtpin_correlation_id_map_entry_activity_channel_get(entry);
+  gpu_op_ccts_t gpu_op_ccts = gtpin_correlation_id_map_entry_op_ccts_get(entry);
+  cct_node_t *host_op_node = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel);
 
   GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
   GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
-  ETMSG(OPENCL, "onKernelComplete starting. Lookup: correlation %llu, kernel: %llu", (uint64_t)kernelExec, (uint64_t)kernel);
+  ETMSG(OPENCL, "onKernelComplete starting. Lookup: kernel: %"PRIu64"", (uint64_t)kernel);
   assert(kernel_data_map_lookup((uint64_t)kernel) != 0);
 
   kernel_data_map_entry_t *kernel_data_map_entry = kernel_data_map_lookup((uint64_t)kernel);
@@ -459,8 +480,8 @@ onKernelComplete
 
     kernel_data_gtpin_inst_t *inst = block->inst;
     while (inst != NULL) {
-      kernelBlockActivityProcess((uint64_t)kernelExec, kernel_data.loadmap_module_id,
-        inst->offset, execution_count);
+      kernelBlockActivityProcess(correlation_id, kernel_data.loadmap_module_id,
+        inst->offset, execution_count, activity_channel, host_op_node);
       inst = inst->next;
     }
     block = block->next;
@@ -493,9 +514,6 @@ gtpin_enable_profiling
   // Use opencl/level zero runtime stack
   gtpin_use_runtime_callstack = true;
 
-  // Enable host correlation id replace
-  gpu_host_correlation_map_replace_set(true);
-
   GTPin_OnKernelBuild(onKernelBuild, NULL);
   GTPin_OnKernelRun(onKernelRun, NULL);
   GTPin_OnKernelComplete(onKernelComplete, NULL);
@@ -511,6 +529,7 @@ gtpin_produce_runtime_callstack
 )
 {
   if (gtpin_use_runtime_callstack) {
-    gpu_correlation_channel_produce(gtpin_correlation_id, gpu_op_ccts, gtpin_cpu_submit_time);
+    gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+    gtpin_correlation_id_map_insert(gtpin_correlation_id, gpu_op_ccts, activity_channel, gtpin_cpu_submit_time);
   }
 }

From 4a494b7977dbb742b5befa242ab48c6a5ca6146a Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Mon, 5 Oct 2020 19:44:13 +0000
Subject: [PATCH 079/177] Enable instruction metrics in gtpin

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 11 +++++------
 src/tool/hpcrun/sample-sources/opencl.c | 12 ++++++------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 182cbc66cf..318eb8718e 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -605,10 +605,9 @@ opencl_api_initialize
  void
 )
 {
-	if (instrumentation) {
-  	gpu_metrics_GPU_INST_enable();
-  	gtpin_enable_profiling();
-	}
+  if (instrumentation) {
+    gtpin_enable_profiling();
+  }
   atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);
 }
@@ -914,10 +913,10 @@ clEnqueueWriteBuffer
 void
 opencl_enable_instrumentation
 (
-	void
+ void
 )
 {
-	instrumentation = true;
+  instrumentation = true;
 }
 
 
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index 17c7c7fc75..bc5e338557 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -154,15 +154,15 @@ METHOD_FN(process_event_list, int lush_metrics)
   TMSG(OPENCL,"nevents = %d", nevents);
   gpu_metrics_default_enable();
 
-	char* evlist = METHOD_CALL(self, get_event_str);
+  char* evlist = METHOD_CALL(self, get_event_str);
   char* event = start_tok(evlist);
-	long th;
+  long th;
   hpcrun_extract_ev_thresh(event, sizeof(opencl_name), opencl_name,
     &th, NO_THRESHOLD);
 
   if (hpcrun_ev_is(opencl_name, GPU_STRING)) {
   } else if (hpcrun_ev_is(opencl_name, ENABLE_INSTRUMENTATION)) {
-		opencl_enable_instrumentation();
+    opencl_enable_instrumentation();
   }
 }
 
@@ -202,9 +202,9 @@ METHOD_FN(display_events)
   printf("Name\t\tDescription\n");
   printf("---------------------------------------------------------------------------\n");
   printf("%s\t\tOperation-level monitoring for opencl on a GPU.\n"
-	  "\t\tCollect timing information on GPU kernel invocations,\n"
-	  "\t\tmemory copies, etc.\n",
-	  GPU_STRING);
+    "\t\tCollect timing information on GPU kernel invocations,\n"
+    "\t\tmemory copies, etc.\n",
+    GPU_STRING);
   printf("\n");
 }
 

From 7fcd34d84448c0420784e10a8ed14055ab650841 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Mon, 5 Oct 2020 15:36:42 -0500
Subject: [PATCH 080/177] changing CL_TARGET_OPENCL_VERSION to 2.2. Some hacks
 for opencl profiling also have been removed

---
 src/lib/prof-lean/hpcrun-opencl.h       |  2 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 23 ++++++++++++-----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/lib/prof-lean/hpcrun-opencl.h b/src/lib/prof-lean/hpcrun-opencl.h
index 59e7cce526..bff7a75ff0 100644
--- a/src/lib/prof-lean/hpcrun-opencl.h
+++ b/src/lib/prof-lean/hpcrun-opencl.h
@@ -49,7 +49,7 @@
 // system includes
 //******************************************************************************
 
-#define CL_TARGET_OPENCL_VERSION 120
+#define CL_TARGET_OPENCL_VERSION 220
 #include <CL/cl.h>
 
 #endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 182cbc66cf..053b782af4 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -237,7 +237,7 @@ OPENCL_QUEUE_FN
   (
    cl_context,
    cl_device_id,
-   const cl_bitfield *,
+   const cl_queue_properties *,
    cl_int*
   )
 );
@@ -605,10 +605,11 @@ opencl_api_initialize
  void
 )
 {
-	if (instrumentation) {
-  	gpu_metrics_GPU_INST_enable();
-  	gtpin_enable_profiling();
-	}
+  ETMSG(OPENCL, "CL_TARGET_OPENCL_VERSION: %d", CL_TARGET_OPENCL_VERSION);
+  if (instrumentation) {
+	gpu_metrics_GPU_INST_enable();
+	gtpin_enable_profiling();
+  }
   atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);
 }
@@ -719,13 +720,13 @@ clCreateCommandQueueWithProperties
 (
  cl_context context,
  cl_device_id device,
- const cl_bitfield* properties,
+ const cl_queue_properties* properties,
  cl_int* errcode_ret
 )
 {
-  cl_bitfield *queue_properties = (cl_bitfield *)properties;
+  cl_queue_properties *queue_properties = (cl_queue_properties *)properties;
   if (properties == NULL) {
-    queue_properties = (cl_bitfield *)malloc(sizeof(cl_bitfield) * 3);
+    queue_properties = (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * 3);
     queue_properties[0] = CL_QUEUE_PROPERTIES;
     queue_properties[1] = CL_QUEUE_PROFILING_ENABLE;
     queue_properties[2] = 0;
@@ -736,7 +737,7 @@ clCreateCommandQueueWithProperties
       if (properties[props_count] == CL_QUEUE_PROPERTIES) {
         queue_props_id = props_count;
         ++props_count;
-      } else if (properties[props_count] == 0x1094) {
+      } else if (properties[props_count] == CL_QUEUE_SIZE) {
         // TODO(Keren): A temporay hack
         ++props_count;
       }
@@ -744,7 +745,7 @@ clCreateCommandQueueWithProperties
     }
 
     if (queue_props_id >= 0 && queue_props_id + 1 < props_count) {
-      queue_properties = (cl_bitfield *)malloc(sizeof(cl_bitfield) * (props_count + 1));
+      queue_properties = (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * (props_count + 1));
       for (int i = 0; i < props_count; ++i) {
         queue_properties[i] = properties[i];
       }
@@ -753,7 +754,7 @@ clCreateCommandQueueWithProperties
       queue_properties[props_count] = 0;
     } else {
       // We do not have a queue property entry, need to allocate a queue property entry and set up
-      queue_properties = (cl_bitfield *)malloc(sizeof(cl_bitfield) * (props_count + 3));
+      queue_properties = (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * (props_count + 3));
       for (int i = 0; i < props_count; ++i) {
         queue_properties[i] = properties[i];
       }

From d77e81e56c399d947d5c259ba710f50270f502aa Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 5 Oct 2020 19:22:19 -0500
Subject: [PATCH 081/177] thread_fini_action: comment out
 opencl_api_finalize(NULL); if called early it calls
 gpu_activity_multiplexer_fini for all threads -> deadlock in gpu_trace_record

---
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c | 14 +++++++--
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h | 10 ++++++-
 src/tool/hpcrun/gpu/gpu-trace.c               | 23 +++++++++------
 src/tool/hpcrun/gpu/gpu-trace.h               |  1 +
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 29 +++++++++++++------
 src/tool/hpcrun/gpu/opencl/opencl-intercept.h |  1 +
 src/tool/hpcrun/sample-sources/opencl.c       |  2 +-
 7 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index ac04f3e586..b3587eecfd 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -130,10 +130,10 @@ gpu_trace_channel_set_get_ptr
 )
 {
   return channel_set->channel_set_ptr;
-};
+}
 
 
-int
+uint32_t
 gpu_trace_channel_set_get_channel_num
 (
  gpu_trace_channel_set_t *channel_set
@@ -143,6 +143,16 @@ gpu_trace_channel_set_get_channel_num
 }
 
 
+// Accessor for the pthread_t kept in the channel set's thread field.
+pthread_t
+gpu_trace_channel_set_get_thread
+(
+ gpu_trace_channel_set_t *channel_set
+)
+{
+  return channel_set->thread;
+}
+
 pthread_t
 gpu_trace_demultiplexer_push
 (
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
index bf09153a9e..18a59aa70f 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.h
@@ -78,7 +78,15 @@ gpu_trace_channel_set_get_ptr
  gpu_trace_channel_set_t *channel_set
 );
 
-int
+
+pthread_t
+gpu_trace_channel_set_get_thread
+(
+ gpu_trace_channel_set_t *channel_set
+);
+
+
+uint32_t
 gpu_trace_channel_set_get_channel_num
 (
  gpu_trace_channel_set_t *channel_set
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index bb39919410..a97cd99490 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -45,17 +45,9 @@
 // system includes
 //******************************************************************************
 
-#include <lib/prof-lean/stdatomic.h>
+#include <assert.h>
 #include <pthread.h>
 
-#include <hpcrun/cct/cct.h>
-#include <hpcrun/control-knob.h>
-#include <hpcrun/thread_data.h>
-#include <hpcrun/threadmgr.h>
-#include <hpcrun/trace.h>
-#include <hpcrun/write_data.h>
-
-#include <assert.h>
 
 
 //******************************************************************************
@@ -70,6 +62,15 @@
 // local includes
 //******************************************************************************
 
+#include <lib/prof-lean/stdatomic.h>
+
+#include <hpcrun/cct/cct.h>
+#include <hpcrun/control-knob.h>
+#include <hpcrun/thread_data.h>
+#include <hpcrun/threadmgr.h>
+#include <hpcrun/trace.h>
+#include <hpcrun/write_data.h>
+
 #include "gpu-context-id-map.h"
 #include "gpu-monitoring.h"
 #include "gpu-trace.h"
@@ -248,6 +249,7 @@ gpu_trace_start_adjust
 
   if (end < last_end){
     // If stream becomes unordered, mark it (it will be sorted in prof)
+    PRINT("TRACE NOT ORDERED: Trace_id = %u\n", td->core_profile_trace_data.id);
     td->core_profile_trace_data.traceOrdered = false;
     return start;
   }
@@ -321,6 +323,9 @@ gpu_trace_record
     //getting data from a trace channel
     gpu_trace_channel_set_process(channel_set);
     gpu_trace_channel_set_await(channel_set);
+    PRINT("TraceRecord_processed: thread: %ld, set_index = %d\n",
+           gpu_trace_channel_set_get_thread(channel_set),
+           gpu_trace_channel_set_get_channel_num(channel_set));
   }
 
   gpu_trace_channel_set_process(channel_set);
diff --git a/src/tool/hpcrun/gpu/gpu-trace.h b/src/tool/hpcrun/gpu/gpu-trace.h
index 62d57e72ef..d60ecbef27 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.h
+++ b/src/tool/hpcrun/gpu/gpu-trace.h
@@ -44,6 +44,7 @@
 #ifndef gpu_trace_h
 #define gpu_trace_h
 
+#include <stdbool.h>
 
 
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index cf09ddff72..c0eaa155e8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -321,6 +321,7 @@ opencl_object_t *cb_data
     cb_basic.kind = cb_data->kind;
     cb_basic.type = cb_data->details.mem_cb.type;
   }
+  cb_basic.cct_node = cb_data->details.cct_node;
 
   return cb_basic;
 }
@@ -333,11 +334,12 @@ opencl_cb_basic_print
 )
 {
 
-  ETMSG(OPENCL, " %s | Activity kind: %s | type: %s | correlation id: %"PRIu64 "",
+  ETMSG(OPENCL, " %s | Activity kind: %s | type: %s | correlation id: %"PRIu64 "| cct_node = %p",
         title,
         gpu_kind_to_string(cb_basic.kind),
         gpu_type_to_string(cb_basic.type),
-        cb_basic.correlation_id);
+        cb_basic.correlation_id,
+        cb_basic.cct_node);
 
 }
 
@@ -358,15 +360,11 @@ opencl_subscriber_callback
 )
 {
 
+  gpu_placeholder_type_t placeholder_type;
   uint64_t correlation_id = getCorrelationId();
 
   opencl_pending_operations_adjust(1);
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
-  gpu_op_ccts_t gpu_op_ccts;
-  gpu_correlation_id_map_insert(correlation_id, correlation_id);
-  cct_node_t *api_node = 
-    gpu_application_thread_correlation_callback(correlation_id);
-
 
   switch (cb_info->kind) {
 
@@ -376,9 +374,13 @@ opencl_subscriber_callback
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
                                        gpu_placeholder_type_copyin);
 
+        placeholder_type = gpu_placeholder_type_copyin;
+
       }else if (cb_info->details.mem_cb.type == GPU_MEMCPY_D2H){
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
                                        gpu_placeholder_type_copyout);
+
+        placeholder_type = gpu_placeholder_type_copyout;
       }
       break;
 
@@ -389,19 +391,28 @@ opencl_subscriber_callback
 
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
 				   gpu_placeholder_type_trace);
+
+      placeholder_type = gpu_placeholder_type_kernel;
       break;
     default:
       assert(0);
   }
 
+
+  gpu_correlation_id_map_insert(correlation_id, correlation_id);
+  cct_node_t *api_node =
+  gpu_application_thread_correlation_callback(correlation_id);
+
+  gpu_op_ccts_t gpu_op_ccts;
+
   hpcrun_safe_enter();
   gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
+  cct_node_t *cct_ph = gpu_op_ccts_get(&gpu_op_ccts, placeholder_type);
   hpcrun_safe_exit();
 
   gpu_activity_channel_consume(gpu_metrics_attribute);
 
-
-  cb_info->details.cct_node = api_node;
+  cb_info->details.cct_node = cct_ph;
   cb_info->details.initiator_channel = gpu_activity_channel_get();
   cb_info->details.submit_time = CPU_NANOTIME();
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
index 1ad4210ddd..332b036b8c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.h
@@ -117,6 +117,7 @@ typedef struct cl_basic_callback_t {
   uint64_t correlation_id;
   gpu_activity_kind_t kind;
   gpu_memcpy_type_t type;
+  cct_node_t *cct_node;
 } cl_basic_callback_t;
 
 
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index f4691c50fb..92b99823ca 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -106,7 +106,7 @@ static void
 METHOD_FN(thread_fini_action)
 {
   TMSG(OPENCL, "thread_fini_action");
-  opencl_api_finalize(NULL);
+//  opencl_api_finalize(NULL);
 }
 
 

From 136bc3472ffa27a4683ae9c4b93253cb8bea581b Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 6 Oct 2020 18:10:53 -0500
Subject: [PATCH 082/177] opencl finalizer-> thread_finalize, process_finalize

---
 .../hpcrun/gpu/gpu-activity-multiplexer.c     | 24 ++++--------
 .../hpcrun/gpu/gpu-operation-channel-set.c    | 37 +++++++++++++++++--
 .../hpcrun/gpu/gpu-operation-channel-set.h    | 19 ++++++++--
 src/tool/hpcrun/gpu/gpu-trace.c               |  2 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 15 ++++++--
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  8 +++-
 src/tool/hpcrun/sample-sources/opencl.c       |  4 +-
 7 files changed, 76 insertions(+), 33 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 51230238d3..a5ab08a287 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -99,23 +99,16 @@ gpu_activity_record
  void
 )
 {
+  int current_operation_channels_count;
 
   while (!atomic_load(&stop_activity_flag)){
-    int current_operation_channels_count = atomic_load(&operation_channels_count);
-
-    for (int set_index = 0; set_index < current_operation_channels_count ; ++set_index) {
-      gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
-
-      // TODO: change waiting policy to getting items when full
-      gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
-    }
+    current_operation_channels_count = atomic_load(&operation_channels_count);
+    gpu_operation_channel_set_process(current_operation_channels_count);
   }
 
-  int current_operation_channels_count = atomic_load(&operation_channels_count);
-  for (int set_index = 0; set_index < current_operation_channels_count; ++set_index) {
-    gpu_operation_channel_set_apply(gpu_operation_channel_consume, set_index);
-    gpu_operation_channel_set_apply(gpu_operation_channel_await, set_index);
-  }
+  current_operation_channels_count = atomic_load(&operation_channels_count);
+  gpu_operation_channel_set_process(current_operation_channels_count);
+  gpu_operation_channel_set_await(current_operation_channels_count);
 
   gpu_trace_fini(NULL);
   atomic_store(&gpu_trace_finished, true);
@@ -181,10 +174,7 @@ gpu_activity_multiplexer_fini
 
   atomic_store(&stop_activity_flag, true);
 
-  int current_operation_channels_count = atomic_load(&operation_channels_count);
-  for (int set_index = 0; set_index < current_operation_channels_count; ++set_index) {
-    gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, set_index);
-  }
+  gpu_operation_channel_set_notify(atomic_load(&operation_channels_count));
 
   while (!atomic_load(&gpu_trace_finished));
 }
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index b2bd5aad68..4c6cb6dd41 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -145,6 +145,18 @@ gpu_operation_channel_set_forall
 }
 
 
+static void
+gpu_operation_channel_set_apply
+(
+gpu_operation_channel_fn_t channel_fn,
+int channel_num
+)
+{
+  for (int channel_idx = 0; channel_idx < channel_num; ++channel_idx) {
+    gpu_operation_channel_set_forall(channel_fn, channel_idx);
+  }
+}
+
 
 //******************************************************************************
 // interface operations
@@ -177,13 +189,30 @@ gpu_operation_channel_set_insert
 
 
 void
-gpu_operation_channel_set_apply
+gpu_operation_channel_set_process
 (
- gpu_operation_channel_fn_t channel_fn,
- int set_index
+ int channel_num
 )
 {
-  gpu_operation_channel_set_forall(channel_fn, set_index);
+  gpu_operation_channel_set_apply(gpu_operation_channel_consume, channel_num);
 }
 
 
+void
+gpu_operation_channel_set_await
+(
+ int channel_num
+)
+{
+  gpu_operation_channel_set_apply(gpu_operation_channel_await, channel_num);
+}
+
+
+void
+gpu_operation_channel_set_notify
+(
+ int channel_num
+)
+{
+  gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, channel_num);
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
index add13e2847..41682776b3 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.h
@@ -83,13 +83,24 @@ gpu_operation_channel_set_insert
 
 
 void
-gpu_operation_channel_set_apply
+gpu_operation_channel_set_process
 (
- gpu_operation_channel_fn_t channel_fn,
- int set_index
+ int channel_num
+);
+
+
+void
+gpu_operation_channel_set_await
+(
+ int channel_num
 );
 
-void gpu_operation_channel_set_release(int set_index);
+
+void
+gpu_operation_channel_set_notify
+(
+ int channel_num
+);
 
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index a97cd99490..fd2d8931a0 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -322,7 +322,7 @@ gpu_trace_record
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
     gpu_trace_channel_set_process(channel_set);
-    gpu_trace_channel_set_await(channel_set);
+
     PRINT("TraceRecord_processed: thread: %ld, set_index = %d\n",
            gpu_trace_channel_set_get_thread(channel_set),
            gpu_trace_channel_set_get_channel_num(channel_set));
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index c0eaa155e8..80167b66e5 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -519,14 +519,21 @@ opencl_bind
 
 
 void
-opencl_api_finalize
+opencl_api_thread_finalize
 (
   void *args
 )
 {
   opencl_wait_for_pending_operations();
-  gpu_activity_multiplexer_fini();
-
   gpu_application_thread_process_activities();
-
 }
+
+void
+opencl_api_process_finalize
+(
+void *args
+)
+{
+  opencl_api_thread_finalize(NULL);
+  gpu_activity_multiplexer_fini();
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index 7cd00fb3b8..2cae426a93 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -138,11 +138,17 @@ opencl_bind
 
 
 void
-opencl_api_finalize
+opencl_api_thread_finalize
 (
   void *
 );
 
 
+void
+opencl_api_process_finalize
+(
+void *
+);
+
 
 #endif  //_OPENCL_API_H_
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index 92b99823ca..8301d98915 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -106,7 +106,7 @@ static void
 METHOD_FN(thread_fini_action)
 {
   TMSG(OPENCL, "thread_fini_action");
-//  opencl_api_finalize(NULL);
+  opencl_api_thread_finalize(NULL);
 }
 
 
@@ -161,7 +161,7 @@ METHOD_FN(finalize_event_list)
 //  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
 //  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
 
-  device_finalizer_shutdown.fn = opencl_api_finalize;
+  device_finalizer_shutdown.fn = opencl_api_process_finalize;
   device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_shutdown);
 
 }

From abb1756f5a2b0f44bb7d0ae6e04ce4a79ca7098a Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Wed, 7 Oct 2020 05:13:27 -0500
Subject: [PATCH 083/177] opencl H2D calls that bypass clEnqueueWriteBuffer and
 directly call clSetKernelArg are now being recorded. ISSUE: some of the added
 H2D nodes are missing

---
 src/tool/hpcrun/Makefile.am                   |   3 +-
 src/tool/hpcrun/Makefile.in                   |  17 +-
 .../gpu/opencl/opencl-activity-translate.c    |  19 ++
 .../gpu/opencl/opencl-activity-translate.h    |  10 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 219 ++++++++++++-
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c   | 300 ++++++++++++++++++
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h   | 149 +++++++++
 src/tool/hpcrun/sample-sources/opencl.c       |   4 +-
 8 files changed, 712 insertions(+), 9 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 04aae9e695..a40362084c 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -509,7 +509,8 @@ if OPT_ENABLE_OPENCL
 MY_OPENCL_FILES = sample-sources/opencl.c \
 	gpu/opencl/opencl-api.c \
 	gpu/opencl/opencl-memory-manager.c \
-	gpu/opencl/opencl-activity-translate.c 
+	gpu/opencl/opencl-activity-translate.c \
+	gpu/opencl/opencl-h2d-map.c 
 endif
 
 if OPT_ENABLE_GTPIN
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 297d143238..f6e7afd068 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -541,6 +541,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
+	gpu/opencl/opencl-h2d-map.c \
 	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
 	gpu/instrumentation/gtpin-correlation-id-map.c \
@@ -747,7 +748,8 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
@@ -1877,7 +1879,8 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_OPENCL_TRUE@MY_OPENCL_FILES = sample-sources/opencl.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c 
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-h2d-map.c 
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
@@ -2828,6 +2831,8 @@ gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
 gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/instrumentation/$(am__dirstamp):
 	@$(MKDIR_P) gpu/instrumentation
 	@: > gpu/instrumentation/$(am__dirstamp)
@@ -3766,6 +3771,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_cilk_la-agent-cilk.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_pthread_la-agent-pthread.Plo@am__quote@
@@ -5417,6 +5423,13 @@ gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
 
+gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/opencl-h2d-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-h2d-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-h2d-map.lo `test -f 'gpu/opencl/opencl-h2d-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-h2d-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-h2d-map.c' object='gpu/opencl/libhpcrun_la-opencl-h2d-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-h2d-map.lo `test -f 'gpu/opencl/opencl-h2d-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-h2d-map.c
+
 gpu/instrumentation/libhpcrun_la-kernel-data-map.lo: gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 972914a3ea..c5dc08c402 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -143,3 +143,22 @@ opencl_activity_translate
   }
   cstack_ptr_set(&(ga->next), 0);
 }
+
+
+void
+opencl_clSetKernelArg_activity_translate
+(
+	gpu_activity_t *ga,
+	uint64_t correlation_id,
+	size_t size,
+	uint64_t start_time,
+	uint64_t end_time
+)
+{
+  ga->details.memcpy.correlation_id = correlation_id;
+  ga->details.memcpy.bytes = size;
+  ga->details.memcpy.copyKind = GPU_MEMCPY_H2D;
+  ga->kind = GPU_ACTIVITY_MEMCPY;
+  set_gpu_interval(&ga->details.interval, start_time, end_time);
+  cstack_ptr_set(&(ga->next), 0);
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index 6c0f6f257d..cd01d23089 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -68,5 +68,13 @@ opencl_activity_translate
 );
 
 
-
+void
+opencl_clSetKernelArg_activity_translate
+(
+	gpu_activity_t *,
+	uint64_t,
+	size_t,
+	uint64_t,
+	uint64_t
+);
 #endif  //_OPENCL_ACTIVITY_TRANSLATE_H_
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index b82dca9dab..a6f8628fc5 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -71,13 +71,16 @@
 #include <hpcrun/messages/messages.h>
 #include <hpcrun/sample-sources/libdl.h>
 #include <hpcrun/files.h>
+#include <hpcrun/utilities/hpcrun-nanotime.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
+#include <lib/prof-lean/splay-uint64.h>
 #include <lib/prof-lean/stdatomic.h>
 #include <lib/prof-lean/usec_time.h>
 
 #include "opencl-api.h"
 #include "opencl-activity-translate.h"
 #include "opencl-memory-manager.h"
+#include "opencl-h2d-map.h"
 
 
 
@@ -165,6 +168,8 @@
   macro(clEnqueueNDRangeKernel)  \
   macro(clEnqueueReadBuffer)  \
   macro(clEnqueueWriteBuffer)  \
+  macro(clCreateBuffer)  \
+  macro(clSetKernelArg)  \
   macro(clGetEventProfilingInfo)  \
   macro(clReleaseEvent)  \
   macro(clSetEventCallback)
@@ -180,6 +185,9 @@
 #define OPENCL_QUEUE_FN(fn, args)      \
   static cl_command_queue (*OPENCL_FN_NAME(fn)) args
 
+#define OPENCL_CREATEBUFFER_FN(fn, args)      \
+  static cl_mem (*OPENCL_FN_NAME(fn)) args
+
 #define HPCRUN_OPENCL_CALL(fn, args) (OPENCL_FN_NAME(fn) args)
 
 #define LINE_TABLE_FLAG " -gline-tables-only "
@@ -294,6 +302,31 @@ OPENCL_FN
 );
 
 
+OPENCL_CREATEBUFFER_FN
+(
+  clCreateBuffer,
+  (
+    cl_context,
+    cl_mem_flags,
+    size_t,
+    void *,
+    cl_int *
+  )
+);
+
+
+OPENCL_FN
+(
+  clSetKernelArg,
+  (
+    cl_kernel kernel,
+    cl_uint arg_index,
+    size_t arg_size,
+    const void* arg_value
+  )
+);
+
+
 OPENCL_FN
 (
   clGetEventProfilingInfo,
@@ -330,6 +363,7 @@ OPENCL_FN
 
 
 static atomic_ullong opencl_pending_operations;
+static atomic_ullong opencl_h2d_pending_operations;
 static atomic_long correlation_id;
 static bool instrumentation = false;
 
@@ -392,6 +426,16 @@ clBuildProgramCallback
 }
 
 
+static void
+opencl_h2d_pending_operations_adjust
+(
+ int value
+)
+{
+  atomic_fetch_add(&opencl_h2d_pending_operations, value);
+}
+
+
 static void
 opencl_pending_operations_adjust
 (
@@ -426,7 +470,108 @@ opencl_activity_process
 
 
 static void
-opencl_wait_for_pending_operations
+opencl_clSetKernelArg_activity_process
+(
+ uint64_t correlation_id,
+ opencl_h2d_map_entry_t *entry
+)
+{
+  gpu_activity_t gpu_activity;
+	size_t size = opencl_h2d_map_entry_size_get(entry); 
+	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
+	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
+  opencl_clSetKernelArg_activity_translate(&gpu_activity, correlation_id, size, start_time, end_time);
+  gpu_activity_process(&gpu_activity);
+}
+
+
+static uint64_t
+opencl_get_buffer_id
+(
+  const void *arg
+)
+{
+  cl_mem buffer = *(cl_mem*)arg;
+  return (uint64_t)buffer;
+}
+
+
+static bool
+opencl_isClArgBuffer
+(
+  const void *arg
+)
+{
+	/*
+	 * There are 2 scenarios in which opencl_isClArgBuffer will return false
+	 * 1. When clCreateBuffer was not called for arg before calling clSetKernelArg
+	 * 2. clEnqueueWriteBuffer is being called for arg. We shouldnt be recording duplicate H2D calls
+	 * */
+  uint64_t buffer_id = opencl_get_buffer_id(arg);
+	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
+	bool isBuffer = entry ? true : false;
+	//ETMSG(OPENCL, "opencl_isClArgBuffer. buffer_id: %"PRIu64". isBuffer: %d",	buffer_id, isBuffer);
+	return isBuffer;
+}
+
+
+static void
+add_H2D_metrics_to_cct_node
+(
+	opencl_h2d_map_entry_t *entry,
+	splay_visit_t visit_type,
+	void *arg
+)
+{
+	uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
+	gpu_correlation_id_map_entry_t *cid_map_entry = 
+		gpu_correlation_id_map_lookup(correlation_id);
+	if (cid_map_entry == NULL) {
+		ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
+		return;
+	}
+	ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", 
+			"memcpy_H2D", correlation_id);
+
+	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
+	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
+	ETMSG(OPENCL, "duration [%"PRIu64", %"PRIu64"]",start_time, end_time); 
+	opencl_activity_completion_notify();
+	opencl_clSetKernelArg_activity_process(correlation_id, entry);
+	uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);
+	//opencl_h2d_map_delete(buffer_id);
+  opencl_h2d_pending_operations_adjust(-1);
+  opencl_pending_operations_adjust(-1);
+}
+
+
+static void
+opencl_add_ccts_for_setClKernelArg
+(
+	void
+)
+{
+  uint64_t count = opencl_h2d_map_count();
+	if (atomic_load(&opencl_h2d_pending_operations) > 0) {
+		opencl_update_ccts_for_h2d_nodes(add_H2D_metrics_to_cct_node);
+	}
+}
+
+
+static void
+opencl_wait_for_non_clSetKernelArg_pending_operations
+(
+  void
+)
+{
+  ETMSG(OPENCL, "pending h2D operations: %lu", 
+	  atomic_load(&opencl_h2d_pending_operations));
+  while (atomic_load(&opencl_pending_operations) != atomic_load(&opencl_h2d_pending_operations));
+}
+
+
+static void
+opencl_wait_for_all_pending_operations
 (
   void
 )
@@ -612,6 +757,7 @@ opencl_api_initialize
   }
   atomic_store(&correlation_id, 0);
   atomic_store(&opencl_pending_operations, 0);
+  atomic_store(&opencl_h2d_pending_operations, 0);
 }
 
 
@@ -830,6 +976,8 @@ clEnqueueReadBuffer
  cl_event *event
 )
 {
+  ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper");
+
   uint64_t correlation_id = getCorrelationId();
   opencl_object_t *mem_info = opencl_malloc();
   mem_info->kind = OPENCL_MEMORY_CALLBACK;
@@ -878,7 +1026,13 @@ clEnqueueWriteBuffer
  cl_event *event
 )
 {
-  uint64_t correlation_id = getCorrelationId();
+  ETMSG(OPENCL, "inside clEnqueueWriteBuffer wrapper. cl_mem buffer: %p", buffer);
+	
+	opencl_h2d_map_delete((uint64_t)buffer);
+	opencl_h2d_pending_operations_adjust(-1);
+  //opencl_pending_operations_adjust(-1);
+  
+	uint64_t correlation_id = getCorrelationId();
   opencl_object_t *mem_info = opencl_malloc();
   mem_info->kind = OPENCL_MEMORY_CALLBACK;
   cl_memory_callback_t *mem_transfer_cb = &(mem_info->details.mem_cb);
@@ -912,6 +1066,63 @@ clEnqueueWriteBuffer
 }
 
 
+cl_mem
+clCreateBuffer
+(
+ cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void* host_ptr,
+ cl_int* errcode_ret
+)
+{
+	uint64_t correlation_id = getCorrelationId();
+	opencl_h2d_pending_operations_adjust(1);
+  cl_mem buffer = 
+    HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
+  uint64_t buffer_id = (uint64_t)buffer; 
+  //ETMSG(OPENCL, "inside clCreateBuffer wrapper. cl_mem buffer: %p. buffer_id: %"PRIu64"", buffer, buffer_id);
+	opencl_h2d_map_insert(buffer_id, correlation_id, size, 0, 0);
+  
+  return buffer;
+}
+
+
+cl_int
+clSetKernelArg
+(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void* arg_value
+)
+{
+	uint64_t start_time;
+	bool isClBuffer = opencl_isClArgBuffer(arg_value);
+  //ETMSG(OPENCL, "inside clSetKernelArg wrapper. isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value);
+	if (isClBuffer) {
+		start_time = hpcrun_nanotime();
+	}
+  cl_int return_status = 
+    HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value));
+	if (!isClBuffer) {
+		return return_status;	
+	}
+  uint64_t end_time = hpcrun_nanotime();
+  uint64_t buffer_id = opencl_get_buffer_id(arg_value);
+	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
+	if (entry) {
+		size_t size = opencl_h2d_map_entry_size_get(entry);
+		uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
+		opencl_subscriber_callback(memcpy_H2D, correlation_id);
+  	opencl_h2d_map_insert(buffer_id,correlation_id, size, start_time, end_time);
+	} else {
+		// there is no clCreateBuffer being invoked for this call. dont create map entries	
+	}
+  return return_status;
+}
+
+
 void
 opencl_enable_instrumentation
 (
@@ -928,6 +1139,8 @@ opencl_api_finalize
  void *args
 )
 {
-  opencl_wait_for_pending_operations();
+	opencl_wait_for_non_clSetKernelArg_pending_operations();
+	opencl_add_ccts_for_setClKernelArg();
+  opencl_wait_for_all_pending_operations();
   gpu_application_thread_process_activities();
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
new file mode 100644
index 0000000000..cfcbfe6830
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
@@ -0,0 +1,300 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <assert.h>
+#include <string.h>
+
+
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+
+#include "opencl-h2d-map.h"
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define DEBUG 0
+
+#include "../gpu-print.h"
+
+
+#define st_insert				\
+  typed_splay_insert(correlation_id)
+
+#define st_lookup				\
+  typed_splay_lookup(correlation_id)
+
+#define st_delete				\
+  typed_splay_delete(correlation_id)
+
+#define st_forall				\
+  typed_splay_forall(correlation_id)
+
+#define st_count				\
+  typed_splay_count(correlation_id)
+
+#define st_alloc(free_list)			\
+  typed_splay_alloc(free_list, opencl_h2d_map_entry_t)
+
+#define st_free(free_list, node)		\
+  typed_splay_free(free_list, node)
+
+
+
+//*****************************************************************************
+// type declarations
+//*****************************************************************************
+
+#undef typed_splay_node
+#define typed_splay_node(correlation_id) opencl_h2d_map_entry_t
+
+typedef struct typed_splay_node(correlation_id) {
+  struct typed_splay_node(correlation_id) *left;
+  struct typed_splay_node(correlation_id) *right;
+  uint64_t buffer_id; // key
+
+  uint64_t corr_id;
+  size_t size;
+  uint64_t start_time;
+  uint64_t end_time;
+} typed_splay_node(correlation_id); 
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static opencl_h2d_map_entry_t *map_root = NULL;
+
+static opencl_h2d_map_entry_t *free_list = NULL;
+
+static spinlock_t opencl_h2d_map_lock = SPINLOCK_UNLOCKED;
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
+typed_splay_impl(correlation_id)
+
+
+static opencl_h2d_map_entry_t *
+opencl_h2d_map_entry_alloc()
+{
+  return st_alloc(&free_list);
+}
+
+
+static opencl_h2d_map_entry_t *
+opencl_h2d_map_entry_new
+(
+ uint64_t buffer_id,
+ uint64_t correlation_id,
+ size_t size,
+ uint64_t start_time,
+ uint64_t end_time
+)
+{
+  opencl_h2d_map_entry_t *e = opencl_h2d_map_entry_alloc();
+
+  e->buffer_id = buffer_id;
+  e->corr_id = correlation_id;
+  e->size = size;
+  e->start_time = start_time;
+  e->end_time = end_time;
+
+  return e;
+}
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_h2d_map_entry_t *
+opencl_h2d_map_lookup
+(
+ uint64_t buffer_id
+)
+{
+  spinlock_lock(&opencl_h2d_map_lock);
+
+  uint64_t id = buffer_id;
+  opencl_h2d_map_entry_t *result = st_lookup(&map_root, id);
+
+  spinlock_unlock(&opencl_h2d_map_lock);
+
+  return result;
+}
+
+
+void
+opencl_h2d_map_insert
+(
+ uint64_t buffer_id, 
+ uint64_t correlation_id, 
+ size_t size,
+ uint64_t start_time,
+ uint64_t end_time
+)
+{
+  spinlock_lock(&opencl_h2d_map_lock);
+
+  opencl_h2d_map_entry_t *entry = st_lookup(&map_root, buffer_id);
+  if (entry) {
+    entry->corr_id = correlation_id;
+    entry->size = size;
+    entry->start_time = start_time;
+    entry->end_time = end_time;
+  } else {
+    opencl_h2d_map_entry_t *entry = 
+      opencl_h2d_map_entry_new(buffer_id, correlation_id, size, start_time, end_time);
+
+    st_insert(&map_root, entry);
+  }
+
+  spinlock_unlock(&opencl_h2d_map_lock);
+}
+
+
+void
+opencl_h2d_map_delete
+(
+ uint64_t buffer_id
+)
+{
+  spinlock_lock(&opencl_h2d_map_lock);
+
+  opencl_h2d_map_entry_t *node = st_delete(&map_root, buffer_id);
+  st_free(&free_list, node);
+
+  spinlock_unlock(&opencl_h2d_map_lock);
+}
+
+
+uint64_t
+opencl_h2d_map_entry_buffer_id_get
+(
+ opencl_h2d_map_entry_t *entry
+)
+{
+  return entry->buffer_id;
+}
+
+
+uint64_t
+opencl_h2d_map_entry_correlation_get
+(
+ opencl_h2d_map_entry_t *entry
+)
+{
+  return entry->corr_id;
+}
+
+
+size_t
+opencl_h2d_map_entry_size_get
+(
+ opencl_h2d_map_entry_t *entry
+)
+{
+  return entry->size;
+}
+
+
+uint64_t
+opencl_h2d_map_entry_start_time_get
+(
+ opencl_h2d_map_entry_t *entry
+)
+{
+  return entry->start_time;
+}
+
+
+uint64_t
+opencl_h2d_map_entry_end_time_get
+(
+ opencl_h2d_map_entry_t *entry
+)
+{
+  return entry->end_time;
+}
+
+
+void
+opencl_update_ccts_for_h2d_nodes
+(
+ opencl_splay_fn_t fn	
+)
+{
+	st_forall(map_root, splay_inorder, fn, NULL);
+}
+
+
+
+//*****************************************************************************
+// debugging code
+//*****************************************************************************
+
+uint64_t
+opencl_h2d_map_count
+(
+ void
+)
+{
+  return st_count(map_root);
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
new file mode 100644
index 0000000000..68d9ecdc38
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
@@ -0,0 +1,149 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef opencl_h2d_map_h
+#define opencl_h2d_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stddef.h>   // size_t
+#include <stdint.h>   // uint64_t
+
+#include <lib/prof-lean/splay-uint64.h>   // splay_visit_t (NOTE(review): confirm path)
+
+//*****************************************************************************
+// type definitions
+//*****************************************************************************
+
+typedef struct opencl_h2d_map_entry_t opencl_h2d_map_entry_t;
+
+// callback type accepted by opencl_update_ccts_for_h2d_nodes
+typedef void (*opencl_splay_fn_t)
+(
+	opencl_h2d_map_entry_t *,
+	splay_visit_t,
+	void *
+);
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+// look up an entry by a uint64_t key (presumably the buffer id; confirm against the definition)
+opencl_h2d_map_entry_t *
+opencl_h2d_map_lookup
+(
+ uint64_t
+);
+
+// add an entry, or overwrite the fields of an existing one; last three args: size, start_time, end_time
+void
+opencl_h2d_map_insert
+(
+ uint64_t, 
+ uint64_t, 
+ size_t,
+ uint64_t,
+ uint64_t
+);
+
+// remove the entry keyed by the given buffer id
+void
+opencl_h2d_map_delete
+(
+ uint64_t
+);
+
+// field accessors: each returns one field of the entry it is given
+uint64_t
+opencl_h2d_map_entry_buffer_id_get
+(
+ opencl_h2d_map_entry_t *entry
+);
+
+// returns the entry's corr_id field
+uint64_t
+opencl_h2d_map_entry_correlation_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+// returns the entry's size field
+size_t
+opencl_h2d_map_entry_size_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+// returns the entry's start_time field
+uint64_t
+opencl_h2d_map_entry_start_time_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+// returns the entry's end_time field
+uint64_t
+opencl_h2d_map_entry_end_time_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+// hand fn to an in-order st_forall traversal of the map
+void
+opencl_update_ccts_for_h2d_nodes
+(
+ opencl_splay_fn_t fn	
+);
+
+// debugging aid: returns st_count of the map
+uint64_t
+opencl_h2d_map_count
+(
+ void
+);
+
+#endif
+
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index bc5e338557..79ca8bf2ba 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -181,8 +181,8 @@ METHOD_FN(finalize_event_list)
   device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_shutdown);
 
   // Register shutdown functions to write trace files
-  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
-  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
+  //device_trace_finalizer_shutdown.fn = gpu_trace_fini;
+  //device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
 }
 
 

From 13e9b86219de000b81387f26c978dd66c6073b93 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 7 Oct 2020 18:23:49 -0500
Subject: [PATCH 084/177] NOT_COMPLETE:Merging gpu-trace-demultiplexer with
 opencl_instrumentation

---
 Makefile.in                                   |   9 +
 configure                                     | 259 ++++-
 configure.ac                                  | 176 +++-
 doc/Makefile.in                               |   9 +
 doc/man/Makefile.in                           |   9 +
 doc/manual/Makefile.in                        |   9 +
 doc/www/Makefile.in                           |   9 +
 lib/Makefile.in                               |   9 +
 src/Makefile.in                               |   9 +
 src/lib/Makefile.in                           |   9 +
 src/lib/analysis/Makefile.in                  |   9 +
 src/lib/banal/Makefile.in                     |   9 +
 src/lib/binutils/Makefile.in                  |   9 +
 src/lib/isa/Makefile.in                       |   9 +
 src/lib/prof-lean/Makefile.in                 |   9 +
 src/lib/prof/Makefile.in                      |   9 +
 src/lib/profxml/Makefile.in                   |   9 +
 src/lib/stubs-gcc_s/Makefile.in               |   9 +
 src/lib/support-lean/Makefile.in              |   9 +
 src/lib/support/Makefile.in                   |   9 +
 src/lib/xml/Makefile.in                       |   9 +
 src/tool/Makefile.in                          |   9 +
 src/tool/hpcfnbounds/Makefile.in              |   9 +
 src/tool/hpcfnbounds2/Makefile.in             |   9 +
 src/tool/hpclump/Makefile.in                  |   9 +
 src/tool/hpcprof-flat/Makefile.in             |   9 +
 src/tool/hpcprof-mpi/Makefile.in              |   9 +
 src/tool/hpcprof/Makefile.in                  |   9 +
 src/tool/hpcproftt/Makefile.in                |   9 +
 src/tool/hpcrun-flat/Makefile.in              |   9 +
 src/tool/hpcrun/Makefile.am                   |  41 +-
 src/tool/hpcrun/Makefile.in                   | 731 +++++++-------
 .../gtpin-correlation-id-map.h                | 141 +++
 .../instrumentation/gtpin-instrumentation.c   | 535 ++++++++++
 .../instrumentation/gtpin-instrumentation.h   |  71 ++
 .../gpu/opencl/opencl-activity-translate.c    |  27 +-
 .../gpu/opencl/opencl-activity-translate.h    |  11 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 932 +++++++++++++++---
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  49 +-
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h   | 149 +++
 src/tool/hpcrun/gpu/opencl/opencl-intercept.c |   8 +-
 src/tool/hpcrun/sample-sources/opencl.c       |  38 +-
 src/tool/hpcrun/utilities/bgq-cnk/Makefile.in |   9 +
 src/tool/hpcserver/Makefile.in                |   9 +
 src/tool/hpcserver/mpi/Makefile.in            |   9 +
 src/tool/hpcstruct/Makefile.in                |   9 +
 src/tool/hpctracedump/Makefile.in             |   9 +
 src/tool/misc/Makefile.in                     |   9 +
 src/tool/xprof/Makefile.in                    |   9 +
 49 files changed, 2919 insertions(+), 564 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
 create mode 100644 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h

diff --git a/Makefile.in b/Makefile.in
index 30c8dee7ae..f9c4ceb6c5 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -341,9 +341,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/configure b/configure
index 9f02f01f5a..8ed07fae0f 100755
--- a/configure
+++ b/configure
@@ -657,9 +657,24 @@ OPT_ROCM_IFLAGS
 OPT_ROCM
 OPT_ENABLE_ROCM_FALSE
 OPT_ENABLE_ROCM_TRUE
+OPT_GTPIN_LDFLAGS
+OPT_GTPIN_IFLAGS
+OPT_GTPIN
+OPT_ENABLE_GTPIN_FALSE
+OPT_ENABLE_GTPIN_TRUE
+OPT_METRICS_DISCOVERY_LDFLAGS
+OPT_METRICS_DISCOVERY_IFLAGS
+OPT_METRICS_DISCOVERY
+OPT_ENABLE_METRICS_DISCOVERY_FALSE
+OPT_ENABLE_METRICS_DISCOVERY_TRUE
+OPT_IGC_LDFLAGS
+OPT_IGC_IFLAGS
+OPT_IGC
+OPT_ENABLE_IGC_FALSE
+OPT_ENABLE_IGC_TRUE
 OPT_OPENCL_IFLAGS
-ENABLE_OPENCL_FALSE
-ENABLE_OPENCL_TRUE
+OPT_ENABLE_OPENCL_FALSE
+OPT_ENABLE_OPENCL_TRUE
 OPT_CUPTI_LDFLAGS
 OPT_CUPTI_IFLAGS
 OPT_CUPTI
@@ -1059,6 +1074,9 @@ enable_xop
 with_cuda
 with_cupti
 with_opencl
+with_igc
+with_metrics_discovery
+with_gtpin
 with_rocm
 with_level0
 enable_data_centric_tracing
@@ -1812,6 +1830,10 @@ Optional Packages:
   --with-cupti=PATH       path to cupti install directory, default is from
                           cuda
   --with-opencl=PATH      path to opencl headers
+  --with-igc=PATH         path to igc install directory
+  --with-metrics-discovery=PATH
+                          path to metrics-discovery install directory
+  --with-gtpin=PATH       path to gtpin install directory
   --with-rocm=PATH        use given ROCM installation (absolute path) with
                           hpcrun (default is NO)
   --with-level0=PATH      use given Level Zero installation (absolute path)
@@ -23594,12 +23616,15 @@ if test "${with_opencl+set}" = set; then :
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for opencl" >&5
+$as_echo_n "checking for opencl... " >&6; }
+
 case "$OPENCL" in
   /* )
-    if test ! -f "${OPENCL}/CL/cl.h" ; then
+    if test ! -f "${OPENCL}/include/CL/cl.h" ; then
       as_fn_error $? "unable to find CL/cl.h in: $OPENCL" "$LINENO" 5
     else
-      OPT_OPENCL_IFLAGS="-I${OPENCL}"
+      OPT_OPENCL_IFLAGS="-I${OPENCL}/include"
       OPT_HAVE_OPENCL=yes
     fi
     ;;
@@ -23620,14 +23645,210 @@ $as_echo "$as_me: The <CL/cl.h> header file is available." >&6;}
 fi
 
  if test "$OPT_HAVE_OPENCL" = yes; then
-  ENABLE_OPENCL_TRUE=
-  ENABLE_OPENCL_FALSE='#'
+  OPT_ENABLE_OPENCL_TRUE=
+  OPT_ENABLE_OPENCL_FALSE='#'
+else
+  OPT_ENABLE_OPENCL_TRUE='#'
+  OPT_ENABLE_OPENCL_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $OPENCL" >&5
+$as_echo "$OPENCL" >&6; }
+
+
+
+#-------------------------------------------------
+# Option: --with-igc=PATH
+#-------------------------------------------------
+
+OPT_HAVE_IGC=no
+OPT_IGC_IFLAGS=
+OPT_IGC_LDFLAGS=
+
+
+# Check whether --with-igc was given.
+if test "${with_igc+set}" = set; then :
+  withval=$with_igc; IGC="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for igc" >&5
+$as_echo_n "checking for igc... " >&6; }
+
+case "$IGC" in
+  /* )
+    if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then
+      as_fn_error $? "unable to find igc.opencl.h in: $IGC" "$LINENO" 5
+    fi
+    OPT_IGC_IFLAGS="-I${IGC}/include"
+    # clear both so the emptiness tests below cannot see inherited values
+    IGC_LDFLAGS=
+    IGA_LDFLAGS=
+
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libigc.so" ; then
+        IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc"
+        break
+      fi
+    done
+    if test "x$IGC_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libigc.so in: $IGC" "$LINENO" 5
+    fi
+
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libiga64.so" ; then
+        IGA_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -liga64"
+        break
+      fi
+    done
+    if test "x$IGA_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libiga64.so in: $IGC" "$LINENO" 5
+    fi
+
+    OPT_IGC_LDFLAGS="${IGC_LDFLAGS} ${IGA_LDFLAGS}"
+
+    OPT_HAVE_IGC=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "igc directory must be absolute path: $IGC" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_IGC" = yes; then
+  OPT_ENABLE_IGC_TRUE=
+  OPT_ENABLE_IGC_FALSE='#'
+else
+  OPT_ENABLE_IGC_TRUE='#'
+  OPT_ENABLE_IGC_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $IGC" >&5
+$as_echo "$IGC" >&6; }
+
+
+
+
+
+#-------------------------------------------------
+# Option: --with-metrics-discovery=PATH
+#-------------------------------------------------
+
+OPT_HAVE_METRICS_DISCOVERY=no
+OPT_METRICS_DISCOVERY_IFLAGS=
+OPT_METRICS_DISCOVERY_LDFLAGS=
+
+
+# Check whether --with-metrics-discovery was given.
+if test "${with_metrics_discovery+set}" = set; then :
+  withval=$with_metrics_discovery; METRICS_DISCOVERY="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for metrics-discovery" >&5
+$as_echo_n "checking for metrics-discovery... " >&6; }
+
+case "$METRICS_DISCOVERY" in
+  /* )
+    if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then
+      as_fn_error $? "unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY" "$LINENO" 5
+    fi
+    OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include"
+
+    for lib in $multilib_path ; do
+      if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then
+        OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd"
+        break
+      fi
+    done
+    if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libmd.so in: $METRICS_DISCOVERY" "$LINENO" 5
+    fi
+    OPT_HAVE_METRICS_DISCOVERY=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "metrics-discovery directory must be absolute path: $METRICS_DISCOVERY" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_METRICS_DISCOVERY" = yes; then
+  OPT_ENABLE_METRICS_DISCOVERY_TRUE=
+  OPT_ENABLE_METRICS_DISCOVERY_FALSE='#'
+else
+  OPT_ENABLE_METRICS_DISCOVERY_TRUE='#'
+  OPT_ENABLE_METRICS_DISCOVERY_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $METRICS_DISCOVERY" >&5
+$as_echo "$METRICS_DISCOVERY" >&6; }
+
+
+
+
+
+#-------------------------------------------------
+# Option: --with-gtpin=PATH
+#-------------------------------------------------
+
+OPT_HAVE_GTPIN=no
+OPT_GTPIN_IFLAGS=
+OPT_GTPIN_LDFLAGS=
+
+
+# Check whether --with-gtpin was given.
+if test "${with_gtpin+set}" = set; then :
+  withval=$with_gtpin; GTPIN="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for gtpin" >&5
+$as_echo_n "checking for gtpin... " >&6; }
+
+case "$GTPIN" in
+  /* )
+    if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then
+      as_fn_error $? "unable to find gtpin.h in: $GTPIN" "$LINENO" 5
+    fi
+    OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/"
+
+    if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then
+      OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin"
+    fi
+
+    if test "x$OPT_GTPIN_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libgtpin.so in: $GTPIN" "$LINENO" 5
+    fi
+    OPT_HAVE_GTPIN=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "gtpin directory must be absolute path: $GTPIN" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_GTPIN" = yes; then
+  OPT_ENABLE_GTPIN_TRUE=
+  OPT_ENABLE_GTPIN_FALSE='#'
 else
-  ENABLE_OPENCL_TRUE='#'
-  ENABLE_OPENCL_FALSE=
+  OPT_ENABLE_GTPIN_TRUE='#'
+  OPT_ENABLE_GTPIN_FALSE=
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $GTPIN" >&5
+$as_echo "$GTPIN" >&6; }
+
+
+
+
+
 
 #-------------------------------------------------
 # Option: --with-rocm=PATH
@@ -24323,8 +24544,20 @@ if test -z "${OPT_ENABLE_CUPTI_TRUE}" && test -z "${OPT_ENABLE_CUPTI_FALSE}"; th
   as_fn_error $? "conditional \"OPT_ENABLE_CUPTI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${ENABLE_OPENCL_TRUE}" && test -z "${ENABLE_OPENCL_FALSE}"; then
-  as_fn_error $? "conditional \"ENABLE_OPENCL\" was never defined.
+if test -z "${OPT_ENABLE_OPENCL_TRUE}" && test -z "${OPT_ENABLE_OPENCL_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_OPENCL\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_IGC_TRUE}" && test -z "${OPT_ENABLE_IGC_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_IGC\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_METRICS_DISCOVERY_TRUE}" && test -z "${OPT_ENABLE_METRICS_DISCOVERY_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_METRICS_DISCOVERY\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_GTPIN_TRUE}" && test -z "${OPT_ENABLE_GTPIN_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_ENABLE_GTPIN\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${OPT_ENABLE_ROCM_TRUE}" && test -z "${OPT_ENABLE_ROCM_FALSE}"; then
@@ -26989,6 +27222,12 @@ $as_echo "$as_me:   zlib:         ${ZLIB}" >&6;}
 $as_echo "$as_me:   cuda:         ${CUDA}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   cupti:        ${CUPTI}" >&5
 $as_echo "$as_me:   cupti:        ${CUPTI}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   igc:          ${IGC}" >&5
+$as_echo "$as_me:   igc:          ${IGC}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   gtpin:        ${GTPIN}" >&5
+$as_echo "$as_me:   gtpin:        ${GTPIN}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   metrics-discovery: ${METRICS_DISCOVERY}" >&5
+$as_echo "$as_me:   metrics-discovery: ${METRICS_DISCOVERY}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   papi-c-cupti: ${use_papi_c_cupti}" >&5
 $as_echo "$as_me:   papi-c-cupti: ${use_papi_c_cupti}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   valgrind:     ${VALGRIND}" >&5
diff --git a/configure.ac b/configure.ac
index a7a5023d3a..82c5ec27c8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4721,12 +4721,14 @@ AC_ARG_WITH([opencl],
   [OPENCL="$withval"],
   [])
 
+AC_MSG_CHECKING([for opencl])
+
 case "$OPENCL" in
   /* )
-    if test ! -f "${OPENCL}/CL/cl.h" ; then
+    if test ! -f "${OPENCL}/include/CL/cl.h" ; then
       AC_MSG_ERROR([unable to find CL/cl.h in: $OPENCL])
     else
-      OPT_OPENCL_IFLAGS="-I${OPENCL}"
+      OPT_OPENCL_IFLAGS="-I${OPENCL}/include"
       OPT_HAVE_OPENCL=yes
     fi
     ;;
@@ -4745,9 +4747,174 @@ if test "$OPT_HAVE_OPENCL" = yes ; then
   AC_MSG_NOTICE([The <CL/cl.h> header file is available.])
 fi
 
-AM_CONDITIONAL([ENABLE_OPENCL], [test "$OPT_HAVE_OPENCL" = yes])
+AM_CONDITIONAL([OPT_ENABLE_OPENCL], [test "$OPT_HAVE_OPENCL" = yes])
+
+AC_MSG_RESULT([$OPENCL])
+
 AC_SUBST([OPT_OPENCL_IFLAGS])
 
+#-------------------------------------------------
+# Option: --with-igc=PATH
+#-------------------------------------------------
+
+OPT_HAVE_IGC=no
+OPT_IGC_IFLAGS=
+OPT_IGC_LDFLAGS=
+
+AC_ARG_WITH([igc],
+  [AS_HELP_STRING([--with-igc=PATH],
+      [path to igc install directory])],
+  [IGC="$withval"],
+  [])
+
+AC_MSG_CHECKING([for igc])
+
+case "$IGC" in
+  /* )
+    if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then
+      AC_MSG_ERROR([unable to find igc.opencl.h in: $IGC])
+    fi
+    OPT_IGC_IFLAGS="-I${IGC}/include"
+    # clear both so the emptiness tests below cannot see inherited values
+    IGC_LDFLAGS=
+    IGA_LDFLAGS=
+
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libigc.so" ; then
+        IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc"
+        break
+      fi
+    done
+    if test "x$IGC_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libigc.so in: $IGC])
+    fi
+
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libiga64.so" ; then
+        IGA_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -liga64"
+        break
+      fi
+    done
+    if test "x$IGA_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libiga64.so in: $IGC])
+    fi
+
+    OPT_IGC_LDFLAGS="${IGC_LDFLAGS} ${IGA_LDFLAGS}"
+
+    OPT_HAVE_IGC=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([igc directory must be absolute path: $IGC])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_IGC], [test "$OPT_HAVE_IGC" = yes])
+
+AC_MSG_RESULT([$IGC])
+
+AC_SUBST([OPT_IGC])
+AC_SUBST([OPT_IGC_IFLAGS])
+AC_SUBST([OPT_IGC_LDFLAGS])
+
+#-------------------------------------------------
+# Option: --with-metrics-discovery=PATH
+#-------------------------------------------------
+
+OPT_HAVE_METRICS_DISCOVERY=no
+OPT_METRICS_DISCOVERY_IFLAGS=
+OPT_METRICS_DISCOVERY_LDFLAGS=
+
+AC_ARG_WITH([metrics-discovery],
+  [AS_HELP_STRING([--with-metrics-discovery=PATH],
+      [path to metrics-discovery install directory])],
+  [METRICS_DISCOVERY="$withval"],
+  [])
+
+AC_MSG_CHECKING([for metrics-discovery])
+
+case "$METRICS_DISCOVERY" in
+  /* )
+    if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then
+      AC_MSG_ERROR([unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY])
+    fi
+    OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include"
+
+    for lib in $multilib_path ; do
+      if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then
+        OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd"
+        break
+      fi
+    done
+    if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libmd.so in: $METRICS_DISCOVERY])
+    fi
+    OPT_HAVE_METRICS_DISCOVERY=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([metrics-discovery directory must be absolute path: $METRICS_DISCOVERY])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_METRICS_DISCOVERY], [test "$OPT_HAVE_METRICS_DISCOVERY" = yes])
+
+AC_MSG_RESULT([$METRICS_DISCOVERY])
+
+AC_SUBST([OPT_METRICS_DISCOVERY])
+AC_SUBST([OPT_METRICS_DISCOVERY_IFLAGS])
+AC_SUBST([OPT_METRICS_DISCOVERY_LDFLAGS])
+
+#-------------------------------------------------
+# Option: --with-gtpin=PATH
+#-------------------------------------------------
+
+OPT_HAVE_GTPIN=no
+OPT_GTPIN_IFLAGS=
+OPT_GTPIN_LDFLAGS=
+
+AC_ARG_WITH([gtpin],
+  [AS_HELP_STRING([--with-gtpin=PATH],
+      [path to gtpin install directory])],
+  [GTPIN="$withval"],
+  [])
+
+AC_MSG_CHECKING([for gtpin])
+
+case "$GTPIN" in
+  /* )
+    if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then
+      AC_MSG_ERROR([unable to find gtpin.h in: $GTPIN])
+    fi
+    OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/"
+
+    if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then
+      OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin"
+    fi
+
+    if test "x$OPT_GTPIN_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libgtpin.so in: $GTPIN])
+    fi
+    OPT_HAVE_GTPIN=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([gtpin directory must be absolute path: $GTPIN])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_GTPIN], [test "$OPT_HAVE_GTPIN" = yes])
+
+AC_MSG_RESULT([$GTPIN])
+
+AC_SUBST([OPT_GTPIN])
+AC_SUBST([OPT_GTPIN_IFLAGS])
+AC_SUBST([OPT_GTPIN_LDFLAGS])
+
+
 #-------------------------------------------------
 # Option: --with-rocm=PATH
 #-------------------------------------------------
@@ -5183,6 +5350,9 @@ AC_MSG_NOTICE([  xerces:       ${XERCES}])
 AC_MSG_NOTICE([  zlib:         ${ZLIB}])
 AC_MSG_NOTICE([  cuda:         ${CUDA}])
 AC_MSG_NOTICE([  cupti:        ${CUPTI}])
+AC_MSG_NOTICE([  igc:          ${IGC}])
+AC_MSG_NOTICE([  gtpin:        ${GTPIN}])
+AC_MSG_NOTICE([  metrics-discovery: ${METRICS_DISCOVERY}])
 AC_MSG_NOTICE([  papi-c-cupti: ${use_papi_c_cupti}])
 AC_MSG_NOTICE([  valgrind:     ${VALGRIND}])
 AC_MSG_NOTICE([  valgrind:     annotated: ${OPT_ENABLE_VG_ANNOTATIONS}])
diff --git a/doc/Makefile.in b/doc/Makefile.in
index dc8e3b5cc2..a20ab2984d 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -322,9 +322,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in
index a5f28dbe57..3b28c735e1 100644
--- a/doc/man/Makefile.in
+++ b/doc/man/Makefile.in
@@ -296,9 +296,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/doc/manual/Makefile.in b/doc/manual/Makefile.in
index 01986c8ee7..caaa631d57 100644
--- a/doc/manual/Makefile.in
+++ b/doc/manual/Makefile.in
@@ -293,9 +293,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/doc/www/Makefile.in b/doc/www/Makefile.in
index 3db6ccc8ed..864f7cd0e8 100644
--- a/doc/www/Makefile.in
+++ b/doc/www/Makefile.in
@@ -293,9 +293,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 5139597fea..1bf14dd93a 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -292,9 +292,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/Makefile.in b/src/Makefile.in
index 488f2d163c..d0cc63b65c 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -322,9 +322,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/Makefile.in b/src/lib/Makefile.in
index 3d501a0900..3194f6268d 100644
--- a/src/lib/Makefile.in
+++ b/src/lib/Makefile.in
@@ -334,9 +334,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/analysis/Makefile.in b/src/lib/analysis/Makefile.in
index bd19270aa4..90a520bec7 100644
--- a/src/lib/analysis/Makefile.in
+++ b/src/lib/analysis/Makefile.in
@@ -370,9 +370,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index b36a253116..0544628b35 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -365,9 +365,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index d888152799..7eac0de1b7 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -376,9 +376,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/isa/Makefile.in b/src/lib/isa/Makefile.in
index 8ced9a5aff..e1c22f050e 100644
--- a/src/lib/isa/Makefile.in
+++ b/src/lib/isa/Makefile.in
@@ -363,9 +363,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/prof-lean/Makefile.in b/src/lib/prof-lean/Makefile.in
index 88a725cbea..a859751bb5 100644
--- a/src/lib/prof-lean/Makefile.in
+++ b/src/lib/prof-lean/Makefile.in
@@ -366,9 +366,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/prof/Makefile.in b/src/lib/prof/Makefile.in
index f0c6d1532c..a64b4d4eff 100644
--- a/src/lib/prof/Makefile.in
+++ b/src/lib/prof/Makefile.in
@@ -373,9 +373,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/profxml/Makefile.in b/src/lib/profxml/Makefile.in
index 1e9a4d8c0c..a5ee9980e7 100644
--- a/src/lib/profxml/Makefile.in
+++ b/src/lib/profxml/Makefile.in
@@ -368,9 +368,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/stubs-gcc_s/Makefile.in b/src/lib/stubs-gcc_s/Makefile.in
index 5ee5bbf912..38b2ac32dd 100644
--- a/src/lib/stubs-gcc_s/Makefile.in
+++ b/src/lib/stubs-gcc_s/Makefile.in
@@ -346,9 +346,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/support-lean/Makefile.in b/src/lib/support-lean/Makefile.in
index 4947727127..a527163052 100644
--- a/src/lib/support-lean/Makefile.in
+++ b/src/lib/support-lean/Makefile.in
@@ -352,9 +352,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/support/Makefile.in b/src/lib/support/Makefile.in
index 0f4038a779..04ccf50066 100644
--- a/src/lib/support/Makefile.in
+++ b/src/lib/support/Makefile.in
@@ -381,9 +381,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/lib/xml/Makefile.in b/src/lib/xml/Makefile.in
index 970502f954..c84dfd0f73 100644
--- a/src/lib/xml/Makefile.in
+++ b/src/lib/xml/Makefile.in
@@ -365,9 +365,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/Makefile.in b/src/tool/Makefile.in
index 545ef9b2a9..2fb762ac42 100644
--- a/src/tool/Makefile.in
+++ b/src/tool/Makefile.in
@@ -339,9 +339,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcfnbounds/Makefile.in b/src/tool/hpcfnbounds/Makefile.in
index dfe3a93fa0..0ae4b5aa47 100644
--- a/src/tool/hpcfnbounds/Makefile.in
+++ b/src/tool/hpcfnbounds/Makefile.in
@@ -449,9 +449,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcfnbounds2/Makefile.in b/src/tool/hpcfnbounds2/Makefile.in
index 46f9f0ce7f..8fd59fe54a 100644
--- a/src/tool/hpcfnbounds2/Makefile.in
+++ b/src/tool/hpcfnbounds2/Makefile.in
@@ -347,9 +347,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpclump/Makefile.in b/src/tool/hpclump/Makefile.in
index 37fe2a0159..34792ea7be 100644
--- a/src/tool/hpclump/Makefile.in
+++ b/src/tool/hpclump/Makefile.in
@@ -380,9 +380,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcprof-flat/Makefile.in b/src/tool/hpcprof-flat/Makefile.in
index ad27eb7012..5f6ab64282 100644
--- a/src/tool/hpcprof-flat/Makefile.in
+++ b/src/tool/hpcprof-flat/Makefile.in
@@ -415,9 +415,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcprof-mpi/Makefile.in b/src/tool/hpcprof-mpi/Makefile.in
index 698b145708..d00867fd0f 100644
--- a/src/tool/hpcprof-mpi/Makefile.in
+++ b/src/tool/hpcprof-mpi/Makefile.in
@@ -415,9 +415,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcprof/Makefile.in b/src/tool/hpcprof/Makefile.in
index dcee5fd1ba..0a538fb755 100644
--- a/src/tool/hpcprof/Makefile.in
+++ b/src/tool/hpcprof/Makefile.in
@@ -412,9 +412,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcproftt/Makefile.in b/src/tool/hpcproftt/Makefile.in
index 90e9904509..e54666297b 100644
--- a/src/tool/hpcproftt/Makefile.in
+++ b/src/tool/hpcproftt/Makefile.in
@@ -415,9 +415,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcrun-flat/Makefile.in b/src/tool/hpcrun-flat/Makefile.in
index afca940de1..a41f83aac7 100644
--- a/src/tool/hpcrun-flat/Makefile.in
+++ b/src/tool/hpcrun-flat/Makefile.in
@@ -410,9 +410,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index d1a1cd5c2f..55ba8a02cc 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -511,20 +511,23 @@ MY_CUPTI_FILES = sample-sources/nvidia.c	\
 	gpu/nvidia/cupti-gpu-api.c		
 endif
 
-if ENABLE_OPENCL
-#MY_OPENCL_FILES =
-MY_BASE_FILES += \
-	sample-sources/opencl.c 				\
-	gpu/opencl/opencl-intercept.c			\
-	gpu/opencl/opencl-api.c					\
-	gpu/opencl/opencl-memory-manager.c  	\
-	gpu/opencl/opencl-activity-translate.c
+if OPT_ENABLE_OPENCL
+MY_OPENCL_FILES = sample-sources/opencl.c \
+	gpu/opencl/opencl-api.c \
+	gpu/opencl/opencl-memory-manager.c \
+	gpu/opencl/opencl-activity-translate.c \
+	gpu/opencl/opencl-h2d-map.c 
 endif
 
-
+if OPT_ENABLE_GTPIN
+MY_GTPIN_FILES = \
+	gpu/instrumentation/kernel-data-map.c \
+	gpu/instrumentation/gtpin-instrumentation.c \
+	gpu/instrumentation/gtpin-correlation-id-map.c
+endif
 
 if OPT_ENABLE_ROCM
-MY_ROCM_FILES=\
+MY_ROCM_FILES =\
 	sample-sources/amd.c \
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c \
@@ -534,7 +537,7 @@ endif
 
 if OPT_ENABLE_LEVEL0
 MY_LEVEL0_FILES=\
-	sample-sources/level0.c \	
+	sample-sources/level0.c \
 	gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -989,12 +992,24 @@ if OPT_ENABLE_LEVEL0
 
   MY_CPP_DEFINES  += -DHPCRUN_SS_LEVEL0
 endif
-if ENABLE_OPENCL
-  libhpcrun_la_CFLAGS += $(OPENCL_IFLAGS)
+
+if OPT_ENABLE_OPENCL
+  libhpcrun_la_SOURCES  += $(MY_OPENCL_FILES)
+  libhpcrun_la_CPPFLAGS += -DENABLE_OPENCL
+  libhpcrun_la_CFLAGS   += $(OPT_OPENCL_IFLAGS)
 
   MY_CPP_DEFINES  += -DHPCRUN_SS_OPENCL
 endif 
 
+if OPT_ENABLE_GTPIN
+  libhpcrun_la_SOURCES  += $(MY_GTPIN_FILES)
+  libhpcrun_la_CPPFLAGS += -DENABLE_GTPIN
+  libhpcrun_la_CFLAGS   += $(OPT_GTPIN_IFLAGS)
+  libhpcrun_la_LDFLAGS  += $(OPT_GTPIN_LDFLAGS)
+
+  MY_CPP_DEFINES  += -DHPCRUN_SS_GTPIN
+endif
+
 if UNW_LIBUNW
   UNW_SOURCE_FILES = $(UNW_LIBUNW_FILES)
   UNW_INCLUDE_DIRS = $(UNW_LIBUNW_INCLUDE_DIRS)
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 73208f5c76..8a7caef95f 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -166,34 +166,26 @@ pkglibexec_PROGRAMS =
 @OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_15 = sample-sources/perf/kernel_blocking_stub.c
 @OPT_PAPI_CUPTI_TRUE@am__append_16 = sample-sources/papi-c-cupti.c
 
-#MY_OPENCL_FILES =
-@ENABLE_OPENCL_TRUE@am__append_17 = \
-@ENABLE_OPENCL_TRUE@	sample-sources/opencl.c 				\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-intercept.c			\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c					\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c  	\
-@ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c
-
-
 #
 # BG/Q backend requires special treatment to avoid deadlocks
 #
-@OPT_BGQ_BACKEND_TRUE@am__append_18 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
+@OPT_BGQ_BACKEND_TRUE@am__append_17 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
+@OPT_BGQ_BACKEND_TRUE@am__append_18 = -I$(srcdir)/utilities/bgq-cnk
 @OPT_BGQ_BACKEND_TRUE@am__append_19 = -I$(srcdir)/utilities/bgq-cnk
-@OPT_BGQ_BACKEND_TRUE@am__append_20 = -I$(srcdir)/utilities/bgq-cnk
+@OPT_ENABLE_MPI_WRAP_TRUE@am__append_20 = mpi-overrides.c
 @OPT_ENABLE_MPI_WRAP_TRUE@am__append_21 = mpi-overrides.c
-@OPT_ENABLE_MPI_WRAP_TRUE@am__append_22 = mpi-overrides.c
+@OPT_BGQ_BACKEND_TRUE@am__append_22 = utilities/bgq-cnk/libhardware-thread-id.la
 @OPT_BGQ_BACKEND_TRUE@am__append_23 = utilities/bgq-cnk/libhardware-thread-id.la
-@OPT_BGQ_BACKEND_TRUE@am__append_24 = utilities/bgq-cnk/libhardware-thread-id.la
 
 #  libhpcrun_o_LDFLAGS   += $(ZLIB_HPCLINK_LIB)
 
 #-----------------------------------------------------------
 # whirled peas
 #-----------------------------------------------------------
-@HOST_OS_LINUX_TRUE@am__append_25 = $(MY_LINUX_DYNAMIC_FILES)
+@HOST_OS_LINUX_TRUE@am__append_24 = $(MY_LINUX_DYNAMIC_FILES)
+@HOST_CPU_MIPS_TRUE@am__append_25 = $(MY_MIPS_FILES)
 @HOST_CPU_MIPS_TRUE@am__append_26 = $(MY_MIPS_FILES)
-@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_FILES)
+@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_28 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_29 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_30 = $(MY_MIPS_INCLUDE_DIRS)
@@ -204,15 +196,15 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_MIPS_TRUE@am__append_35 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_36 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_37 = $(MY_MIPS_INCLUDE_DIRS)
-@HOST_CPU_MIPS_TRUE@am__append_38 = $(MY_MIPS_INCLUDE_DIRS)
 
 # Note: setting CCASFLAGS here is a no-op hack with the side effect of
 # prefixing the tramp.s file names so they will be compiled separately
 # for .o and .so targets.  CFLAGS does this for the .c files, but
 # CFLAGS doesn't apply to .s files.  See the automake docs section
 # 8.3.9.2, Objects created with both libtool and without.
+@HOST_CPU_PPC_TRUE@am__append_38 = $(MY_PPC_FILES)
 @HOST_CPU_PPC_TRUE@am__append_39 = $(MY_PPC_FILES)
-@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_FILES)
+@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_41 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_42 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_43 = $(MY_PPC_INCLUDE_DIRS)
@@ -225,13 +217,13 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_PPC_TRUE@am__append_50 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_51 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_52 = $(MY_PPC_INCLUDE_DIRS)
-@HOST_CPU_PPC_TRUE@am__append_53 = $(MY_PPC_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_53 = $(MY_X86_FILES)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_54 = $(MY_X86_FILES)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_FILES)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_56 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCRUN_LIBS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(XED2_HPCLINK_LIBS) 
+@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(XED2_HPCRUN_LIBS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCLINK_LIBS) 
+@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_60 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_61 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_62 = $(MY_X86_INCLUDE_DIRS)
@@ -243,9 +235,9 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_X86_FAMILY_TRUE@am__append_68 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_69 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_70 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_71 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_IA64_TRUE@am__append_71 = $(MY_IA64_FILES)
 @HOST_CPU_IA64_TRUE@am__append_72 = $(MY_IA64_FILES)
-@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_FILES)
+@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_74 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_75 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_76 = $(MY_IA64_INCLUDE_DIRS)
@@ -256,9 +248,9 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_IA64_TRUE@am__append_81 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_82 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_83 = $(MY_IA64_INCLUDE_DIRS)
-@HOST_CPU_IA64_TRUE@am__append_84 = $(MY_IA64_INCLUDE_DIRS)
+@HOST_CPU_AARCH64_TRUE@am__append_84 = $(MY_AARCH64_FILES)
 @HOST_CPU_AARCH64_TRUE@am__append_85 = $(MY_AARCH64_FILES)
-@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_FILES)
+@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_87 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_88 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_89 = $(MY_AARCH64_INCLUDE_DIRS)
@@ -271,44 +263,50 @@ pkglibexec_PROGRAMS =
 @HOST_CPU_AARCH64_TRUE@am__append_96 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_97 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_98 = $(MY_AARCH64_INCLUDE_DIRS)
-@HOST_CPU_AARCH64_TRUE@am__append_99 = $(MY_AARCH64_INCLUDE_DIRS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(MY_PAPI_FILES)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_INC_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = $(PAPI_LD_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_103 = -DHPCRUN_SS_PAPI
+@OPT_PAPI_DYNAMIC_TRUE@am__append_99 = $(MY_PAPI_FILES)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(PAPI_INC_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_LD_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_CUPTI_TRUE@am__append_103 = $(MY_CUPTI_FILES)
 @OPT_ENABLE_CUPTI_TRUE@am__append_104 = $(MY_CUPTI_FILES)
-@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(MY_CUPTI_FILES)
-@OPT_ENABLE_CUPTI_TRUE@am__append_106 = $(CUPTI_INC_FLGS)
-@OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA
-@OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS)
-@OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI
-@OPT_PAPI_STATIC_TRUE@am__append_110 = $(MY_PAPI_FILES)
-@OPT_PAPI_STATIC_TRUE@am__append_111 = $(PAPI_INC_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_112 = $(PAPI_LD_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_113 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(CUPTI_INC_FLGS)
+@OPT_ENABLE_CUPTI_TRUE@am__append_106 = -DHPCRUN_SS_NVIDIA
+@OPT_PAPI_CUPTI_TRUE@am__append_107 = $(CUPTI_INC_FLGS)
+@OPT_PAPI_CUPTI_TRUE@am__append_108 = -DHPCRUN_SS_PAPI_C_CUPTI
+@OPT_PAPI_STATIC_TRUE@am__append_109 = $(MY_PAPI_FILES)
+@OPT_PAPI_STATIC_TRUE@am__append_110 = $(PAPI_INC_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_111 = $(PAPI_LD_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_112 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_UPC_TRUE@am__append_113 = $(MY_UPC_FILES)
 @OPT_ENABLE_UPC_TRUE@am__append_114 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_115 = $(MY_UPC_FILES)
+@OPT_ENABLE_UPC_TRUE@am__append_115 = $(OPT_UPC_IFLAGS)
 @OPT_ENABLE_UPC_TRUE@am__append_116 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_LDFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_LDFLAGS)
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_118 = -DLUSH_PTHREADS
 @OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_119 = -DLUSH_PTHREADS
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_120 = -DLUSH_PTHREADS
-@OPT_ENABLE_CUDA_TRUE@am__append_121 = $(MY_CUDA_FILES)
-@OPT_ENABLE_CUDA_TRUE@am__append_122 = -DENABLE_CUDA
-@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(OPT_CUDA_IFLAGS)
-@OPT_ENABLE_CUDA_TRUE@am__append_124 = $(MY_CUDA_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_125 = $(MY_ROCM_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_126 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(OPT_ROCM_IFLAGS)
-@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DHPCRUN_SS_AMD
-@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = $(MY_LEVEL0_FILES)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = -DENABLE_LEVEL0
-@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(OPT_LEVEL0_IFLAGS)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DHPCRUN_SS_LEVEL0
-@ENABLE_OPENCL_TRUE@am__append_133 = $(OPENCL_IFLAGS)
-@ENABLE_OPENCL_TRUE@am__append_134 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_135 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_136 = libagent-pthread.la \
+@OPT_ENABLE_CUDA_TRUE@am__append_120 = $(MY_CUDA_FILES)
+@OPT_ENABLE_CUDA_TRUE@am__append_121 = -DENABLE_CUDA
+@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(MY_CUDA_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_124 = $(MY_ROCM_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_125 = -DENABLE_ROCM
+@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(OPT_ROCM_IFLAGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DHPCRUN_SS_AMD
+@OPT_ENABLE_LEVEL0_TRUE@am__append_128 = $(MY_LEVEL0_FILES)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = -DENABLE_LEVEL0
+@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(OPT_LEVEL0_IFLAGS)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DHPCRUN_SS_LEVEL0
+@OPT_ENABLE_OPENCL_TRUE@am__append_132 = $(MY_OPENCL_FILES)
+@OPT_ENABLE_OPENCL_TRUE@am__append_133 = -DENABLE_OPENCL
+@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(OPT_OPENCL_IFLAGS)
+@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_GTPIN_TRUE@am__append_136 = $(MY_GTPIN_FILES)
+@OPT_ENABLE_GTPIN_TRUE@am__append_137 = -DENABLE_GTPIN
+@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(OPT_GTPIN_IFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(OPT_GTPIN_LDFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DHPCRUN_SS_GTPIN
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_141 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_142 = libagent-pthread.la \
 @OPT_ENABLE_LUSH_TRUE@	libagent-tbb.la
 subdir = src/tool/hpcrun
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -450,7 +448,7 @@ libagent_tbb_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
 	-o $@
 @OPT_ENABLE_LUSH_TRUE@am_libagent_tbb_la_rpath = -rpath $(pkglibdir)
 libhpcrun_la_DEPENDENCIES = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_23)
+	$(am__append_22)
 am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	disabled.c closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -518,9 +516,6 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/perf/perfmon-util-dummy.c \
 	sample-sources/perf/kernel_blocking.c \
 	sample-sources/perf/kernel_blocking_stub.c \
-	sample-sources/opencl.c gpu/opencl/opencl-intercept.c \
-	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
-	gpu/opencl/opencl-activity-translate.c \
 	fnbounds/fnbounds_client.c fnbounds/fnbounds_dynamic.c \
 	monitor-exts/openmp.c hpcrun_dlfns.c custom-init-dynamic.c \
 	os/linux/dylib.c unwind/common/default_validation_summary.c \
@@ -546,11 +541,17 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-api.c gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
 	gpu/level0/level0-data-node.c gpu/level0/level0-event-map.c \
-	gpu/level0/level0-handle-map.c unwind/common/backtrace.c \
-	unwind/common/unw-throw.c unwind/common/binarytree_uwi.c \
-	unwind/common/interval_t.c unwind/common/libunw_intervals.c \
-	unwind/common/stack_troll.c unwind/common/uw_hash.c \
-	unwind/common/uw_recipe_map.c \
+	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
+	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
+	gpu/opencl/opencl-activity-translate.c \
+	gpu/opencl/opencl-h2d-map.c \
+	gpu/instrumentation/kernel-data-map.c \
+	gpu/instrumentation/gtpin-instrumentation.c \
+	gpu/instrumentation/gtpin-correlation-id-map.c \
+	unwind/common/backtrace.c unwind/common/unw-throw.c \
+	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
+	unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
+	unwind/common/uw_hash.c unwind/common/uw_recipe_map.c \
 	unwind/generic-libunwind/libunw-unwind.c \
 	unwind/ppc64/ppc64-unwind.c \
 	unwind/ppc64/ppc64-unwind-interval.c \
@@ -585,13 +586,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 @OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_11 = sample-sources/perf/libhpcrun_la-perfmon-util-dummy.lo
 @OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_12 = sample-sources/perf/libhpcrun_la-kernel_blocking.lo
 @OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_13 = sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo
-@ENABLE_OPENCL_TRUE@am__objects_14 =  \
-@ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-intercept.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo
-am__objects_15 = utilities/libhpcrun_la-first_func.lo \
+am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-main.lo libhpcrun_la-disabled.lo \
 	libhpcrun_la-closure-registry.lo \
 	libhpcrun_la-cct_insert_backtrace.lo \
@@ -696,39 +691,38 @@ am__objects_15 = utilities/libhpcrun_la-first_func.lo \
 	utilities/libhpcrun_la-tokenize.lo \
 	utilities/libhpcrun_la-unlink.lo $(am__objects_7) \
 	$(am__objects_8) $(am__objects_9) $(am__objects_10) \
-	$(am__objects_11) $(am__objects_12) $(am__objects_13) \
-	$(am__objects_14)
-am__objects_16 = fnbounds/libhpcrun_la-fnbounds_client.lo \
+	$(am__objects_11) $(am__objects_12) $(am__objects_13)
+am__objects_15 = fnbounds/libhpcrun_la-fnbounds_client.lo \
 	fnbounds/libhpcrun_la-fnbounds_dynamic.lo \
 	monitor-exts/libhpcrun_la-openmp.lo \
 	libhpcrun_la-hpcrun_dlfns.lo \
 	libhpcrun_la-custom-init-dynamic.lo
-am__objects_17 = os/linux/libhpcrun_la-dylib.lo
-@HOST_OS_LINUX_TRUE@am__objects_18 = $(am__objects_17)
-am__objects_19 =  \
+am__objects_16 = os/linux/libhpcrun_la-dylib.lo
+@HOST_OS_LINUX_TRUE@am__objects_17 = $(am__objects_16)
+am__objects_18 =  \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-@HOST_CPU_MIPS_TRUE@am__objects_20 = $(am__objects_19)
-am__objects_21 = trampoline/ppc64/libhpcrun_la-ppc64-tramp.lo \
+@HOST_CPU_MIPS_TRUE@am__objects_19 = $(am__objects_18)
+am__objects_20 = trampoline/ppc64/libhpcrun_la-ppc64-tramp.lo \
 	utilities/arch/ppc64/libhpcrun_la-ppc64-context-pc.lo
-@HOST_CPU_PPC_TRUE@am__objects_22 = $(am__objects_21)
-am__objects_23 = trampoline/x86-family/libhpcrun_la-x86-tramp.lo \
+@HOST_CPU_PPC_TRUE@am__objects_21 = $(am__objects_20)
+am__objects_22 = trampoline/x86-family/libhpcrun_la-x86-tramp.lo \
 	utilities/arch/x86-family/libhpcrun_la-x86-context-pc.lo
-@HOST_CPU_X86_FAMILY_TRUE@am__objects_24 = $(am__objects_23)
-am__objects_25 = trampoline/ia64/libhpcrun_la-ia64-tramp.lo \
+@HOST_CPU_X86_FAMILY_TRUE@am__objects_23 = $(am__objects_22)
+am__objects_24 = trampoline/ia64/libhpcrun_la-ia64-tramp.lo \
 	utilities/arch/ia64/libhpcrun_la-ia64-context-pc.lo
-@HOST_CPU_IA64_TRUE@am__objects_26 = $(am__objects_25)
-am__objects_27 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
+@HOST_CPU_IA64_TRUE@am__objects_25 = $(am__objects_24)
+am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
 	utilities/arch/libunwind/libhpcrun_la-libunwind-context-pc.lo
-@HOST_CPU_AARCH64_TRUE@am__objects_28 = $(am__objects_27)
-@OPT_PAPI_CUPTI_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c-cupti.lo
-@OPT_PAPI_COMPONENT_FALSE@am__objects_30 =  \
+@HOST_CPU_AARCH64_TRUE@am__objects_27 = $(am__objects_26)
+@OPT_PAPI_CUPTI_TRUE@am__objects_28 = sample-sources/libhpcrun_la-papi-c-cupti.lo
+@OPT_PAPI_COMPONENT_FALSE@am__objects_29 =  \
 @OPT_PAPI_COMPONENT_FALSE@	sample-sources/libhpcrun_la-papi.lo \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_29)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_30 = sample-sources/libhpcrun_la-papi-c.lo \
+@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_28)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c.lo \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/libhpcrun_la-papi-c-extended-info.lo \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_29)
-@OPT_PAPI_DYNAMIC_TRUE@am__objects_31 = $(am__objects_30)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_32 =  \
+@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_28)
+@OPT_PAPI_DYNAMIC_TRUE@am__objects_30 = $(am__objects_29)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_31 =  \
 @OPT_ENABLE_CUPTI_TRUE@	sample-sources/libhpcrun_la-nvidia.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cubin-hash-map.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cubin-id-map.lo \
@@ -739,18 +733,18 @@ am__objects_27 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-analysis.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-api.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-gpu-api.lo
-@OPT_ENABLE_CUPTI_TRUE@am__objects_33 = $(am__objects_32)
-am__objects_34 = sample-sources/libhpcrun_la-upc.lo
-@OPT_ENABLE_UPC_TRUE@am__objects_35 = $(am__objects_34)
-am__objects_36 =
-@OPT_ENABLE_ROCM_TRUE@am__objects_37 =  \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = $(am__objects_31)
+am__objects_33 = sample-sources/libhpcrun_la-upc.lo
+@OPT_ENABLE_UPC_TRUE@am__objects_34 = $(am__objects_33)
+am__objects_35 =
+@OPT_ENABLE_ROCM_TRUE@am__objects_36 =  \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-debug-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-binary-processing.lo
-@OPT_ENABLE_ROCM_TRUE@am__objects_38 = $(am__objects_37)
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 =  \
+@OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_38 =  \
 @OPT_ENABLE_LEVEL0_TRUE@	sample-sources/libhpcrun_la-level0.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-api.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-command-list-map.lo \
@@ -758,24 +752,35 @@ am__objects_36 =
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-data-node.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-event-map.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-handle-map.lo
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_40 = $(am__objects_39)
-am__objects_41 = unwind/common/libhpcrun_la-backtrace.lo \
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = $(am__objects_38)
+@OPT_ENABLE_OPENCL_TRUE@am__objects_40 =  \
+@OPT_ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo
+@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
+@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo
+@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
+am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
 	unwind/common/libhpcrun_la-unw-throw.lo
-am__objects_42 = $(am__objects_41) \
+am__objects_45 = $(am__objects_44) \
 	unwind/common/libhpcrun_la-binarytree_uwi.lo \
 	unwind/common/libhpcrun_la-interval_t.lo \
 	unwind/common/libhpcrun_la-libunw_intervals.lo \
 	unwind/common/libhpcrun_la-stack_troll.lo \
 	unwind/common/libhpcrun_la-uw_hash.lo \
 	unwind/common/libhpcrun_la-uw_recipe_map.lo
-am__objects_43 = $(am__objects_42) \
+am__objects_46 = $(am__objects_45) \
 	unwind/generic-libunwind/libhpcrun_la-libunw-unwind.lo \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_44 = $(am__objects_42) \
+am__objects_47 = $(am__objects_45) \
 	unwind/ppc64/libhpcrun_la-ppc64-unwind.lo \
 	unwind/ppc64/libhpcrun_la-ppc64-unwind-interval.lo \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_45 = $(am__objects_42) \
+am__objects_48 = $(am__objects_45) \
 	unwind/x86-family/libhpcrun_la-x86-all.lo \
 	unwind/x86-family/libhpcrun_la-amd-xop.lo \
 	unwind/x86-family/libhpcrun_la-x86-cold-path.lo \
@@ -795,15 +800,16 @@ am__objects_45 = $(am__objects_42) \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-32bit-icc-variant.lo \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-fail-intervals.lo \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-pgi-mp_pexit.lo
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_46 = $(am__objects_45)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_46 = $(am__objects_44)
-@UNW_LIBUNW_TRUE@am__objects_46 = $(am__objects_43)
-am_libhpcrun_la_OBJECTS = $(am__objects_15) $(am__objects_16) \
-	$(am__objects_18) $(am__objects_20) $(am__objects_22) \
-	$(am__objects_24) $(am__objects_26) $(am__objects_28) \
-	$(am__objects_31) $(am__objects_33) $(am__objects_35) \
-	$(am__objects_36) $(am__objects_38) $(am__objects_40) \
-	$(am__objects_46) utilities/libhpcrun_la-last_func.lo
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_49 = $(am__objects_48)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_49 = $(am__objects_47)
+@UNW_LIBUNW_TRUE@am__objects_49 = $(am__objects_46)
+am_libhpcrun_la_OBJECTS = $(am__objects_14) $(am__objects_15) \
+	$(am__objects_17) $(am__objects_19) $(am__objects_21) \
+	$(am__objects_23) $(am__objects_25) $(am__objects_27) \
+	$(am__objects_30) $(am__objects_32) $(am__objects_34) \
+	$(am__objects_35) $(am__objects_37) $(am__objects_39) \
+	$(am__objects_41) $(am__objects_43) $(am__objects_49) \
+	utilities/libhpcrun_la-last_func.lo
 libhpcrun_la_OBJECTS = $(am_libhpcrun_la_OBJECTS)
 libhpcrun_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libhpcrun_la_CFLAGS) \
@@ -940,9 +946,6 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	sample-sources/perf/perfmon-util-dummy.c \
 	sample-sources/perf/kernel_blocking.c \
 	sample-sources/perf/kernel_blocking_stub.c \
-	sample-sources/opencl.c gpu/opencl/opencl-intercept.c \
-	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
-	gpu/opencl/opencl-activity-translate.c \
 	fnbounds/fnbounds_static.c custom-init-static.c \
 	unwind/common/default_validation_summary.c \
 	trampoline/ppc64/ppc64-tramp.s \
@@ -987,24 +990,19 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	unwind/x86-family/manual-intervals/x86-fail-intervals.c \
 	unwind/x86-family/manual-intervals/x86-pgi-mp_pexit.c \
 	utilities/last_func.c
-@HOST_CPU_PPC_TRUE@am__objects_47 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
-@HOST_CPU_PPC_FALSE@am__objects_48 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_49 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
+@HOST_CPU_PPC_TRUE@am__objects_50 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
+@HOST_CPU_PPC_FALSE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-linux_perf.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_event_open.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf-util.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_mmap.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_skid.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_50 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_51 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
-@ENABLE_OPENCL_TRUE@am__objects_54 = sample-sources/libhpcrun_o-opencl.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-intercept.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-api.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-memory-manager.$(OBJEXT) \
-@ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT)
-am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_55 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
+am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-main.$(OBJEXT) libhpcrun_o-disabled.$(OBJEXT) \
 	libhpcrun_o-closure-registry.$(OBJEXT) \
 	libhpcrun_o-cct_insert_backtrace.$(OBJEXT) \
@@ -1120,29 +1118,28 @@ am__objects_55 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	utilities/libhpcrun_o-line_wrapping.$(OBJEXT) \
 	utilities/libhpcrun_o-timer.$(OBJEXT) \
 	utilities/libhpcrun_o-tokenize.$(OBJEXT) \
-	utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_47) \
-	$(am__objects_48) $(am__objects_49) $(am__objects_50) \
+	utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_50) \
 	$(am__objects_51) $(am__objects_52) $(am__objects_53) \
-	$(am__objects_54)
-am__objects_56 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
+	$(am__objects_54) $(am__objects_55) $(am__objects_56)
+am__objects_58 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
 	libhpcrun_o-custom-init-static.$(OBJEXT)
-am__objects_57 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-@HOST_CPU_MIPS_TRUE@am__objects_58 = $(am__objects_57)
-am__objects_59 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
+am__objects_59 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
+@HOST_CPU_MIPS_TRUE@am__objects_60 = $(am__objects_59)
+am__objects_61 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
 	utilities/arch/ppc64/libhpcrun_o-ppc64-context-pc.$(OBJEXT)
-@HOST_CPU_PPC_TRUE@am__objects_60 = $(am__objects_59)
-am__objects_61 =  \
+@HOST_CPU_PPC_TRUE@am__objects_62 = $(am__objects_61)
+am__objects_63 =  \
 	trampoline/x86-family/libhpcrun_o-x86-tramp.$(OBJEXT) \
 	utilities/arch/x86-family/libhpcrun_o-x86-context-pc.$(OBJEXT)
-@HOST_CPU_X86_FAMILY_TRUE@am__objects_62 = $(am__objects_61)
-am__objects_63 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
+@HOST_CPU_X86_FAMILY_TRUE@am__objects_64 = $(am__objects_63)
+am__objects_65 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
 	utilities/arch/ia64/libhpcrun_o-ia64-context-pc.$(OBJEXT)
-@HOST_CPU_IA64_TRUE@am__objects_64 = $(am__objects_63)
-am__objects_65 =  \
+@HOST_CPU_IA64_TRUE@am__objects_66 = $(am__objects_65)
+am__objects_67 =  \
 	trampoline/aarch64/libhpcrun_o-aarch64-tramp.$(OBJEXT) \
 	utilities/arch/libunwind/libhpcrun_o-libunwind-context-pc.$(OBJEXT)
-@HOST_CPU_AARCH64_TRUE@am__objects_66 = $(am__objects_65)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_67 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
+@HOST_CPU_AARCH64_TRUE@am__objects_68 = $(am__objects_67)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-hash-map.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-id-map.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-symbols.$(OBJEXT) \
@@ -1152,33 +1149,33 @@ am__objects_65 =  \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-analysis.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-api.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-gpu-api.$(OBJEXT)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_68 = $(am__objects_67)
-@OPT_PAPI_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
-@OPT_PAPI_COMPONENT_FALSE@am__objects_70 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_69)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_70 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = $(am__objects_69)
+@OPT_PAPI_CUPTI_TRUE@am__objects_71 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
+@OPT_PAPI_COMPONENT_FALSE@am__objects_72 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_71)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/libhpcrun_o-papi-c-extended-info.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_69)
-@OPT_PAPI_STATIC_TRUE@am__objects_71 = $(am__objects_70)
-am__objects_72 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
-@OPT_ENABLE_UPC_TRUE@am__objects_73 = $(am__objects_72)
-am__objects_74 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_71)
+@OPT_PAPI_STATIC_TRUE@am__objects_73 = $(am__objects_72)
+am__objects_74 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
+@OPT_ENABLE_UPC_TRUE@am__objects_75 = $(am__objects_74)
+am__objects_76 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
 	unwind/common/libhpcrun_o-unw-throw.$(OBJEXT)
-am__objects_75 = $(am__objects_74) \
+am__objects_77 = $(am__objects_76) \
 	unwind/common/libhpcrun_o-binarytree_uwi.$(OBJEXT) \
 	unwind/common/libhpcrun_o-interval_t.$(OBJEXT) \
 	unwind/common/libhpcrun_o-libunw_intervals.$(OBJEXT) \
 	unwind/common/libhpcrun_o-stack_troll.$(OBJEXT) \
 	unwind/common/libhpcrun_o-uw_hash.$(OBJEXT) \
 	unwind/common/libhpcrun_o-uw_recipe_map.$(OBJEXT)
-am__objects_76 = $(am__objects_75) \
+am__objects_78 = $(am__objects_77) \
 	unwind/generic-libunwind/libhpcrun_o-libunw-unwind.$(OBJEXT) \
 	unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_77 = $(am__objects_75) \
+am__objects_79 = $(am__objects_77) \
 	unwind/ppc64/libhpcrun_o-ppc64-unwind.$(OBJEXT) \
 	unwind/ppc64/libhpcrun_o-ppc64-unwind-interval.$(OBJEXT) \
 	unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_78 = $(am__objects_75) \
+am__objects_80 = $(am__objects_77) \
 	unwind/x86-family/libhpcrun_o-x86-all.$(OBJEXT) \
 	unwind/x86-family/libhpcrun_o-amd-xop.$(OBJEXT) \
 	unwind/x86-family/libhpcrun_o-x86-cold-path.$(OBJEXT) \
@@ -1198,17 +1195,17 @@ am__objects_78 = $(am__objects_75) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-32bit-icc-variant.$(OBJEXT) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-fail-intervals.$(OBJEXT) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-pgi-mp_pexit.$(OBJEXT)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_79 = $(am__objects_78)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_79 = $(am__objects_77)
-@UNW_LIBUNW_TRUE@am__objects_79 = $(am__objects_76)
-am_libhpcrun_o_OBJECTS = $(am__objects_55) $(am__objects_56) \
-	$(am__objects_58) $(am__objects_60) $(am__objects_62) \
-	$(am__objects_64) $(am__objects_66) $(am__objects_68) \
-	$(am__objects_71) $(am__objects_73) $(am__objects_36) \
-	$(am__objects_79) utilities/libhpcrun_o-last_func.$(OBJEXT)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_81 = $(am__objects_80)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_81 = $(am__objects_79)
+@UNW_LIBUNW_TRUE@am__objects_81 = $(am__objects_78)
+am_libhpcrun_o_OBJECTS = $(am__objects_57) $(am__objects_58) \
+	$(am__objects_60) $(am__objects_62) $(am__objects_64) \
+	$(am__objects_66) $(am__objects_68) $(am__objects_70) \
+	$(am__objects_73) $(am__objects_75) $(am__objects_35) \
+	$(am__objects_81) utilities/libhpcrun_o-last_func.$(OBJEXT)
 libhpcrun_o_OBJECTS = $(am_libhpcrun_o_OBJECTS)
 libhpcrun_o_DEPENDENCIES = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_24)
+	$(am__append_23)
 libhpcrun_o_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libhpcrun_o_CFLAGS) \
 	$(CFLAGS) $(libhpcrun_o_LDFLAGS) $(LDFLAGS) -o $@
@@ -1501,9 +1498,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
@@ -1689,10 +1695,10 @@ bin_SCRIPTS = $(am__append_4) $(am__append_6)
 pkglibexec_SCRIPTS = $(am__append_1)
 include_HEADERS = $(am__append_2)
 pkglib_LIBRARIES = $(am__append_5)
-pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_135) \
-	$(am__append_136)
-BUILT_SOURCES = $(am__append_21)
-CLEANFILES = $(am__append_22)
+pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_141) \
+	$(am__append_142)
+BUILT_SOURCES = $(am__append_20)
+CLEANFILES = $(am__append_21)
 PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@ 
 PAPI_LD_FLGS = @OPT_PAPI_LDFLAGS@
 CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@
@@ -1778,10 +1784,10 @@ UNW_MIPS_INCLUDE_DIRS = \
 
 UNW_MIPS_LD_FLAGS = 
 MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
-	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_18) \
-	$(am__append_103) $(am__append_107) $(am__append_109) \
-	$(am__append_113) $(am__append_128) $(am__append_132) \
-	$(am__append_134)
+	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_17) \
+	$(am__append_102) $(am__append_106) $(am__append_108) \
+	$(am__append_112) $(am__append_127) $(am__append_131) \
+	$(am__append_135) $(am__append_140)
 MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -1838,8 +1844,7 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	utilities/timer.c utilities/tokenize.h utilities/tokenize.c \
 	utilities/unlink.h utilities/unlink.c $(am__append_8) \
 	$(am__append_9) $(am__append_10) $(am__append_12) \
-	$(am__append_13) $(am__append_14) $(am__append_15) \
-	$(am__append_17)
+	$(am__append_13) $(am__append_14) $(am__append_15)
 MY_DYNAMIC_FILES = \
 	fnbounds/fnbounds_client.c	\
 	fnbounds/fnbounds_dynamic.c	\
@@ -1891,6 +1896,17 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cupti-api.c			\
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cupti-gpu-api.c		
 
+@OPT_ENABLE_OPENCL_TRUE@MY_OPENCL_FILES = sample-sources/opencl.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-h2d-map.c 
+
+@OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-instrumentation.c \
+@OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/gtpin-correlation-id-map.c
+
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
@@ -1944,15 +1960,16 @@ MY_AARCH64_INCLUDE_DIRS = \
 	-I$(srcdir)/utilities/arch/aarch64
 
 libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \
-	$(am__append_25) $(am__append_26) $(am__append_39) \
-	$(am__append_54) $(am__append_72) $(am__append_85) \
-	$(am__append_100) $(am__append_104) $(am__append_114) \
-	$(am__append_121) $(am__append_125) $(am__append_129) \
-	$(UNW_SOURCE_FILES) utilities/last_func.c
+	$(am__append_24) $(am__append_25) $(am__append_38) \
+	$(am__append_53) $(am__append_71) $(am__append_84) \
+	$(am__append_99) $(am__append_103) $(am__append_113) \
+	$(am__append_120) $(am__append_124) $(am__append_128) \
+	$(am__append_132) $(am__append_136) $(UNW_SOURCE_FILES) \
+	utilities/last_func.c
 libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \
-	$(am__append_27) $(am__append_40) $(am__append_55) \
-	$(am__append_73) $(am__append_86) $(am__append_105) \
-	$(am__append_110) $(am__append_115) $(am__append_124) \
+	$(am__append_26) $(am__append_39) $(am__append_54) \
+	$(am__append_72) $(am__append_85) $(am__append_104) \
+	$(am__append_109) $(am__append_114) $(am__append_123) \
 	$(UNW_SOURCE_FILES) utilities/last_func.c
 libhpcrun_wrap_a_SOURCES = \
 	monitor-exts/openmp.c
@@ -1997,58 +2014,58 @@ libhpctoolkit_a_SOURCES = \
 # cppflags
 #-----------------------------------------------------------
 libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(LIBUNWIND_CPPFLAGS_DYN) \
-	$(MY_INCLUDE_DIRS) $(am__append_19) $(am__append_28) \
-	$(am__append_41) $(am__append_56) $(am__append_74) \
-	$(am__append_87) $(am__append_101) $(am__append_106) \
-	$(am__append_108) $(am__append_116) $(am__append_119) \
-	$(am__append_122) $(am__append_126) $(am__append_130) \
-	$(UNW_INCLUDE_DIRS)
+	$(MY_INCLUDE_DIRS) $(am__append_18) $(am__append_27) \
+	$(am__append_40) $(am__append_55) $(am__append_73) \
+	$(am__append_86) $(am__append_100) $(am__append_105) \
+	$(am__append_107) $(am__append_115) $(am__append_118) \
+	$(am__append_121) $(am__append_125) $(am__append_129) \
+	$(am__append_133) $(am__append_137) $(UNW_INCLUDE_DIRS)
 libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(LIBUNWIND_CPPFLAGS_STAT) $(MY_INCLUDE_DIRS) $(am__append_20) \
-	$(am__append_29) $(am__append_42) $(am__append_57) \
-	$(am__append_75) $(am__append_88) $(am__append_111) \
-	$(am__append_117) $(am__append_120) $(UNW_INCLUDE_DIRS)
+	$(LIBUNWIND_CPPFLAGS_STAT) $(MY_INCLUDE_DIRS) $(am__append_19) \
+	$(am__append_28) $(am__append_41) $(am__append_56) \
+	$(am__append_74) $(am__append_87) $(am__append_110) \
+	$(am__append_116) $(am__append_119) $(UNW_INCLUDE_DIRS)
 libhpcrun_wrap_a_CPPFLAGS = \
 	-DHPCRUN_STATIC_LINK		\
 	$(MY_CPP_DEFINES)		\
 	$(MY_INCLUDE_DIRS)
 
 libhpcrun_ga_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_30) $(am__append_43) $(am__append_60) \
-	$(am__append_76) $(am__append_89) $(UNW_INCLUDE_DIRS)
+	$(am__append_29) $(am__append_42) $(am__append_59) \
+	$(am__append_75) $(am__append_88) $(UNW_INCLUDE_DIRS)
 libhpcrun_ga_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_31) $(am__append_44) \
-	$(am__append_61) $(am__append_77) $(am__append_90) \
+	$(MY_INCLUDE_DIRS) $(am__append_30) $(am__append_43) \
+	$(am__append_60) $(am__append_76) $(am__append_89) \
 	$(UNW_INCLUDE_DIRS)
 libhpcrun_gprof_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_45) $(am__append_62) $(am__append_91)
+	$(am__append_44) $(am__append_61) $(am__append_90)
 libhpcrun_gprof_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_46) \
-	$(am__append_63) $(am__append_92)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_45) \
+	$(am__append_62) $(am__append_91)
 libhpcrun_io_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_32) $(am__append_47) $(am__append_64) \
-	$(am__append_78) $(am__append_93) $(UNW_INCLUDE_DIRS)
+	$(am__append_31) $(am__append_46) $(am__append_63) \
+	$(am__append_77) $(am__append_92) $(UNW_INCLUDE_DIRS)
 libhpcrun_io_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_33) $(am__append_48) \
-	$(am__append_65) $(am__append_79) $(am__append_94) \
+	$(MY_INCLUDE_DIRS) $(am__append_32) $(am__append_47) \
+	$(am__append_64) $(am__append_78) $(am__append_93) \
 	$(UNW_INCLUDE_DIRS)
 libhpcrun_memleak_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_34) $(am__append_49) $(am__append_66) \
-	$(am__append_80) $(am__append_95) $(UNW_INCLUDE_DIRS)
+	$(am__append_33) $(am__append_48) $(am__append_65) \
+	$(am__append_79) $(am__append_94) $(UNW_INCLUDE_DIRS)
 libhpcrun_memleak_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_35) \
-	$(am__append_50) $(am__append_67) $(am__append_81) \
-	$(am__append_96) $(UNW_INCLUDE_DIRS)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_34) \
+	$(am__append_49) $(am__append_66) $(am__append_80) \
+	$(am__append_95) $(UNW_INCLUDE_DIRS)
 libhpcrun_pthread_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_36) $(am__append_51) $(am__append_68) \
-	$(am__append_82) $(am__append_97) $(UNW_INCLUDE_DIRS)
+	$(am__append_35) $(am__append_50) $(am__append_67) \
+	$(am__append_81) $(am__append_96) $(UNW_INCLUDE_DIRS)
 libhpcrun_pthread_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_37) \
-	$(am__append_52) $(am__append_69) $(am__append_83) \
-	$(am__append_98) $(UNW_INCLUDE_DIRS)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_36) \
+	$(am__append_51) $(am__append_68) $(am__append_82) \
+	$(am__append_97) $(UNW_INCLUDE_DIRS)
 libhpcrun_mpi_la_CPPFLAGS = $(MY_CPP_DEFINES) -I$(MPI_INC) \
-	$(MY_INCLUDE_DIRS) $(am__append_38) $(am__append_53) \
-	$(am__append_70) $(am__append_84) $(am__append_99) \
+	$(MY_INCLUDE_DIRS) $(am__append_37) $(am__append_52) \
+	$(am__append_69) $(am__append_83) $(am__append_98) \
 	$(UNW_INCLUDE_DIRS)
 libhpctoolkit_la_CPPFLAGS = \
 	$(MY_CPP_DEFINES)		\
@@ -2064,8 +2081,8 @@ libhpctoolkit_a_CPPFLAGS = \
 # cflags
 #-----------------------------------------------------------
 libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
-	$(am__append_123) $(am__append_127) $(am__append_131) \
-	$(am__append_133) $(GOTCHA_IFLAGS)
+	$(am__append_122) $(am__append_126) $(am__append_130) \
+	$(am__append_134) $(am__append_138) $(GOTCHA_IFLAGS)
 libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
 libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2084,14 +2101,15 @@ libhpcrun_mpi_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 # ldflags
 #-----------------------------------------------------------
 libhpcrun_la_LIBADD = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_23)
+	$(am__append_22)
 libhpcrun_o_LDADD = $(HPCLIB_ProfLean) $(HPCLIB_SupportLean) \
-	$(am__append_24)
+	$(am__append_23)
 libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \
 	-lpthread -lrt -lelf -L$(LIBELF_LIB) $(LIBUNWIND_LDFLAGS_DYN) \
 	$(LZMA_LDFLAGS_DYN) $(PERFMON_LDFLAGS_DYN) $(MBEDTLS_LIBS) \
-	$(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \
-	$(am__append_118) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
+	$(OPT_ROCM_LDFLAGS) $(am__append_57) $(am__append_101) \
+	$(am__append_117) $(am__append_139) $(GOTCHA_LDFLAGS) \
+	$(UNW_DYNAMIC_LD_FLAGS)
 libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_gprof_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_io_la_LDFLAGS = -Wl,-Bsymbolic
@@ -2099,9 +2117,9 @@ libhpcrun_memleak_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_pthread_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_mpi_la_LDFLAGS = -Wl,-Bsymbolic
 libhpcrun_o_LDFLAGS = $(LIBUNWIND_LDFLAGS_STAT) \
-	$(PERFMON_LDFLAGS_STAT) $(am__append_59) $(am__append_112) \
+	$(PERFMON_LDFLAGS_STAT) $(am__append_58) $(am__append_111) \
 	$(UNW_STATIC_LD_FLAGS)
-MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \
+MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_70) \
 	$(UNW_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
 @HOST_CPU_PPC_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
@@ -2653,25 +2671,6 @@ sample-sources/perf/libhpcrun_la-kernel_blocking.lo:  \
 sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo:  \
 	sample-sources/perf/$(am__dirstamp) \
 	sample-sources/perf/$(DEPDIR)/$(am__dirstamp)
-sample-sources/libhpcrun_la-opencl.lo: sample-sources/$(am__dirstamp) \
-	sample-sources/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/$(am__dirstamp):
-	@$(MKDIR_P) gpu/opencl
-	@: > gpu/opencl/$(am__dirstamp)
-gpu/opencl/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) gpu/opencl/$(DEPDIR)
-	@: > gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-intercept.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_la-fnbounds_client.lo: fnbounds/$(am__dirstamp) \
 	fnbounds/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_la-fnbounds_dynamic.lo: fnbounds/$(am__dirstamp) \
@@ -2848,6 +2847,39 @@ gpu/level0/libhpcrun_la-level0-event-map.lo:  \
 gpu/level0/libhpcrun_la-level0-handle-map.lo:  \
 	gpu/level0/$(am__dirstamp) \
 	gpu/level0/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-opencl.lo: sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/$(am__dirstamp):
+	@$(MKDIR_P) gpu/opencl
+	@: > gpu/opencl/$(am__dirstamp)
+gpu/opencl/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/opencl/$(DEPDIR)
+	@: > gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-memory-manager.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/$(am__dirstamp):
+	@$(MKDIR_P) gpu/instrumentation
+	@: > gpu/instrumentation/$(am__dirstamp)
+gpu/instrumentation/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/instrumentation/$(DEPDIR)
+	@: > gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-kernel-data-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo:  \
+	gpu/instrumentation/$(am__dirstamp) \
+	gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_la-backtrace.lo:  \
 	unwind/common/$(am__dirstamp) \
 	unwind/common/$(DEPDIR)/$(am__dirstamp)
@@ -3293,21 +3325,6 @@ sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT):  \
 sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT):  \
 	sample-sources/perf/$(am__dirstamp) \
 	sample-sources/perf/$(DEPDIR)/$(am__dirstamp)
-sample-sources/libhpcrun_o-opencl.$(OBJEXT):  \
-	sample-sources/$(am__dirstamp) \
-	sample-sources/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-intercept.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-api.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-memory-manager.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_o-opencl-activity-translate.$(OBJEXT):  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT):  \
 	fnbounds/$(am__dirstamp) fnbounds/$(DEPDIR)/$(am__dirstamp)
 unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT):  \
@@ -3560,6 +3577,8 @@ mostlyclean-compile:
 	-rm -f gpu/*.lo
 	-rm -f gpu/amd/*.$(OBJEXT)
 	-rm -f gpu/amd/*.lo
+	-rm -f gpu/instrumentation/*.$(OBJEXT)
+	-rm -f gpu/instrumentation/*.lo
 	-rm -f gpu/level0/*.$(OBJEXT)
 	-rm -f gpu/level0/*.lo
 	-rm -f gpu/nvidia/*.$(OBJEXT)
@@ -3779,6 +3798,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-list-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/level0/$(DEPDIR)/libhpcrun_la-level0-command-process.Plo@am__quote@
@@ -3805,12 +3827,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_cilk_la-agent-cilk.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_pthread_la-agent-pthread.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_tbb_la-agent-tbb.Plo@am__quote@
@@ -3906,7 +3924,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-nvidia.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-idle.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-mutex.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-cupti.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-extended-info.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po@am__quote@
@@ -5203,41 +5220,6 @@ sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo: sample-sources/perf/ke
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/perf/libhpcrun_la-kernel_blocking_stub.lo `test -f 'sample-sources/perf/kernel_blocking_stub.c' || echo '$(srcdir)/'`sample-sources/perf/kernel_blocking_stub.c
 
-sample-sources/libhpcrun_la-opencl.lo: sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-opencl.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_la-opencl.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-
-gpu/opencl/libhpcrun_la-opencl-intercept.lo: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-intercept.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-intercept.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_la-opencl-intercept.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-intercept.lo `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-
-gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-api.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_la-opencl-api.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-
-gpu/opencl/libhpcrun_la-opencl-memory-manager.lo: gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-memory-manager.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_la-opencl-memory-manager.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-
-gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-activity-translate.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_la-opencl-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-
 fnbounds/libhpcrun_la-fnbounds_client.lo: fnbounds/fnbounds_client.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT fnbounds/libhpcrun_la-fnbounds_client.lo -MD -MP -MF fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Tpo -c -o fnbounds/libhpcrun_la-fnbounds_client.lo `test -f 'fnbounds/fnbounds_client.c' || echo '$(srcdir)/'`fnbounds/fnbounds_client.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Tpo fnbounds/$(DEPDIR)/libhpcrun_la-fnbounds_client.Plo
@@ -5511,6 +5493,62 @@ gpu/level0/libhpcrun_la-level0-handle-map.lo: gpu/level0/level0-handle-map.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/level0/libhpcrun_la-level0-handle-map.lo `test -f 'gpu/level0/level0-handle-map.c' || echo '$(srcdir)/'`gpu/level0/level0-handle-map.c
 
+sample-sources/libhpcrun_la-opencl.lo: sample-sources/opencl.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-opencl.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_la-opencl.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-opencl.lo `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
+
+gpu/opencl/libhpcrun_la-opencl-api.lo: gpu/opencl/opencl-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-api.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_la-opencl-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-api.lo `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
+
+gpu/opencl/libhpcrun_la-opencl-memory-manager.lo: gpu/opencl/opencl-memory-manager.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-memory-manager.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_la-opencl-memory-manager.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-memory-manager.lo `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
+
+gpu/opencl/libhpcrun_la-opencl-activity-translate.lo: gpu/opencl/opencl-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-activity-translate.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_la-opencl-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-activity-translate.lo `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
+
+gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/opencl-h2d-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-h2d-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-h2d-map.lo `test -f 'gpu/opencl/opencl-h2d-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-h2d-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-h2d-map.c' object='gpu/opencl/libhpcrun_la-opencl-h2d-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-h2d-map.lo `test -f 'gpu/opencl/opencl-h2d-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-h2d-map.c
+
+gpu/instrumentation/libhpcrun_la-kernel-data-map.lo: gpu/instrumentation/kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/kernel-data-map.c' object='gpu/instrumentation/libhpcrun_la-kernel-data-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
+
+gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo: gpu/instrumentation/gtpin-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo `test -f 'gpu/instrumentation/gtpin-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-instrumentation.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-instrumentation.c' object='gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo `test -f 'gpu/instrumentation/gtpin-instrumentation.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-instrumentation.c
+
+gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo: gpu/instrumentation/gtpin-correlation-id-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo `test -f 'gpu/instrumentation/gtpin-correlation-id-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-correlation-id-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/instrumentation/gtpin-correlation-id-map.c' object='gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo `test -f 'gpu/instrumentation/gtpin-correlation-id-map.c' || echo '$(srcdir)/'`gpu/instrumentation/gtpin-correlation-id-map.c
+
 unwind/common/libhpcrun_la-backtrace.lo: unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT unwind/common/libhpcrun_la-backtrace.lo -MD -MP -MF unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo -c -o unwind/common/libhpcrun_la-backtrace.lo `test -f 'unwind/common/backtrace.c' || echo '$(srcdir)/'`unwind/common/backtrace.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Tpo unwind/common/$(DEPDIR)/libhpcrun_la-backtrace.Plo
@@ -7667,76 +7705,6 @@ sample-sources/perf/libhpcrun_o-kernel_blocking_stub.obj: sample-sources/perf/ke
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/perf/libhpcrun_o-kernel_blocking_stub.obj `if test -f 'sample-sources/perf/kernel_blocking_stub.c'; then $(CYGPATH_W) 'sample-sources/perf/kernel_blocking_stub.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/perf/kernel_blocking_stub.c'; fi`
 
-sample-sources/libhpcrun_o-opencl.o: sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-opencl.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo -c -o sample-sources/libhpcrun_o-opencl.o `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_o-opencl.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-opencl.o `test -f 'sample-sources/opencl.c' || echo '$(srcdir)/'`sample-sources/opencl.c
-
-sample-sources/libhpcrun_o-opencl.obj: sample-sources/opencl.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-opencl.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo -c -o sample-sources/libhpcrun_o-opencl.obj `if test -f 'sample-sources/opencl.c'; then $(CYGPATH_W) 'sample-sources/opencl.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/opencl.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-opencl.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/opencl.c' object='sample-sources/libhpcrun_o-opencl.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-opencl.obj `if test -f 'sample-sources/opencl.c'; then $(CYGPATH_W) 'sample-sources/opencl.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/opencl.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-intercept.o: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-intercept.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-intercept.o `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_o-opencl-intercept.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-intercept.o `test -f 'gpu/opencl/opencl-intercept.c' || echo '$(srcdir)/'`gpu/opencl/opencl-intercept.c
-
-gpu/opencl/libhpcrun_o-opencl-intercept.obj: gpu/opencl/opencl-intercept.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-intercept.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-intercept.obj `if test -f 'gpu/opencl/opencl-intercept.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-intercept.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-intercept.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-intercept.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-intercept.c' object='gpu/opencl/libhpcrun_o-opencl-intercept.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-intercept.obj `if test -f 'gpu/opencl/opencl-intercept.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-intercept.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-intercept.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-api.o: gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-api.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-api.o `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_o-opencl-api.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-api.o `test -f 'gpu/opencl/opencl-api.c' || echo '$(srcdir)/'`gpu/opencl/opencl-api.c
-
-gpu/opencl/libhpcrun_o-opencl-api.obj: gpu/opencl/opencl-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-api.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-api.obj `if test -f 'gpu/opencl/opencl-api.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-api.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-api.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-api.c' object='gpu/opencl/libhpcrun_o-opencl-api.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-api.obj `if test -f 'gpu/opencl/opencl-api.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-api.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-memory-manager.o: gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-memory-manager.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.o `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_o-opencl-memory-manager.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.o `test -f 'gpu/opencl/opencl-memory-manager.c' || echo '$(srcdir)/'`gpu/opencl/opencl-memory-manager.c
-
-gpu/opencl/libhpcrun_o-opencl-memory-manager.obj: gpu/opencl/opencl-memory-manager.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-memory-manager.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.obj `if test -f 'gpu/opencl/opencl-memory-manager.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-memory-manager.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-memory-manager.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-memory-manager.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-memory-manager.c' object='gpu/opencl/libhpcrun_o-opencl-memory-manager.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-memory-manager.obj `if test -f 'gpu/opencl/opencl-memory-manager.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-memory-manager.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-memory-manager.c'; fi`
-
-gpu/opencl/libhpcrun_o-opencl-activity-translate.o: gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-activity-translate.o -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.o `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_o-opencl-activity-translate.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.o `test -f 'gpu/opencl/opencl-activity-translate.c' || echo '$(srcdir)/'`gpu/opencl/opencl-activity-translate.c
-
-gpu/opencl/libhpcrun_o-opencl-activity-translate.obj: gpu/opencl/opencl-activity-translate.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_o-opencl-activity-translate.obj -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.obj `if test -f 'gpu/opencl/opencl-activity-translate.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-activity-translate.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_o-opencl-activity-translate.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-activity-translate.c' object='gpu/opencl/libhpcrun_o-opencl-activity-translate.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_o-opencl-activity-translate.obj `if test -f 'gpu/opencl/opencl-activity-translate.c'; then $(CYGPATH_W) 'gpu/opencl/opencl-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/opencl/opencl-activity-translate.c'; fi`
-
 fnbounds/libhpcrun_o-fnbounds_static.o: fnbounds/fnbounds_static.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT fnbounds/libhpcrun_o-fnbounds_static.o -MD -MP -MF fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Tpo -c -o fnbounds/libhpcrun_o-fnbounds_static.o `test -f 'fnbounds/fnbounds_static.c' || echo '$(srcdir)/'`fnbounds/fnbounds_static.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Tpo fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Po
@@ -8529,6 +8497,7 @@ clean-libtool:
 	-rm -rf fnbounds/.libs fnbounds/_libs
 	-rm -rf gpu/.libs gpu/_libs
 	-rm -rf gpu/amd/.libs gpu/amd/_libs
+	-rm -rf gpu/instrumentation/.libs gpu/instrumentation/_libs
 	-rm -rf gpu/level0/.libs gpu/level0/_libs
 	-rm -rf gpu/nvidia/.libs gpu/nvidia/_libs
 	-rm -rf gpu/opencl/.libs gpu/opencl/_libs
@@ -8781,6 +8750,8 @@ distclean-generic:
 	-rm -f gpu/$(am__dirstamp)
 	-rm -f gpu/amd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f gpu/amd/$(am__dirstamp)
+	-rm -f gpu/instrumentation/$(DEPDIR)/$(am__dirstamp)
+	-rm -f gpu/instrumentation/$(am__dirstamp)
 	-rm -f gpu/level0/$(DEPDIR)/$(am__dirstamp)
 	-rm -f gpu/level0/$(am__dirstamp)
 	-rm -f gpu/nvidia/$(DEPDIR)/$(am__dirstamp)
@@ -8853,7 +8824,7 @@ clean-am: clean-generic clean-libtool clean-noinstPROGRAMS \
 	clean-pkglibexecPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
@@ -8903,7 +8874,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) cct/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h
new file mode 100644
index 0000000000..6f03b6fe76
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-correlation-id-map.h
@@ -0,0 +1,141 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gtpin_correlation_id_map_h
+#define gtpin_correlation_id_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct gtpin_correlation_id_map_entry_t gtpin_correlation_id_map_entry_t;
+
+typedef struct gpu_op_ccts_t gpu_op_ccts_t;
+
+typedef struct gpu_activity_channel_t gpu_activity_channel_t;
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+gtpin_correlation_id_map_entry_t *
+gtpin_correlation_id_map_lookup
+(
+ uint64_t gtpin_correlation_id
+);
+
+
+void
+gtpin_correlation_id_map_insert
+(
+ uint64_t gtpin_correlation_id,
+ gpu_op_ccts_t *op_ccts,
+ gpu_activity_channel_t *activity_channel,
+ uint64_t submit_time
+);
+
+
+void
+gtpin_correlation_id_map_delete
+(
+ uint64_t gtpin_correlation_id
+);
+
+
+gpu_op_ccts_t
+gtpin_correlation_id_map_entry_op_ccts_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+);
+
+// NOTE(review): the cl_* structs below use gpu_activity_kind_t, gpu_memcpy_type_t, cct_node_t, bool and size_t; this header includes only <stdint.h>, so includers must already provide them.
+typedef struct cl_basic_callback_t {
+  uint64_t correlation_id;
+  gpu_activity_kind_t kind;
+  gpu_memcpy_type_t type;
+  cct_node_t *cct_node;
+} cl_basic_callback_t;
+
+
+typedef struct cl_kernel_callback_t {
+  uint64_t correlation_id;
+} cl_kernel_callback_t;
+
+
+typedef struct cl_memory_callback_t {
+  uint64_t correlation_id;
+  gpu_memcpy_type_t type;
+  bool fromHostToDevice;
+  bool fromDeviceToHost;
+  size_t size;
+} cl_memory_callback_t;
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+gpu_activity_channel_t *
+gtpin_correlation_id_map_entry_activity_channel_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+);
+
+
+uint64_t 
+gtpin_correlation_id_map_entry_submit_time_get
+(
+ gtpin_correlation_id_map_entry_t *entry
+);
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
new file mode 100644
index 0000000000..37d539c807
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -0,0 +1,535 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <assert.h>
+#include <stdlib.h>
+#include <errno.h>     // errno
+#include <fcntl.h>     // open
+#include <sys/stat.h>  // mkdir
+#include <dirent.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <gtpin.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/safe-sampling.h>
+#include <hpcrun/cct/cct.h>
+#include <hpcrun/memory/hpcrun-malloc.h>
+#include <hpcrun/files.h>
+#include <hpcrun/gpu/gpu-activity-process.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-application-thread-api.h>
+#include <hpcrun/gpu/gpu-correlation.h>
+#include <hpcrun/gpu/gpu-correlation-channel.h>
+#include <hpcrun/gpu/gpu-host-correlation-map.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+#include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-monitoring-thread-api.h>
+#include <hpcrun/utilities/hpcrun-nanotime.h>
+
+#include <lib/prof-lean/crypto-hash.h>
+#include <lib/prof-lean/spinlock.h>
+
+#include "gtpin-correlation-id-map.h"
+#include "gtpin-instrumentation.h"
+#include "kernel-data.h"
+#include "kernel-data-map.h"
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+#define MAX_STR_SIZE 1024
+
+// TODO(Aaron): Why there are so many correlation ids
+static atomic_ullong correlation_id;
+
+static spinlock_t files_lock = SPINLOCK_UNLOCKED;
+
+static bool gtpin_use_runtime_callstack = false;
+
+static __thread uint64_t gtpin_correlation_id = 0;
+static __thread uint64_t gtpin_cpu_submit_time = 0;
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+// Look up the GTPin knob called `name` and add `value` to it as a KNOB_TYPE_BOOL value.
+static void
+knobAddBool
+(
+ const char *name,
+ bool value
+)
+{
+  GTPinKnob knob = KNOB_FindArg(name);
+  assert(knob != NULL);  // a missing knob is caught only by this assert
+  KnobValue knob_value;
+  knob_value.value._bool = value;
+  knob_value.type = KNOB_TYPE_BOOL;
+  KNOB_STATUS status = KNOB_AddValue(knob, &knob_value);
+  assert(status == KNOB_STATUS_SUCCESS);  // NOTE: asserts compile away under NDEBUG, leaving both results unchecked
+}
+
+// Reset the shared correlation-id counter to its starting value.
+void
+initializeInstrumentation
+(
+ void
+)
+{
+  atomic_store(&correlation_id, 100000000);  // to avoid conflict with opencl operation correlation ids, we start instrumentation ids at 100000000 (TODO(Aaron):FIX)
+}
+
+// Return the current value of the shared counter and atomically advance it by one.
+static uint64_t
+getCorrelationId
+(
+ void
+)
+{
+  return atomic_fetch_add(&correlation_id, 1);
+}
+
+// Timestamp `correlation_id`, then either stash both in thread-locals or insert a kernel placeholder under the API node and map the id to it.
+static void
+createKernelNode
+(
+ uint64_t correlation_id  // NOTE: shadows the file-scope atomic counter of the same name
+)
+{
+  uint64_t cpu_submit_time = hpcrun_nanotime();
+
+  if (gtpin_use_runtime_callstack) {
+    // XXX(Keren): gtpin's call stack is a mess, better to use opencl's call path
+    // onKernelRun->clEnqueueNDRangeKernel_wrapper->opencl_subscriber_callback
+    gtpin_correlation_id = correlation_id;  // thread-local; only stored here, never read in this function
+    gtpin_cpu_submit_time = cpu_submit_time;
+  } else {
+    cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
+
+    gpu_op_ccts_t gpu_op_ccts;
+    gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
+    gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, gpu_placeholder_type_kernel);
+
+    hpcrun_safe_enter();  // the cct insertion is bracketed by safe_enter/safe_exit
+    gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_op_placeholder_flags);
+    hpcrun_safe_exit();
+
+    gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+    gtpin_correlation_id_map_insert(correlation_id, &gpu_op_ccts, activity_channel, cpu_submit_time);
+  }
+}
+
+// Create `file_name` exclusively and write `binary_size` bytes of `binary` to it.
+// Returns true when the file already exists or was written in full, and false
+// on a short or failed write. Any other failure to open is handed to
+// hpcrun_abort.
+static bool
+writeBinary
+(
+ const char *file_name,
+ const void *binary,
+ size_t binary_size
+)
+{
+  // O_EXCL turns "file is already there" into an EEXIST failure
+  int fd = open(file_name, O_WRONLY | O_CREAT | O_EXCL, 0644);
+  if (fd < 0) {
+    // errno is only meaningful after a failed call, so it is inspected here
+    // and nowhere else; this path has no descriptor to close.
+    if (errno == EEXIST) {
+      return true;
+    }
+    // Failure to open is a fatal error.
+    hpcrun_abort("hpctoolkit: unable to open file: '%s'", file_name);
+    return false;
+  }
+
+  // write() reports errors as -1, so test the sign before comparing sizes
+  ssize_t written = write(fd, binary, binary_size);
+  bool complete = (written >= 0) && ((size_t)written == binary_size);
+  close(fd);
+  return complete;
+}
+
+// Build "<output dir>/intel/<hex hash>.gpubin" in file_name, calling mkdir on the "<output dir>/intel/" prefix along the way.
+void
+computeBinaryHash
+(
+ const char *binary,
+ size_t binary_size,
+ char *file_name  // NOTE(review): filled with unbounded sprintf; the caller must pass a buffer large enough for the whole path
+)
+{
+  // Compute hash for the binary
+  unsigned char hash[HASH_LENGTH];
+  crypto_hash_compute((const unsigned char *)binary, binary_size, hash, HASH_LENGTH);
+
+  size_t i;
+  size_t used = 0;  // running write position inside file_name
+  used += sprintf(&file_name[used], "%s", hpcrun_files_output_directory());
+  used += sprintf(&file_name[used], "%s", "/intel/");
+  mkdir(file_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);  // return value is ignored
+  for (i = 0; i < HASH_LENGTH; ++i) {
+    used += sprintf(&file_name[used], "%02x", hash[i]);  // two lowercase hex digits per hash byte
+  }
+  used += sprintf(&file_name[used], "%s", ".gpubin");
+}
+
+// Save the kernel's ELF under a hash-derived file name and return the load-map id for "<file>.<kernel name>".
+static uint32_t
+findOrAddKernelModule
+(
+ GTPinKernel kernel
+)
+{
+  char kernel_name[MAX_STR_SIZE];
+  GTPINTOOL_STATUS status;
+
+  status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+  // Query the ELF size first, then fetch the image into a buffer of that size
+  uint32_t kernel_elf_size = 0;
+  status = GTPin_GetElf(kernel, 0, NULL, &kernel_elf_size);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+  char *kernel_elf = (char *)malloc(sizeof(char) * kernel_elf_size);
+  if (kernel_elf == NULL && kernel_elf_size != 0) {
+    hpcrun_abort("hpctoolkit: unable to allocate memory for a kernel binary");
+  }
+  status = GTPin_GetElf(kernel, kernel_elf_size, kernel_elf, NULL);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+  // Create file name
+  char file_name[PATH_MAX];
+  memset(file_name, 0, PATH_MAX);
+  computeBinaryHash(kernel_elf, kernel_elf_size, file_name);
+
+  // Write a file if does not exist
+  spinlock_lock(&files_lock);
+  writeBinary(file_name, kernel_elf, kernel_elf_size);
+  spinlock_unlock(&files_lock);
+
+  free(kernel_elf);
+
+  // strncat's count caps the bytes appended, so pass the room left in
+  // file_name (less the terminator) rather than the length of the source.
+  strncat(file_name, ".", sizeof(file_name) - strlen(file_name) - 1);
+  strncat(file_name, kernel_name, sizeof(file_name) - strlen(file_name) - 1);
+
+  // Reuse the load-map entry with this name if there is one, else add it
+  hpcrun_loadmap_lock();
+  load_module_t *module = hpcrun_loadmap_findByName(file_name);
+  uint32_t module_id = (module != NULL) ? module->id : hpcrun_loadModule_add(file_name);
+  hpcrun_loadmap_unlock();
+
+  return module_id;
+}
+
+// Thin wrapper: calls gpu_monitoring_thread_activities_ready() and nothing else.
+static void
+activityNotify
+(
+ void
+)
+{
+  gpu_monitoring_thread_activities_ready();
+}
+
+// Fill *ga as a GPU_ACTIVITY_KERNEL_BLOCK record from the given ids, offset and execution count.
+static void
+kernelBlockActivityTranslate
+(
+ gpu_activity_t *ga,
+ uint64_t correlation_id,
+ uint32_t loadmap_module_id,
+ uint64_t offset,
+ uint64_t execution_count
+)
+{
+  memset(&ga->details.kernel_block, 0, sizeof(gpu_kernel_block_t));  // zeroes only the kernel_block details, not the whole record
+  ga->details.kernel_block.external_id = correlation_id;
+  ga->details.kernel_block.pc.lm_id = (uint16_t)loadmap_module_id;  // NOTE: narrows the 32-bit module id to 16 bits
+  ga->details.kernel_block.pc.lm_ip = (uintptr_t)offset;
+  ga->details.kernel_block.execution_count = execution_count;
+  ga->kind = GPU_ACTIVITY_KERNEL_BLOCK;
+
+  cstack_ptr_set(&(ga->next), 0);
+}
+
+// Build a kernel-block activity, insert its pc under host_op_node, and hand the activity to activity_channel.
+static void
+kernelBlockActivityProcess
+(
+ uint64_t correlation_id,
+ uint32_t loadmap_module_id,
+ uint64_t offset,
+ uint64_t execution_count,
+ gpu_activity_channel_t *activity_channel,
+ cct_node_t *host_op_node
+)
+{
+  gpu_activity_t activity;
+  kernelBlockActivityTranslate(&activity, correlation_id, loadmap_module_id, offset, execution_count);
+
+  cct_node_t *inst_node =
+    hpcrun_cct_insert_ip_norm(host_op_node, activity.details.kernel_block.pc);
+  if (inst_node == NULL) return;  // no CCT node was produced, so nothing is emitted
+
+  activity.cct_node = inst_node;
+  gpu_activity_channel_produce(activity_channel, &activity);
+}
+
+// GTPin kernel-build callback: instrument each basic block of `kernel` and store block/instruction data in kernel_data_map.
+static void
+onKernelBuild
+(
+ GTPinKernel kernel,
+ void *v
+)
+{
+  GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+
+  assert(kernel_data_map_lookup((uint64_t)kernel) == 0);
+
+  kernel_data_t kernel_data;
+  kernel_data.loadmap_module_id = findOrAddKernelModule(kernel);
+  kernel_data.kind = KERNEL_DATA_GTPIN;
+  kernel_data_gtpin_block_t *gtpin_block_head = NULL;
+  kernel_data_gtpin_block_t *gtpin_block_curr = NULL;
+
+  for (GTPinBBL block = GTPin_BBLHead(kernel); GTPin_BBLValid(block); block = GTPin_BBLNext(block)) {
+    GTPinINS head = GTPin_InsHead(block);
+    GTPinINS tail = GTPin_InsTail(block);
+    assert(GTPin_InsValid(head));
+    int32_t head_offset = GTPin_InsOffset(head);
+    int32_t tail_offset = GTPin_InsOffset(tail);
+
+    GTPinMem mem = NULL;
+    status = GTPin_MemClaim(kernel, sizeof(uint32_t), &mem);
+    assert(status == GTPINTOOL_STATUS_SUCCESS);
+    status = GTPin_OpcodeprofInstrument(head, mem);
+    assert(status == GTPINTOOL_STATUS_SUCCESS);
+
+    kernel_data_gtpin_block_t *gtpin_block = (kernel_data_gtpin_block_t *)hpcrun_malloc(sizeof(kernel_data_gtpin_block_t));
+    gtpin_block->head_offset = head_offset;
+    gtpin_block->tail_offset = tail_offset;
+    gtpin_block->mem = mem;
+    gtpin_block->next = NULL;
+    gtpin_block->inst = NULL;  // onKernelComplete walks this list until NULL, so it must start terminated
+    if (gtpin_block_head == NULL) {
+      gtpin_block_head = gtpin_block;
+    } else {
+      gtpin_block_curr->next = gtpin_block;
+    }
+    gtpin_block_curr = gtpin_block;
+
+    // while loop that iterates for each instruction in the block and adds an offset entry in map
+    int32_t offset = head_offset;
+    GTPinINS inst = GTPin_InsHead(block);
+    kernel_data_gtpin_inst_t *gtpin_inst_curr = NULL;
+    while (offset <= tail_offset && offset != -1) {
+      kernel_data_gtpin_inst_t *gtpin_inst = (kernel_data_gtpin_inst_t *)hpcrun_malloc(sizeof(kernel_data_gtpin_inst_t));
+      gtpin_inst->offset = offset;
+      gtpin_inst->next = NULL;  // terminate the list here; hpcrun_malloc is not shown to zero memory
+      if (gtpin_inst_curr == NULL) {
+        gtpin_block_curr->inst = gtpin_inst;
+      } else {
+        gtpin_inst_curr->next = gtpin_inst;
+      }
+      gtpin_inst_curr = gtpin_inst;
+      inst = GTPin_InsNext(inst);
+      offset = GTPin_InsOffset(inst);
+    }
+  }
+
+  if (gtpin_block_head != NULL) {
+    kernel_data_gtpin_t *kernel_data_gtpin = (kernel_data_gtpin_t *)hpcrun_malloc(sizeof(kernel_data_gtpin_t));
+    kernel_data_gtpin->kernel_id = (uint64_t)kernel;
+    kernel_data_gtpin->block = gtpin_block_head;
+    kernel_data.data = kernel_data_gtpin;
+    kernel_data_map_insert((uint64_t)kernel, kernel_data);
+  }
+
+  // add these details to cct_node. If thats not needed, we can create the kernel_cct in onKernelComplete
+  ETMSG(OPENCL, "onKernelBuild complete. Inserted key: %"PRIu64 "",(uint64_t)kernel);
+}
+
+// GTPin kernel-run callback: turn profiling on for this execution and call createKernelNode with its id.
+static void
+onKernelRun
+(
+ GTPinKernelExec kernelExec,
+ void *v
+)
+{
+  ETMSG(OPENCL, "onKernelRun starting. Inserted: correlation %"PRIu64"", (uint64_t)kernelExec);
+
+  GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+  GTPin_KernelProfilingActive(kernelExec, 1);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);  // NOTE(review): status is never reassigned, so this cannot fail
+
+  createKernelNode((uint64_t)kernelExec);  // the execution handle value doubles as the correlation id
+}
+
+// GTPin kernel-complete callback: sum each block's per-thread counters and emit one activity per recorded instruction.
+static void
+onKernelComplete
+(
+ GTPinKernelExec kernelExec,
+ void *v
+)
+{
+  // Receive correlations from the host thread.
+  // XXX(Keren): This is done usually at the monitor thread, but not guaranteed.
+  // For safety concern, we need to adopt the multiplexer framework.
+  //activityNotify();
+
+  uint64_t correlation_id = (uint64_t)kernelExec;
+
+  gtpin_correlation_id_map_entry_t *entry =
+    gtpin_correlation_id_map_lookup(correlation_id);
+
+  ETMSG(OPENCL, "onKernelComplete starting. Lookup: correlation %"PRIu64", result %p", correlation_id, entry);
+
+  if (entry == NULL) {
+    // XXX(Keren): the opencl/level zero api's kernel launch is not wrapped
+    return;
+  }
+
+  gpu_activity_channel_t *activity_channel = gtpin_correlation_id_map_entry_activity_channel_get(entry);
+  gpu_op_ccts_t gpu_op_ccts = gtpin_correlation_id_map_entry_op_ccts_get(entry);
+  cct_node_t *host_op_node = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_kernel);
+
+  GTPINTOOL_STATUS status = GTPINTOOL_STATUS_SUCCESS;
+  GTPinKernel kernel = GTPin_KernelExec_GetKernel(kernelExec);
+  ETMSG(OPENCL, "onKernelComplete starting. Lookup: kernel: %"PRIu64"", (uint64_t)kernel);
+  assert(kernel_data_map_lookup((uint64_t)kernel) != 0);
+
+  kernel_data_map_entry_t *kernel_data_map_entry = kernel_data_map_lookup((uint64_t)kernel);
+  assert(kernel_data_map_entry != NULL);
+
+  kernel_data_t kernel_data = kernel_data_map_entry_kernel_data_get(kernel_data_map_entry);
+  assert(kernel_data.kind == KERNEL_DATA_GTPIN);
+
+  kernel_data_gtpin_t *kernel_data_gtpin = (kernel_data_gtpin_t *)kernel_data.data;
+  kernel_data_gtpin_block_t *block = kernel_data_gtpin->block;
+
+  while (block != NULL) {
+    uint32_t thread_count = GTPin_MemSampleLength(block->mem);
+    assert(thread_count > 0);
+
+    uint64_t execution_count = 0;  // 64-bit accumulator: the sum of 32-bit counters must not wrap
+    uint32_t value = 0;
+    for (uint32_t tid = 0; tid < thread_count; ++tid) {
+      status = GTPin_MemRead(block->mem, tid, sizeof(uint32_t), (char*)(&value), NULL);
+      assert(status == GTPINTOOL_STATUS_SUCCESS);
+      execution_count += value;
+    }
+
+    kernel_data_gtpin_inst_t *inst = block->inst;
+    while (inst != NULL) {
+      kernelBlockActivityProcess(correlation_id, kernel_data.loadmap_module_id,
+        inst->offset, execution_count, activity_channel, host_op_node);
+      inst = inst->next;
+    }
+    block = block->next;
+    //how to make offset the primary key within the cct and += the execution value for existing ccts?
+  }
+}
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Initialize GTPin instrumentation, register the kernel build/run/complete callbacks, and start GTPin.
+void
+gtpin_enable_profiling
+(
+ void
+)
+{
+  ETMSG(OPENCL, "inside enableProfiling");
+  initializeInstrumentation();
+  knobAddBool("silent_warnings", true);
+  // The disabled snippet below uses C++ syntax (utils::GetEnv, nullptr)
+#if 0
+  if (utils::GetEnv("PTI_GEN12") != nullptr) {
+    KnobAddBool("gen12_1", true);
+  }
+#endif
+
+  gpu_metrics_GPU_INST_enable();
+
+  // Use opencl/level zero runtime stack
+  gtpin_use_runtime_callstack = true;
+
+  GTPin_OnKernelBuild(onKernelBuild, NULL);
+  GTPin_OnKernelRun(onKernelRun, NULL);
+  GTPin_OnKernelComplete(onKernelComplete, NULL);
+
+  GTPIN_Start();  // called only after all three callbacks are registered
+}
+
+// If gtpin_use_runtime_callstack is set, insert gpu_op_ccts and the current activity channel under gtpin_correlation_id.
+void
+gtpin_produce_runtime_callstack
+(
+ gpu_op_ccts_t *gpu_op_ccts
+)
+{
+  if (gtpin_use_runtime_callstack) {
+    gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+    gtpin_correlation_id_map_insert(gtpin_correlation_id, gpu_op_ccts, activity_channel, gtpin_cpu_submit_time);
+  }
+}
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
new file mode 100644
index 0000000000..1efd824d78
--- /dev/null
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.h
@@ -0,0 +1,71 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef gpu_instrumentation_gtpin_instrumentation_h
+#define gpu_instrumentation_gtpin_instrumentation_h
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+typedef struct gpu_op_ccts_t gpu_op_ccts_t;
+// Set up GTPin instrumentation: registers the kernel build/run/complete callbacks and starts GTPin.
+void
+gtpin_enable_profiling
+(
+ void
+);
+
+// Record the given GPU op CCTs for the current GTPin correlation id; does nothing unless gtpin_enable_profiling set the flag.
+void
+gtpin_produce_runtime_callstack
+(
+ gpu_op_ccts_t *
+);
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 5d833a509b..253edf542f 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -58,7 +58,6 @@
 
 #include "opencl-activity-translate.h"
 #include "opencl-api.h"
-#include "opencl-intercept.h"
 
 
 
@@ -75,14 +74,14 @@ convert_kernel_launch
 )
 {
   memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
-  getTimingInfoFromClEvent(&ga->details.interval, event);
+
+  opencl_timing_info_get(&ga->details.interval, event);
 
   ga->kind     = cb_data->kind;
   ga->cct_node = cb_data->details.cct_node;
 
   ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
   ga->details.kernel.submit_time    = cb_data->details.submit_time;
-
 }
 
 
@@ -95,7 +94,8 @@ convert_memcpy
 )
 {
   memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
-  getTimingInfoFromClEvent(&ga->details.interval, event);
+
+  opencl_timing_info_get(&ga->details.interval, event);
 
   ga->kind     = cb_data->kind;
   ga->cct_node = cb_data->details.cct_node;
@@ -134,3 +134,22 @@ opencl_activity_translate
   }
   cstack_ptr_set(&(ga->next), 0);
 }
+// Fill *ga as a host-to-device GPU_ACTIVITY_MEMCPY record for a clSetKernelArg transfer.
+void
+opencl_clSetKernelArg_activity_translate
+(
+	gpu_activity_t *ga,
+	uint64_t correlation_id,
+	size_t size,
+	uint64_t start_time,
+	uint64_t end_time
+)
+{
+  memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));  // as convert_memcpy does: unset fields must not hold garbage
+  ga->details.memcpy.correlation_id = correlation_id;
+  ga->details.memcpy.bytes = size;
+  ga->details.memcpy.copyKind = GPU_MEMCPY_H2D;
+  ga->kind = GPU_ACTIVITY_MEMCPY;
+  set_gpu_interval(&ga->details.interval, start_time, end_time);
+  cstack_ptr_set(&(ga->next), 0);
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index 824b2920f9..c52f240279 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -72,4 +72,13 @@ opencl_activity_translate
 );
 
 
-#endif
+void
+opencl_clSetKernelArg_activity_translate  // fills *ga as an H2D memcpy activity for a clSetKernelArg transfer
+(
+	gpu_activity_t *,  // ga: activity record to fill
+	uint64_t,          // correlation_id
+	size_t,            // size: stored as the memcpy byte count
+	uint64_t,          // start_time of the interval
+	uint64_t           // end_time of the interval
+);
+#endif  //_OPENCL_ACTIVITY_TRANSLATE_H_
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 80167b66e5..79c0f35973 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -47,8 +47,11 @@
 
 #include <assert.h>
 #include <inttypes.h>
-
-
+#include <errno.h>     // errno
+#include <fcntl.h>     // open
+#include <sys/stat.h>  // mkdir
+#include <sys/types.h>
+#include <unistd.h>
 
 //******************************************************************************
 // local includes
@@ -65,15 +68,20 @@
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-monitoring-thread-api.h>
 #include <hpcrun/gpu/gpu-op-placeholders.h>
+#include <hpcrun/gpu/instrumentation/gtpin-instrumentation.h>
 #include <hpcrun/messages/messages.h>
 #include <hpcrun/sample-sources/libdl.h>
+#include <hpcrun/files.h>
+#include <hpcrun/utilities/hpcrun-nanotime.h>
 #include <lib/prof-lean/hpcrun-opencl.h>
+#include <lib/prof-lean/splay-uint64.h>
 #include <lib/prof-lean/stdatomic.h>
 #include <lib/prof-lean/usec_time.h>
 
 #include "opencl-api.h"
 #include "opencl-activity-translate.h"
 #include "opencl-memory-manager.h"
+#include "opencl-h2d-map.h"
 
 
 
@@ -84,65 +92,65 @@
 
 #define CPU_NANOTIME() (usec_time() * 1000)
 
-#define FORALL_OPENCL_ERRORS(macro)					\
-  macro(CL_SUCCESS)							\
-  macro(CL_DEVICE_NOT_FOUND)						\
-  macro(CL_DEVICE_NOT_AVAILABLE)					\
-  macro(CL_COMPILER_NOT_AVAILABLE)					\
-  macro(CL_MEM_OBJECT_ALLOCATION_FAILURE)				\
-  macro(CL_OUT_OF_RESOURCES)						\
-  macro(CL_OUT_OF_HOST_MEMORY)						\
-  macro(CL_PROFILING_INFO_NOT_AVAILABLE)				\
-  macro(CL_MEM_COPY_OVERLAP)						\
-  macro(CL_IMAGE_FORMAT_MISMATCH)					\
-  macro(CL_IMAGE_FORMAT_NOT_SUPPORTED)					\
-  macro(CL_BUILD_PROGRAM_FAILURE)					\
-  macro(CL_MAP_FAILURE)							\
-  macro(CL_MISALIGNED_SUB_BUFFER_OFFSET)				\
-  macro(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)			\
-  macro(CL_COMPILE_PROGRAM_FAILURE)					\
-  macro(CL_LINKER_NOT_AVAILABLE)					\
-  macro(CL_LINK_PROGRAM_FAILURE)					\
-  macro(CL_DEVICE_PARTITION_FAILED)					\
-  macro(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)				\
-  macro(CL_INVALID_VALUE)						\
-  macro(CL_INVALID_DEVICE_TYPE)						\
-  macro(CL_INVALID_PLATFORM)						\
-  macro(CL_INVALID_DEVICE)						\
-  macro(CL_INVALID_CONTEXT)						\
-  macro(CL_INVALID_QUEUE_PROPERTIES)					\
-  macro(CL_INVALID_COMMAND_QUEUE)					\
-  macro(CL_INVALID_HOST_PTR)						\
-  macro(CL_INVALID_MEM_OBJECT)						\
-  macro(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)				\
-  macro(CL_INVALID_IMAGE_SIZE)						\
-  macro(CL_INVALID_SAMPLER)						\
-  macro(CL_INVALID_BINARY)						\
-  macro(CL_INVALID_BUILD_OPTIONS)					\
-  macro(CL_INVALID_PROGRAM)						\
-  macro(CL_INVALID_PROGRAM_EXECUTABLE)					\
-  macro(CL_INVALID_KERNEL_NAME)						\
-  macro(CL_INVALID_KERNEL_DEFINITION)					\
-  macro(CL_INVALID_KERNEL)						\
-  macro(CL_INVALID_ARG_INDEX)						\
-  macro(CL_INVALID_ARG_VALUE)						\
-  macro(CL_INVALID_ARG_SIZE)						\
-  macro(CL_INVALID_KERNEL_ARGS)						\
-  macro(CL_INVALID_WORK_DIMENSION)					\
-  macro(CL_INVALID_WORK_GROUP_SIZE)					\
-  macro(CL_INVALID_WORK_ITEM_SIZE)					\
-  macro(CL_INVALID_GLOBAL_OFFSET)					\
-  macro(CL_INVALID_EVENT_WAIT_LIST)					\
-  macro(CL_INVALID_EVENT)						\
-  macro(CL_INVALID_OPERATION)						\
-  macro(CL_INVALID_GL_OBJECT)						\
-  macro(CL_INVALID_BUFFER_SIZE)						\
-  macro(CL_INVALID_MIP_LEVEL)						\
-  macro(CL_INVALID_GLOBAL_WORK_SIZE)					\
-  macro(CL_INVALID_PROPERTY)						\
-  macro(CL_INVALID_IMAGE_DESCRIPTOR)					\
-  macro(CL_INVALID_COMPILER_OPTIONS)					\
-  macro(CL_INVALID_LINKER_OPTIONS)					\
+#define FORALL_OPENCL_ERRORS(macro)          \
+  macro(CL_SUCCESS)              \
+  macro(CL_DEVICE_NOT_FOUND)            \
+  macro(CL_DEVICE_NOT_AVAILABLE)          \
+  macro(CL_COMPILER_NOT_AVAILABLE)          \
+  macro(CL_MEM_OBJECT_ALLOCATION_FAILURE)        \
+  macro(CL_OUT_OF_RESOURCES)            \
+  macro(CL_OUT_OF_HOST_MEMORY)            \
+  macro(CL_PROFILING_INFO_NOT_AVAILABLE)        \
+  macro(CL_MEM_COPY_OVERLAP)            \
+  macro(CL_IMAGE_FORMAT_MISMATCH)          \
+  macro(CL_IMAGE_FORMAT_NOT_SUPPORTED)          \
+  macro(CL_BUILD_PROGRAM_FAILURE)          \
+  macro(CL_MAP_FAILURE)              \
+  macro(CL_MISALIGNED_SUB_BUFFER_OFFSET)        \
+  macro(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)      \
+  macro(CL_COMPILE_PROGRAM_FAILURE)          \
+  macro(CL_LINKER_NOT_AVAILABLE)          \
+  macro(CL_LINK_PROGRAM_FAILURE)          \
+  macro(CL_DEVICE_PARTITION_FAILED)          \
+  macro(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)        \
+  macro(CL_INVALID_VALUE)            \
+  macro(CL_INVALID_DEVICE_TYPE)            \
+  macro(CL_INVALID_PLATFORM)            \
+  macro(CL_INVALID_DEVICE)            \
+  macro(CL_INVALID_CONTEXT)            \
+  macro(CL_INVALID_QUEUE_PROPERTIES)          \
+  macro(CL_INVALID_COMMAND_QUEUE)          \
+  macro(CL_INVALID_HOST_PTR)            \
+  macro(CL_INVALID_MEM_OBJECT)            \
+  macro(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)        \
+  macro(CL_INVALID_IMAGE_SIZE)            \
+  macro(CL_INVALID_SAMPLER)            \
+  macro(CL_INVALID_BINARY)            \
+  macro(CL_INVALID_BUILD_OPTIONS)          \
+  macro(CL_INVALID_PROGRAM)            \
+  macro(CL_INVALID_PROGRAM_EXECUTABLE)          \
+  macro(CL_INVALID_KERNEL_NAME)            \
+  macro(CL_INVALID_KERNEL_DEFINITION)          \
+  macro(CL_INVALID_KERNEL)            \
+  macro(CL_INVALID_ARG_INDEX)            \
+  macro(CL_INVALID_ARG_VALUE)            \
+  macro(CL_INVALID_ARG_SIZE)            \
+  macro(CL_INVALID_KERNEL_ARGS)            \
+  macro(CL_INVALID_WORK_DIMENSION)          \
+  macro(CL_INVALID_WORK_GROUP_SIZE)          \
+  macro(CL_INVALID_WORK_ITEM_SIZE)          \
+  macro(CL_INVALID_GLOBAL_OFFSET)          \
+  macro(CL_INVALID_EVENT_WAIT_LIST)          \
+  macro(CL_INVALID_EVENT)            \
+  macro(CL_INVALID_OPERATION)            \
+  macro(CL_INVALID_GL_OBJECT)            \
+  macro(CL_INVALID_BUFFER_SIZE)            \
+  macro(CL_INVALID_MIP_LEVEL)            \
+  macro(CL_INVALID_GLOBAL_WORK_SIZE)          \
+  macro(CL_INVALID_PROPERTY)            \
+  macro(CL_INVALID_IMAGE_DESCRIPTOR)          \
+  macro(CL_INVALID_COMPILER_OPTIONS)          \
+  macro(CL_INVALID_LINKER_OPTIONS)          \
   macro(CL_INVALID_DEVICE_PARTITION_COUNT)
 
 
@@ -150,27 +158,39 @@
 
 #define opencl_path() "libOpenCL.so"
 
-#define FORALL_OPENCL_ROUTINES(macro)					\
-  macro(clGetEventProfilingInfo)					\
-  macro(clReleaseEvent)							\
+#define FORALL_OPENCL_ROUTINES(macro)  \
+  macro(clBuildProgram)  \
+  macro(clCreateProgramWithSource)  \
+  macro(clCreateCommandQueue)  \
+  macro(clCreateCommandQueueWithProperties)  \
+  macro(clEnqueueNDRangeKernel)  \
+  macro(clEnqueueReadBuffer)  \
+  macro(clEnqueueWriteBuffer)  \
+  macro(clCreateBuffer)  \
+  macro(clSetKernelArg)  \
+  macro(clGetEventProfilingInfo)  \
+  macro(clReleaseEvent)  \
   macro(clSetEventCallback)
 
 #define OPENCL_FN_NAME(f) DYN_FN_NAME(f)
 
-#define OPENCL_FN(fn, args)			\
+#define OPENCL_FN(fn, args)      \
   static cl_int (*OPENCL_FN_NAME(fn)) args
 
-#define HPCRUN_OPENCL_CALL(fn, args)					\
-  {									\
-    cl_int status = OPENCL_FN_NAME(fn) args;				\
-    if (status != CL_SUCCESS) {						\
-      ETMSG(OPENCL, "opencl call failed: %s",				\
-	    opencl_error_report(status));				\
-    }									\
-  }
+#define OPENCL_PROGRAM_FN(fn, args)      \
+  static cl_program (*OPENCL_FN_NAME(fn)) args
+
+#define OPENCL_QUEUE_FN(fn, args)      \
+  static cl_command_queue (*OPENCL_FN_NAME(fn)) args
+
+#define OPENCL_CREATEBUFFER_FN(fn, args)      \
+  static cl_mem (*OPENCL_FN_NAME(fn)) args
 
+#define HPCRUN_OPENCL_CALL(fn, args) (OPENCL_FN_NAME(fn) args)
 
+#define LINE_TABLE_FLAG " -gline-tables-only "
 
+#define CORRELATION_ID_INVALID (-1)  // marks a record with no correlation id yet; parenthesized so it expands safely
 //******************************************************************************
 // local data
 //******************************************************************************
@@ -181,15 +201,142 @@ static atomic_long correlation_id_counter;
 // opencl function pointers for late binding
 //----------------------------------------------------------
 
+OPENCL_FN
+(
+  clBuildProgram, 
+  (
+   cl_program program,
+   cl_uint num_devices,
+   const cl_device_id* device_list,
+   const char* options,
+   void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+   void* user_data
+  )
+);
+
+
+OPENCL_PROGRAM_FN
+(
+  clCreateProgramWithSource, 
+  (
+   cl_context context,
+   cl_uint count,
+   const char** strings,
+   const size_t* lengths,
+   cl_int* errcode_ret
+  )
+);
+
+
+OPENCL_QUEUE_FN
+(
+  clCreateCommandQueue, 
+  (
+   cl_context,
+   cl_device_id,
+   cl_command_queue_properties,
+   cl_int*
+  )
+);
+
+
+OPENCL_QUEUE_FN
+(
+  clCreateCommandQueueWithProperties, 
+  (
+   cl_context,
+   cl_device_id,
+   const cl_queue_properties *,
+   cl_int*
+  )
+);
+
+
+OPENCL_FN
+(
+  clEnqueueNDRangeKernel, 
+  (
+   cl_command_queue,
+   cl_kernel,
+   cl_uint,
+   const size_t *, 
+   const size_t *,
+   const size_t *,
+   cl_uint,
+   const cl_event *,
+   cl_event *
+  )
+);
+
+
+OPENCL_FN
+(
+  clEnqueueReadBuffer, 
+  (
+   cl_command_queue,
+   cl_mem,
+   cl_bool,
+   size_t,
+   size_t,
+   void *,
+   cl_uint,
+   const cl_event *,
+   cl_event *
+  )
+);
+
+
+OPENCL_FN
+(
+  clEnqueueWriteBuffer, 
+  (
+   cl_command_queue,
+   cl_mem,
+   cl_bool,
+   size_t,
+   size_t,
+   const void *,
+   cl_uint,
+   const cl_event *,
+   cl_event *
+  )
+);
+
+
+OPENCL_CREATEBUFFER_FN
+(
+  clCreateBuffer,
+  (
+    cl_context,
+    cl_mem_flags,
+    size_t,
+    void *,
+    cl_int *
+  )
+);
+
+
+OPENCL_FN
+(
+  clSetKernelArg,
+  (
+    cl_kernel kernel,
+    cl_uint arg_index,
+    size_t arg_size,
+    const void* arg_value
+  )
+);
+
+
 OPENCL_FN
 (
   clGetEventProfilingInfo,
   (
-    cl_event event,
-    cl_profiling_info param_name,
-    size_t param_value_size,
-    void *param_value,
-    size_t *param_value_size_ret
+   cl_event event,
+   cl_profiling_info param_name,
+   size_t param_value_size,
+   void *param_value,
+   size_t *param_value_size_ret
   )
 );
 
@@ -198,7 +345,7 @@ OPENCL_FN
 (
   clReleaseEvent, 
   (
-    cl_event event
+   cl_event event
   )
 );
 
@@ -207,17 +354,21 @@ OPENCL_FN
 (
   clSetEventCallback,
   (
-    cl_event event,
-    cl_int command_exec_callback_type,
-    void (CL_CALLBACK *pfn_notify)
-    (cl_event event, cl_int event_command_status, void *user_data),
-    void *user_data
+   cl_event event,
+   cl_int command_exec_callback_type,
+   void (CL_CALLBACK *pfn_notify)
+   (cl_event event, cl_int event_command_status, void *user_data),
+   void *user_data
   )
 );
 
 
-
 static atomic_ullong opencl_pending_operations;
+static atomic_ullong opencl_h2d_pending_operations;
+static bool instrumentation = false;
+
+#define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
+#define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
 
 
@@ -228,13 +379,74 @@ static atomic_ullong opencl_pending_operations;
 static uint64_t
 getCorrelationId
 (
-void
+ void
 )
 {
   return atomic_fetch_add(&correlation_id_counter, 1);
 }
 
 
+static void
+initializeKernelCallBackInfo  // tag *ker_info as a kernel activity carrying correlation_id
+(
+ opencl_object_t *ker_info,  // callback record to fill in
+ uint64_t correlation_id     // stored in the record's ker_cb details
+)
+{
+  ker_info->kind = GPU_ACTIVITY_KERNEL;
+  ker_info->details.ker_cb.correlation_id = correlation_id;
+}
+
+// Fill *mem_info as a GPU_ACTIVITY_MEMCPY callback record with the given copy type, size and correlation id.
+static void
+initializeMemoryCallBackInfo
+(
+opencl_object_t *mem_info,
+gpu_memcpy_type_t type,
+size_t size,
+uint64_t correlation_id
+)
+{
+  mem_info->kind = GPU_ACTIVITY_MEMCPY;
+  mem_info->details.mem_cb.type = type;
+  mem_info->details.mem_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);  // direction flags are derived from type
+  mem_info->details.mem_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
+  mem_info->details.mem_cb.size = size;
+
+  mem_info->details.mem_cb.correlation_id = correlation_id;
+}
+
+// Thin wrapper that forwards to gpu_monitoring_thread_activities_ready().
+static void opencl_activity_completion_notify
+(
+ void
+)
+{
+  gpu_monitoring_thread_activities_ready();
+}
+
+
+// Intended to dump program debug info (the binary has no debug section); the body is currently an empty placeholder.
+static void
+clBuildProgramCallback
+(
+ cl_program program,
+ void* user_data
+)
+{
+}
+
+// Atomically add `value` to opencl_h2d_pending_operations (callers in this file pass -1 to decrement).
+static void
+opencl_h2d_pending_operations_adjust
+(
+ int value
+)
+{
+  atomic_fetch_add(&opencl_h2d_pending_operations, value);
+}
+
+
 static void
 opencl_pending_operations_adjust
 (
@@ -259,32 +471,119 @@ opencl_activity_process
     gpu_activity_multiplexer_my_channel_init();
   }
   gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
-//  gpu_activity_process(&gpu_activity);
 }
 
 
 static void
-opencl_wait_for_pending_operations
+opencl_clSetKernelArg_activity_process
+(
+ uint64_t correlation_id,
+ opencl_h2d_map_entry_t *entry
+)
+{
+  gpu_activity_t gpu_activity;
+	size_t size = opencl_h2d_map_entry_size_get(entry); 
+	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
+	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
+  opencl_clSetKernelArg_activity_translate(&gpu_activity, correlation_id, size, start_time, end_time);
+  gpu_activity_process(&gpu_activity);
+}
+
+// Use the cl_mem handle that `arg` points to, converted to an integer, as the buffer's id.
+static uint64_t
+opencl_get_buffer_id
+(
+  const void *arg
+)
+{
+  const cl_mem *handle = (const cl_mem *)arg;
+  return (uint64_t)(*handle);
+}
+
+// Report whether the cl_mem that `arg` points to currently has an entry in the H2D map.
+static bool
+opencl_isClArgBuffer
+(
+  const void *arg
+)
+{
+  // The lookup misses, and false is returned, in two situations:
+  //  1. clCreateBuffer was not called for arg before calling clSetKernelArg
+  //  2. clEnqueueWriteBuffer is being called for arg; that H2D copy must not
+  //     be recorded a second time
+  const uint64_t id = opencl_get_buffer_id(arg);
+
+  if (opencl_h2d_map_lookup(id) == NULL) {
+    return false;
+  }
+  return true;
+}
+
+// Visitor handed to opencl_update_ccts_for_h2d_nodes: emits the H2D activity for one map entry, then decrements both pending counters.
+static void
+add_H2D_metrics_to_cct_node
+(
+	opencl_h2d_map_entry_t *entry,
+	splay_visit_t visit_type,  // not used in the body
+	void *arg                  // not used in the body
+)
+{
+	uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
+	gpu_correlation_id_map_entry_t *cid_map_entry = 
+		gpu_correlation_id_map_lookup(correlation_id);
+	if (cid_map_entry == NULL) {
+		ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
+		return;  // NOTE(review): this path leaves both pending counters untouched; confirm that is intended
+	}
+	ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", 
+			"memcpy_H2D", correlation_id);
+
+	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
+	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
+	ETMSG(OPENCL, "duration [%"PRIu64", %"PRIu64"]",start_time, end_time); 
+	opencl_activity_completion_notify();
+	opencl_clSetKernelArg_activity_process(correlation_id, entry);
+	uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);  // only referenced by the disabled delete below
+	//opencl_h2d_map_delete(buffer_id);
+  opencl_h2d_pending_operations_adjust(-1);
+  opencl_pending_operations_adjust(-1);
+}
+
+// When any H2D operation is pending, run add_H2D_metrics_to_cct_node through opencl_update_ccts_for_h2d_nodes.
+static void
+opencl_add_ccts_for_setClKernelArg
+(
+	void
+)
+{
+  uint64_t count = opencl_h2d_map_count();  // NOTE(review): the value is never used below
+	if (atomic_load(&opencl_h2d_pending_operations) > 0) {
+		opencl_update_ccts_for_h2d_nodes(add_H2D_metrics_to_cct_node);
+	}
+}
+
+
+static void
+opencl_wait_for_non_clSetKernelArg_pending_operations
 (
   void
 )
 {
-  ETMSG(OPENCL, "pending operations: %lu", 
-	atomic_load(&opencl_pending_operations));
-  while (atomic_load(&opencl_pending_operations) != 0);
+  ETMSG(OPENCL, "pending h2D operations: %lu", 
+	  atomic_load(&opencl_h2d_pending_operations));
+  while (atomic_load(&opencl_pending_operations) != atomic_load(&opencl_h2d_pending_operations));
 }
 
 
-static const char*
-opencl_error_report
+static void
+opencl_wait_for_all_pending_operations
 (
-  cl_int error_status
+  void
 )
 {
-  switch(error_status) {
-    FORALL_OPENCL_ERRORS(CODE_TO_STRING)
-    default: return "Unknown OpenCL error";
-  }
+  ETMSG(OPENCL, "pending operations: %lu", 
+  atomic_load(&opencl_pending_operations));
+  while (atomic_load(&opencl_pending_operations) != 0);
 }
 
 
@@ -353,6 +652,16 @@ opencl_initialize_correlation_id
   atomic_store(&correlation_id_counter, 0);
 }
 
+static uint64_t
+get_corr_id  // returns the correlation id stored in cb_info's ker_cb details
+(
+ opencl_object_t *cb_info
+){
+  return cb_info->details.ker_cb.correlation_id;  // NOTE(review): opencl_subscriber_callback calls this before checking kind; confirm mem_cb shares this layout
+}
+
+
+
 void
 opencl_subscriber_callback
 (
@@ -361,11 +670,22 @@ opencl_subscriber_callback
 {
 
   gpu_placeholder_type_t placeholder_type;
-  uint64_t correlation_id = getCorrelationId();
+  uint64_t correlation_id;
+
+  if( get_corr_id(cb_info) == CORRELATION_ID_INVALID){
+    correlation_id = getCorrelationId();
+  }else{
+    correlation_id = get_corr_id(cb_info);
+  }
+
 
   opencl_pending_operations_adjust(1);
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
 
+
+  gpu_correlation_id_map_insert(correlation_id, correlation_id);
+
+
   switch (cb_info->kind) {
 
     case GPU_ACTIVITY_MEMCPY:
@@ -387,22 +707,20 @@ opencl_subscriber_callback
     case GPU_ACTIVITY_KERNEL:
       cb_info->details.ker_cb.correlation_id = correlation_id;
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_kernel);
+           gpu_placeholder_type_kernel);
 
       gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
 				   gpu_placeholder_type_trace);
 
       placeholder_type = gpu_placeholder_type_kernel;
+
       break;
     default:
       assert(0);
   }
 
-
-  gpu_correlation_id_map_insert(correlation_id, correlation_id);
   cct_node_t *api_node =
   gpu_application_thread_correlation_callback(correlation_id);
-
   gpu_op_ccts_t gpu_op_ccts;
 
   hpcrun_safe_enter();
@@ -416,17 +734,24 @@ opencl_subscriber_callback
   cb_info->details.initiator_channel = gpu_activity_channel_get();
   cb_info->details.submit_time = CPU_NANOTIME();
 
+
+
+  if (cb_info->kind == GPU_ACTIVITY_KERNEL && instrumentation) {
+    // Callback to produce gtpin correlation
+    gtpin_produce_runtime_callstack(&gpu_op_ccts);
+  }
 }
 
 
 void
 opencl_activity_completion_callback
 (
-  cl_event event,
-  cl_int event_command_exec_status,
-  void *user_data
+ cl_event event,
+ cl_int event_command_exec_status,
+ void *user_data
 )
 {
+
   opencl_object_t *cb_data = (opencl_object_t*)user_data;
   cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
 
@@ -445,22 +770,24 @@ opencl_activity_completion_callback
 
 
 void
-getTimingInfoFromClEvent
+opencl_timing_info_get
 (
-  gpu_interval_t *interval,
-  cl_event event
+ gpu_interval_t *interval,
+ cl_event event
 )
 {
   cl_ulong commandStart = 0;
   cl_ulong commandEnd = 0;
 
   HPCRUN_OPENCL_CALL(clGetEventProfilingInfo, 
-		     (event, CL_PROFILING_COMMAND_START, 
-		      sizeof(commandStart), &commandStart, NULL));
+         (event, CL_PROFILING_COMMAND_START, 
+          sizeof(commandStart), &commandStart, NULL));
 
   HPCRUN_OPENCL_CALL(clGetEventProfilingInfo, 
-		     (event, CL_PROFILING_COMMAND_END, 
-		      sizeof(commandEnd), &commandEnd, NULL));
+         (event, CL_PROFILING_COMMAND_END, 
+          sizeof(commandEnd), &commandEnd, NULL));
+
+  ETMSG(OPENCL, "duration [%lu, %lu]", commandStart, commandEnd);
 
   set_gpu_interval(interval, (uint64_t)commandStart, (uint64_t)commandEnd);
 }
@@ -469,26 +796,32 @@ getTimingInfoFromClEvent
 void
 clSetEventCallback_wrapper
 (
-  cl_event event,
-  cl_int event_command_status,
-  void (CL_CALLBACK *pfn_notify)
-  (cl_event event, cl_int event_command_status, void *user_data),
-  void *user_data
+ cl_event event,
+ cl_int event_command_status,
+ void (CL_CALLBACK *pfn_notify)
+ (cl_event event, cl_int event_command_status, void *user_data),
+ void *user_data
 )
 {
   HPCRUN_OPENCL_CALL(clSetEventCallback, 
-		     (event, event_command_status, pfn_notify, user_data));
+         (event, event_command_status, pfn_notify, user_data));
 }
 
 
 void
 opencl_api_initialize
 (
-  void
+ void
 )
 {
-  opencl_intercept_setup();
+  ETMSG(OPENCL, "CL_TARGET_OPENCL_VERSION: %d", CL_TARGET_OPENCL_VERSION);
+  if (instrumentation) {
+	gpu_metrics_GPU_INST_enable();
+	gtpin_enable_profiling();
+  }
+  atomic_store(&correlation_id_counter, 0);
   atomic_store(&opencl_pending_operations, 0);
+  atomic_store(&opencl_h2d_pending_operations, 0);
 }
 
 
@@ -504,7 +837,7 @@ opencl_bind
   CHK_DLOPEN(opencl, opencl_path(), RTLD_NOW | RTLD_GLOBAL);
   hpcrun_force_dlopen(false);
   
-#define OPENCL_BIND(fn)				\
+#define OPENCL_BIND(fn)        \
   CHK_DLSYM(opencl, fn);
   
   FORALL_OPENCL_ROUTINES(OPENCL_BIND)
@@ -518,13 +851,364 @@ opencl_bind
 }
 
 
+cl_program
+clCreateProgramWithSource
+(
+ cl_context context,
+ cl_uint count,
+ const char** strings,
+ const size_t* lengths,
+ cl_int* errcode_ret
+)
+{
+  ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
+
+  FILE *f_ptr;
+  for (int i = 0; i < (int)count; i++) {
+    // what if a single file has multiple kernels?
+    // we need to add logic to get filenames by reading the strings contents
+    char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
+    // using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+    char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
+    *filename = fileno + '\0';
+    f_ptr = fopen(filename, "w");
+    fwrite(strings[i], lengths[i], 1, f_ptr);
+  }
+  fclose(f_ptr);
+  
+  return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
+}
+
+
+// Interposed clBuildProgram: appends LINE_TABLE_FLAG to the caller's build options.
+// One downside of this approach: clBuildProgramCallback overrides the caller's pfn_notify.
+cl_int
+clBuildProgram
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id* device_list,
+ const char* options,
+ void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+ void* user_data
+)
+{
+  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
+  // option strings have no fixed maximum length, so keep the lengths as size_t
+  size_t len_options = options == NULL ? 0 : strlen(options);
+  size_t len_flag = strlen(LINE_TABLE_FLAG);
+  char *options_with_debug_flags = malloc(len_options + len_flag + 1);
+  if (options_with_debug_flags == NULL) { return CL_OUT_OF_HOST_MEMORY; }
+  if (len_options != 0) { memcpy(options_with_debug_flags, options, len_options); }
+  // copy the flag together with its terminating NUL
+  memcpy(options_with_debug_flags + len_options, LINE_TABLE_FLAG, len_flag + 1);
+  cl_int ret = HPCRUN_OPENCL_CALL(clBuildProgram, (program, num_devices, device_list, options_with_debug_flags, clBuildProgramCallback, user_data));
+  free(options_with_debug_flags);
+  return ret;
+}
+
+
+cl_command_queue
+clCreateCommandQueue
+(
+ cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret
+)
+{
+  // force the CL_QUEUE_PROFILING_ENABLE bit on, then forward to the real entry point
+  properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE; 
+
+  return HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
+        properties,errcode_ret));  
+}
+
+
+cl_command_queue
+clCreateCommandQueueWithProperties
+(
+ cl_context context,
+ cl_device_id device,
+ const cl_queue_properties* properties,
+ cl_int* errcode_ret
+)
+{
+  // Interposed clCreateCommandQueueWithProperties: forwards a private copy of the
+  // caller's property list with CL_QUEUE_PROFILING_ENABLE forced on. On allocation
+  // failure it reports CL_OUT_OF_HOST_MEMORY (if errcode_ret is given) and returns NULL.
+  //
+  // The list is a sequence of <name, value> pairs terminated by a single 0, so it
+  // is walked two entries at a time; a value that happens to be 0 (or to equal
+  // CL_QUEUE_PROPERTIES) must not be mistaken for the terminator or for a name.
+  int queue_props_id = -1;  // index of the CL_QUEUE_PROPERTIES name, if present
+  int props_count = 0;      // number of entries before the terminating 0
+  if (properties != NULL) {
+    while (properties[props_count] != 0) {
+      if (properties[props_count] == CL_QUEUE_PROPERTIES) {
+        queue_props_id = props_count;
+      }
+      props_count += 2;
+    }
+  }
+
+  // Room for the copied entries, one appended <name, value> pair and the terminator
+  cl_queue_properties *queue_properties = malloc(sizeof(*queue_properties) * (props_count + 3));
+  if (queue_properties == NULL) {
+    if (errcode_ret != NULL) {
+      *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    }
+    return NULL;
+  }
+  for (int i = 0; i < props_count; ++i) {
+    queue_properties[i] = properties[i];
+  }
+
+  if (queue_props_id >= 0) {
+    // We do have a queue property entry, just enable profiling
+    queue_properties[queue_props_id + 1] |= CL_QUEUE_PROFILING_ENABLE;
+    queue_properties[props_count] = 0;
+  } else {
+    // We do not have a queue property entry, so append one that enables profiling
+    queue_properties[props_count] = CL_QUEUE_PROPERTIES;
+    queue_properties[props_count + 1] = CL_QUEUE_PROFILING_ENABLE;
+    queue_properties[props_count + 2] = 0;
+  }
+
+  cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueueWithProperties, (context, device, queue_properties, errcode_ret));
+  // The list handed to the OpenCL runtime is always our own copy
+  free(queue_properties);
+  return queue;
+}
+
+
+cl_int
+clEnqueueNDRangeKernel
+(
+ cl_command_queue command_queue,
+ cl_kernel ocl_kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
+)
+{
+  // Interposed clEnqueueNDRangeKernel: records the launch, forwards it to the real
+  // entry point and registers a completion callback on the resulting event.
+  opencl_object_t *kernel_info = opencl_malloc();
+  initializeKernelCallBackInfo(kernel_info, CORRELATION_ID_INVALID);
+  opencl_subscriber_callback(kernel_info);
+
+  // The caller may pass event == NULL; the completion callback still needs an
+  // event, so substitute a private one and remember that it is ours.
+  cl_event my_event = NULL;
+  cl_event *eventp = event ? event : &my_event;
+  kernel_info->isInternalClEvent = (event == NULL);
+
+  cl_int return_status =
+    HPCRUN_OPENCL_CALL(clEnqueueNDRangeKernel,
+                       (command_queue, ocl_kernel, work_dim, global_work_offset,
+                        global_work_size, local_work_size,
+                        num_events_in_wait_list, event_wait_list, eventp));
+
+  ETMSG(OPENCL, "Registering callback for kind: Kernel. "
+                "Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id);
+  // *eventp is only meaningful when the enqueue succeeded; on failure it was
+  // previously read uninitialized (private event) or could name a stale event.
+  // NOTE(review): kernel_info is not released on the failure path -- confirm how
+  // the bookkeeping done by opencl_subscriber_callback should be undone.
+  if (return_status == CL_SUCCESS) {
+    clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                               &opencl_activity_completion_callback, kernel_info);
+  }
+  return return_status;
+}
+
+
+cl_int
+clEnqueueReadBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
+)
+{
+  // Interposed clEnqueueReadBuffer: records a device-to-host copy of cb bytes,
+  // forwards the call and registers a completion callback on the resulting event.
+  ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper");
+  opencl_object_t *mem_info = opencl_malloc();
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID);
+  opencl_subscriber_callback(mem_info);
+
+  // The caller may pass event == NULL; the completion callback still needs an
+  // event, so substitute a private one and remember that it is ours.
+  cl_event my_event = NULL;
+  cl_event *eventp = event ? event : &my_event;
+  mem_info->isInternalClEvent = (event == NULL);
+
+  cl_int return_status =
+    HPCRUN_OPENCL_CALL(clEnqueueReadBuffer,
+                       (command_queue, buffer, blocking_read, offset,
+                        cb, ptr, num_events_in_wait_list, event_wait_list, eventp));
+
+  ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
+                "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+  // %ld matches the (long) argument; the previous %d did not
+  ETMSG(OPENCL, "%ld(bytes) of data being transferred from device to host",
+        (long)cb);
+
+  // *eventp is only meaningful when the enqueue succeeded, so register only then.
+  // NOTE(review): mem_info is not released on the failure path -- confirm how the
+  // bookkeeping done by opencl_subscriber_callback should be undone.
+  if (return_status == CL_SUCCESS) {
+    clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                               &opencl_activity_completion_callback, mem_info);
+  }
+  return return_status;
+}
+
+
+cl_int
+clEnqueueWriteBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t cb,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
+)
+{
+  // Interposed clEnqueueWriteBuffer: records a host-to-device copy of cb bytes,
+  // forwards the call and registers a completion callback on the resulting event.
+  ETMSG(OPENCL, "inside clEnqueueWriteBuffer wrapper. cl_mem buffer: %p", buffer);
+
+  opencl_object_t *mem_info = opencl_malloc();
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID);
+  opencl_subscriber_callback(mem_info);
+
+  // The caller may pass event == NULL; the completion callback still needs an
+  // event, so substitute a private one and remember that it is ours.
+  cl_event my_event = NULL;
+  cl_event *eventp = event ? event : &my_event;
+  mem_info->isInternalClEvent = (event == NULL);
+
+  cl_int return_status =
+    HPCRUN_OPENCL_CALL(clEnqueueWriteBuffer,
+                       (command_queue, buffer, blocking_write, offset, cb, ptr,
+                        num_events_in_wait_list, event_wait_list, eventp));
+
+  ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
+                "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+
+  // %ld matches the (long) argument; the previous %d did not
+  ETMSG(OPENCL, "%ld(bytes) of data being transferred from host to device",
+        (long)cb);
+
+  // *eventp is only meaningful when the enqueue succeeded, so register only then.
+  // NOTE(review): mem_info is not released on the failure path -- confirm how the
+  // bookkeeping done by opencl_subscriber_callback should be undone.
+  if (return_status == CL_SUCCESS) {
+    clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                               &opencl_activity_completion_callback,
+                               (void*) mem_info);
+  }
+  return return_status;
+}
+
+
+cl_mem
+clCreateBuffer
+(
+ cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void* host_ptr,
+ cl_int* errcode_ret
+)
+{
+	uint64_t correlation_id = getCorrelationId();
+	opencl_h2d_pending_operations_adjust(1);
+  cl_mem buffer = 
+    HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
+  uint64_t buffer_id = (uint64_t)buffer; 
+  // record the buffer under its handle value with its size and correlation id; start/end times are 0 for now
+	opencl_h2d_map_insert(buffer_id, correlation_id, size, 0, 0);
+  
+  return buffer;
+}
+
+
+cl_int
+clSetKernelArg
+(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void* arg_value
+)
+{
+  // Interposed clSetKernelArg: when the argument is a cl_mem that has an entry in
+  // the h2d map, time the call and record it as a host-to-device copy.
+  //
+  // Only inspect arg_value as a cl_mem when that is safe: it may be NULL (e.g. for
+  // local-memory arguments) or point to fewer than sizeof(cl_mem) bytes.
+  bool isClBuffer = arg_value != NULL && arg_size == sizeof(cl_mem) &&
+                    opencl_isClArgBuffer(arg_value);
+  uint64_t start_time = isClBuffer ? hpcrun_nanotime() : 0;
+  cl_int return_status =
+    HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value));
+  if (!isClBuffer) {
+    return return_status;
+  }
+  uint64_t end_time = hpcrun_nanotime();
+  uint64_t buffer_id = opencl_get_buffer_id(arg_value);
+  opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
+  if (entry) {
+    size_t size = opencl_h2d_map_entry_size_get(entry);
+    uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
+    opencl_object_t *mem_info = opencl_malloc();
+    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, correlation_id);
+    opencl_subscriber_callback(mem_info);
+    // store the measured [start_time, end_time] interval under the same buffer id
+    opencl_h2d_map_insert(buffer_id, correlation_id, size, start_time, end_time);
+  }
+  // otherwise no clCreateBuffer was recorded for this buffer: nothing to update
+  return return_status;
+}
+
+
+void
+opencl_enable_instrumentation
+(
+ void
+)
+{
+  instrumentation = true; // checked by opencl_api_initialize and opencl_subscriber_callback before the gtpin calls
+}
+
+
 void
 opencl_api_thread_finalize
 (
-  void *args
+ void *args
 )
 {
-  opencl_wait_for_pending_operations();
+	opencl_wait_for_non_clSetKernelArg_pending_operations();
+	opencl_add_ccts_for_setClKernelArg();
+  opencl_wait_for_all_pending_operations();
   gpu_application_thread_process_activities();
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index 2cae426a93..ca166c4816 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -44,8 +44,6 @@
 #ifndef _OPENCL_API_H_
 #define _OPENCL_API_H_
 
-
-
 //******************************************************************************
 // local includes
 //******************************************************************************
@@ -62,6 +60,13 @@
 //typedef struct opencl_object_t opencl_object_t;
 
 
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+//TODO: move cl_basic_callback_t...
+
+
 
 //******************************************************************************
 // interface operations
@@ -99,48 +104,62 @@ opencl_subscriber_callback
 void
 opencl_activity_completion_callback
 (
-  cl_event,
-  cl_int,
-  void *
+ cl_event,
+ cl_int,
+ void *
 );
 
 
 void
-getTimingInfoFromClEvent
+opencl_timing_info_get
+(
+ gpu_interval_t *,
+ cl_event
+);
+
+
+cct_node_t *
+opencl_api_node_get
 (
-  gpu_interval_t *,
-  cl_event
+ void
 );
 
 
 void
 clSetEventCallback_wrapper
 (
-  cl_event,
-  cl_int,
-  void (CL_CALLBACK*)(cl_event, cl_int, void *),
-  void *
+ cl_event,
+ cl_int,
+ void (CL_CALLBACK*)(cl_event, cl_int, void *),
+ void *
 );
 
 
 void
 opencl_api_initialize
 (
-  void
+ void
 );
 
 
 int
 opencl_bind
 (
-  void
+ void
+);
+
+
+void
+opencl_enable_instrumentation
+(
+	void
 );
 
 
 void
 opencl_api_thread_finalize
 (
-  void *
+ void *
 );
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
new file mode 100644
index 0000000000..68d9ecdc38
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
@@ -0,0 +1,149 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef opencl_h2d_map_h
+#define opencl_h2d_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct opencl_h2d_map_entry_t opencl_h2d_map_entry_t;
+
+
+typedef void (*opencl_splay_fn_t)
+(
+	opencl_h2d_map_entry_t *,
+	splay_visit_t,
+	void *
+);
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_h2d_map_entry_t *
+opencl_h2d_map_lookup
+(
+ uint64_t
+);
+
+
+void
+opencl_h2d_map_insert
+(
+ uint64_t, 
+ uint64_t, 
+ size_t,
+ uint64_t,
+ uint64_t
+);
+
+
+void
+opencl_h2d_map_delete
+(
+ uint64_t
+);
+
+
+uint64_t
+opencl_h2d_map_entry_buffer_id_get
+(
+ opencl_h2d_map_entry_t *entry
+);
+
+
+uint64_t
+opencl_h2d_map_entry_correlation_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+
+size_t
+opencl_h2d_map_entry_size_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+
+uint64_t
+opencl_h2d_map_entry_start_time_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+
+uint64_t
+opencl_h2d_map_entry_end_time_get
+(
+ opencl_h2d_map_entry_t *
+);
+
+
+void
+opencl_update_ccts_for_h2d_nodes
+(
+ opencl_splay_fn_t fn	
+);
+
+
+uint64_t
+opencl_h2d_map_count
+(
+ void
+);
+
+#endif
+
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
index b24ebbc14f..72a5379f9b 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-intercept.c
@@ -132,7 +132,7 @@ clCreateCommandQueue_wrapper
 
 
 static cl_int
-clEnqueueNDRangeKernel_wrapper
+clEnqueueNDRangeKernel
 (
   cl_command_queue command_queue,
   cl_kernel ocl_kernel,
@@ -146,7 +146,7 @@ clEnqueueNDRangeKernel_wrapper
 )
 {
   opencl_object_t *kernel_info = opencl_malloc();
-  initializeKernelCallBackInfo(kernel_info);
+  initializeKernelCallBackInfo(kernel_info, -1);
 
   opencl_subscriber_callback(kernel_info);
 
@@ -191,7 +191,7 @@ clEnqueueReadBuffer_wrapper
 )
 {
   opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb);
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID);
 
   opencl_subscriber_callback(mem_info);
 
@@ -241,7 +241,7 @@ clEnqueueWriteBuffer_wrapper
 {
 
   opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb);
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID);
 
   opencl_subscriber_callback(mem_info);
 
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index 8301d98915..323394a14e 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -50,6 +50,7 @@
 #include <monitor.h> 
 
 #include <hpcrun/device-finalizers.h>
+#include <hpcrun/gpu/gpu-trace.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-trace.h>
 #include <hpcrun/gpu/opencl/opencl-api.h>
@@ -66,10 +67,21 @@
 //******************************************************************************
 
 #define GPU_STRING "gpu=opencl"
+#define ENABLE_INSTRUMENTATION "gpu=opencl,inst"
+#define NO_THRESHOLD  1L
 static device_finalizer_fn_entry_t device_finalizer_shutdown;
 static device_finalizer_fn_entry_t device_trace_finalizer_shutdown;
 
 
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+static char opencl_name[128];
+
+
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -129,7 +141,7 @@ static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
   #ifndef HPCRUN_STATIC_LINK
-  return hpcrun_ev_is(ev_str, GPU_STRING);
+  return (hpcrun_ev_is(ev_str, GPU_STRING) || hpcrun_ev_is(ev_str, ENABLE_INSTRUMENTATION));
   #else
   return false;
   #endif
@@ -140,8 +152,19 @@ static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
   int nevents = (self->evl).nevents;
-  gpu_metrics_default_enable();
   TMSG(OPENCL,"nevents = %d", nevents);
+  gpu_metrics_default_enable();
+
+  char* evlist = METHOD_CALL(self, get_event_str);
+  char* event = start_tok(evlist);
+  long th;
+  hpcrun_extract_ev_thresh(event, sizeof(opencl_name), opencl_name,
+    &th, NO_THRESHOLD);
+
+  if (hpcrun_ev_is(opencl_name, GPU_STRING)) {
+  } else if (hpcrun_ev_is(opencl_name, ENABLE_INSTRUMENTATION)) {
+    opencl_enable_instrumentation();
+  }
 }
 
 
@@ -156,11 +179,6 @@ METHOD_FN(finalize_event_list)
   #endif
   opencl_api_initialize();
 
-
-//  gpu_trace_fini - finalized from opencl_api_finalize -> gpu_activity_multiplexer_fini
-//  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
-//  device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
-
   device_finalizer_shutdown.fn = opencl_api_process_finalize;
   device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_shutdown);
 
@@ -183,9 +201,9 @@ METHOD_FN(display_events)
   printf("Name\t\tDescription\n");
   printf("---------------------------------------------------------------------------\n");
   printf("%s\t\tOperation-level monitoring for opencl on a GPU.\n"
-	  "\t\tCollect timing information on GPU kernel invocations,\n"
-	  "\t\tmemory copies, etc.\n",
-	  GPU_STRING);
+    "\t\tCollect timing information on GPU kernel invocations,\n"
+    "\t\tmemory copies, etc.\n",
+    GPU_STRING);
   printf("\n");
 }
 
diff --git a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
index 695da3069e..fe77a33bf2 100644
--- a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
+++ b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
@@ -311,9 +311,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcserver/Makefile.in b/src/tool/hpcserver/Makefile.in
index 5c1d265a5a..71e3486625 100644
--- a/src/tool/hpcserver/Makefile.in
+++ b/src/tool/hpcserver/Makefile.in
@@ -365,9 +365,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcserver/mpi/Makefile.in b/src/tool/hpcserver/mpi/Makefile.in
index 467606c9a5..586726c2b0 100644
--- a/src/tool/hpcserver/mpi/Makefile.in
+++ b/src/tool/hpcserver/mpi/Makefile.in
@@ -373,9 +373,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpcstruct/Makefile.in b/src/tool/hpcstruct/Makefile.in
index a39dec41d1..18e5d3e025 100644
--- a/src/tool/hpcstruct/Makefile.in
+++ b/src/tool/hpcstruct/Makefile.in
@@ -405,9 +405,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/hpctracedump/Makefile.in b/src/tool/hpctracedump/Makefile.in
index 61d5b3f16d..cd2eef79a8 100644
--- a/src/tool/hpctracedump/Makefile.in
+++ b/src/tool/hpctracedump/Makefile.in
@@ -351,9 +351,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/misc/Makefile.in b/src/tool/misc/Makefile.in
index de8f4f89f4..704457677d 100644
--- a/src/tool/misc/Makefile.in
+++ b/src/tool/misc/Makefile.in
@@ -306,9 +306,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@
diff --git a/src/tool/xprof/Makefile.in b/src/tool/xprof/Makefile.in
index edef2ccc2c..0a4a968e9b 100644
--- a/src/tool/xprof/Makefile.in
+++ b/src/tool/xprof/Makefile.in
@@ -372,9 +372,18 @@ OPT_CUDA_LDFLAGS = @OPT_CUDA_LDFLAGS@
 OPT_CUPTI = @OPT_CUPTI@
 OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
 OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
+OPT_GTPIN = @OPT_GTPIN@
+OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
+OPT_IGC = @OPT_IGC@
+OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
+OPT_IGC_LDFLAGS = @OPT_IGC_LDFLAGS@
 OPT_LEVEL0 = @OPT_LEVEL0@
 OPT_LEVEL0_IFLAGS = @OPT_LEVEL0_IFLAGS@
 OPT_LEVEL0_LDFLAGS = @OPT_LEVEL0_LDFLAGS@
+OPT_METRICS_DISCOVERY = @OPT_METRICS_DISCOVERY@
+OPT_METRICS_DISCOVERY_IFLAGS = @OPT_METRICS_DISCOVERY_IFLAGS@
+OPT_METRICS_DISCOVERY_LDFLAGS = @OPT_METRICS_DISCOVERY_LDFLAGS@
 OPT_OBJCOPY = @OPT_OBJCOPY@
 OPT_OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 OPT_PAPI = @OPT_PAPI@

From 44e9cf8fb46ee3ffc81620943061d3b2b57c6889 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Thu, 8 Oct 2020 01:58:21 -0500
Subject: [PATCH 085/177] added NULL checks to some opencl function intercepts

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 48 ++++++++++++++++---------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 9a6ba18dab..85fd442d9a 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -128,6 +128,8 @@
 
 #define CORRELATION_ID_INVALID -1
 
+#define BUFFER_ID_INVALID -1
+
 
 
 //******************************************************************************
@@ -436,8 +438,13 @@ opencl_get_buffer_id
   const void *arg
 )
 {
-  cl_mem buffer = *(cl_mem*)arg;
-  return (uint64_t)buffer;
+  if (arg != NULL) {
+    cl_mem buffer = *(cl_mem*)arg;
+    return (uint64_t)buffer;
+  } else {
+    return BUFFER_ID_INVALID;
+  }
+
 }
 
 
@@ -453,8 +460,13 @@ opencl_isClArgBuffer
 	 * 2. clEnqueueWriteBuffer is being called for arg. We shouldnt be recording duplicate H2D calls
 	 * */
   uint64_t buffer_id = opencl_get_buffer_id(arg);
-	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
-	bool isBuffer = entry ? true : false;
+  bool isBuffer;
+  if (buffer_id == BUFFER_ID_INVALID) {
+    isBuffer = false;
+  } else {
+	  opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
+	  isBuffer = entry ? true : false;
+  }
 	//ETMSG(OPENCL, "opencl_isClArgBuffer. buffer_id: %"PRIu64". isBuffer: %d",	buffer_id, isBuffer);
 	return isBuffer;
 }
@@ -787,19 +799,21 @@ clCreateProgramWithSource
 {
   ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
 
-  FILE *f_ptr;
-  for (int i = 0; i < (int)count; i++) {
-    // what if a single file has multiple kernels?
-    // we need to add logic to get filenames by reading the strings contents
-    char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
-    // using malloc instead of hpcrun_malloc gives extra garbage characters in file name
-    char *filename = (char*)hpcrun_malloc(sizeof(fileno) + 1);
-    *filename = fileno + '\0';
-    f_ptr = fopen(filename, "w");
-    fwrite(strings[i], lengths[i], 1, f_ptr);
+  if (strings != NULL && lengths != NULL) {
+    FILE *f_ptr;
+    for (int i = 0; i < (int)count; i++) {
+      // what if a single file has multiple kernels?
+      // we need to add logic to get filenames by reading the strings contents
+      char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
+      // using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+      char *filename = (char *)hpcrun_malloc(sizeof(fileno) + 1);
+      *filename = fileno + '\0';
+      f_ptr = fopen(filename, "w");
+      fwrite(strings[i], lengths[i], 1, f_ptr);
+    }
+    fclose(f_ptr);
   }
-  fclose(f_ptr);
-  
+
   return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
 }
 
@@ -1078,7 +1092,7 @@ clSetKernelArg
 )
 {
 	bool isClBuffer = opencl_isClArgBuffer(arg_value);
-  ETMSG(OPENCL, "inside clSetKernelArg wrapper. isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value);
+  ETMSG(OPENCL, "inside clSetKernelArg wrapper."); //isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value
 	uint64_t start_time = hpcrun_nanotime();
 
   cl_int return_status = 

From 8b91f7124439b9b04d81b82fd199ccee0fe2c988 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Thu, 8 Oct 2020 03:36:22 -0500
Subject: [PATCH 086/177] opencl H2D and D2H operations made via
 clEnqueueMapBuffer also will be recorded now

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 88 ++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 85fd442d9a..003c27a8fc 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -102,6 +102,7 @@
   macro(clEnqueueNDRangeKernel)  \
   macro(clEnqueueReadBuffer)  \
   macro(clEnqueueWriteBuffer)  \
+  macro(clEnqueueMapBuffer) \
   macro(clCreateBuffer)  \
   macro(clSetKernelArg)  \
   macro(clGetEventProfilingInfo)  \
@@ -119,6 +120,9 @@
 #define OPENCL_QUEUE_FN(fn, args)      \
   static cl_command_queue (*OPENCL_FN_NAME(fn)) args
 
+#define OPENCL_ENQUEUEMAPBUFFER_FN(fn, args)    \
+  static void* (*OPENCL_FN_NAME(fn)) args  // declares a static function pointer returning void* (clEnqueueMapBuffer's return type)
+
 #define OPENCL_CREATEBUFFER_FN(fn, args)      \
   static cl_mem (*OPENCL_FN_NAME(fn)) args
 
@@ -253,6 +257,24 @@ OPENCL_FN
 );
 
 
+OPENCL_ENQUEUEMAPBUFFER_FN
+(
+  clEnqueueMapBuffer, 
+  (
+    cl_command_queue,  // command_queue
+    cl_mem,            // buffer
+    cl_bool,           // blocking_map
+    cl_map_flags,      // map_flags
+    size_t,            // offset
+    size_t,            // size
+    cl_uint,           // num_events_in_wait_list
+    const cl_event*,   // event_wait_list
+    cl_event*,         // event
+    cl_int*            // errcode_ret
+  )
+);
+
+
 OPENCL_CREATEBUFFER_FN
 (
   clCreateBuffer,
@@ -1060,6 +1082,68 @@ clEnqueueWriteBuffer
 }
 
 
+// Wrapper around clEnqueueMapBuffer. Records the map request as a memcpy
+// activity (D2H when map_flags is exactly CL_MAP_READ, H2D otherwise),
+// forwards the call, and registers a completion callback on its event.
+// Returns the pointer produced by the wrapped call; errcode_ret is handed
+// to that call unchanged.
+void*
+clEnqueueMapBuffer
+(
+  cl_command_queue command_queue,
+  cl_mem buffer,
+  cl_bool blocking_map,
+  cl_map_flags map_flags,
+  size_t offset,
+  size_t size,
+  cl_uint num_events_in_wait_list,
+  const cl_event* event_wait_list,
+  cl_event* event,
+  cl_int* errcode_ret
+)
+{
+  ETMSG(OPENCL, "inside clEnqueueMapBuffer wrapper");
+
+  // only a pure CL_MAP_READ is D2H; write, invalidate or combined flags are H2D
+  bool is_read_map = (map_flags == CL_MAP_READ);
+
+  opencl_object_t *mem_info = opencl_malloc();
+  initializeMemoryCallBackInfo(mem_info, is_read_map ? GPU_MEMCPY_D2H : GPU_MEMCPY_H2D,
+                               size, CORRELATION_ID_INVALID);
+  opencl_subscriber_callback(mem_info);
+
+  // the completion callback needs an event: fall back to a private one when
+  // the caller passed none (NULL-initialized so it is never indeterminate)
+  cl_event my_event = NULL;
+  cl_event *eventp = event ? event : &my_event;
+  mem_info->isInternalClEvent = (event == NULL);
+
+  void *map_ptr =
+    HPCRUN_OPENCL_CALL(clEnqueueMapBuffer,
+                       (command_queue, buffer, blocking_map, map_flags, offset,
+                        size, num_events_in_wait_list, event_wait_list, eventp, errcode_ret));
+
+  // size is cast to long for logging, so the conversion is %ld (was %d)
+  if (is_read_map) {
+    ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
+                  "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+    ETMSG(OPENCL, "%ld(bytes) of data being transferred from device to host",
+          (long)size);
+  } else {
+    ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
+                  "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+    ETMSG(OPENCL, "%ld(bytes) of data being transferred from host to device",
+          (long)size);
+  }
+
+  // mem_info travels as user data to the callback fired at CL_COMPLETE
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                             &opencl_activity_completion_callback, mem_info);
+
+  return map_ptr;
+}
+
+
 cl_mem
 clCreateBuffer
 (
@@ -1075,7 +1159,7 @@ clCreateBuffer
   cl_mem buffer = 
     HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
   uint64_t buffer_id = (uint64_t)buffer; 
-  ETMSG(OPENCL, "inside clCreateBuffer wrapper. cl_mem buffer: %p. buffer_id: %"PRIu64"", buffer, buffer_id);
+  //ETMSG(OPENCL, "inside clCreateBuffer wrapper. cl_mem buffer: %p. buffer_id: %"PRIu64"", buffer, buffer_id);
 	opencl_h2d_map_insert(buffer_id, correlation_id, size, 0, 0);
   
   return buffer;
@@ -1092,7 +1176,7 @@ clSetKernelArg
 )
 {
 	bool isClBuffer = opencl_isClArgBuffer(arg_value);
-  ETMSG(OPENCL, "inside clSetKernelArg wrapper."); //isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value
+  //ETMSG(OPENCL, "inside clSetKernelArg wrapper."); //isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value
 	uint64_t start_time = hpcrun_nanotime();
 
   cl_int return_status = 

From 12f31a4559c0eebb79adeaa7d85557cb0fe42b4e Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Thu, 8 Oct 2020 20:14:44 -0500
Subject: [PATCH 087/177] adding context id and stream id for H2D calls from
 clSetKernelArgs

---
 src/tool/hpcrun/Makefile.am                   |   4 +-
 src/tool/hpcrun/Makefile.in                   |  33 ++-
 .../gpu/opencl/opencl-activity-translate.c    |   5 +
 .../gpu/opencl/opencl-activity-translate.h    |   2 +
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 110 ++++++--
 .../hpcrun/gpu/opencl/opencl-context-map.c    | 248 +++++++++++++++++
 .../hpcrun/gpu/opencl/opencl-context-map.h    | 111 ++++++++
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c   |  26 +-
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h   |  19 +-
 .../hpcrun/gpu/opencl/opencl-kernel-map.c     | 262 ++++++++++++++++++
 .../hpcrun/gpu/opencl/opencl-kernel-map.h     | 119 ++++++++
 .../hpcrun/gpu/opencl/opencl-memory-manager.h |   2 +
 12 files changed, 902 insertions(+), 39 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-context-map.c
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-context-map.h
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 55ba8a02cc..ce8247caa1 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -516,7 +516,9 @@ MY_OPENCL_FILES = sample-sources/opencl.c \
 	gpu/opencl/opencl-api.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
-	gpu/opencl/opencl-h2d-map.c 
+	gpu/opencl/opencl-h2d-map.c \
+	gpu/opencl/opencl-kernel-map.c \
+	gpu/opencl/opencl-context-map.c 
 endif
 
 if OPT_ENABLE_GTPIN
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 8a7caef95f..4d24288890 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -544,7 +544,8 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
-	gpu/opencl/opencl-h2d-map.c \
+	gpu/opencl/opencl-h2d-map.c gpu/opencl/opencl-kernel-map.c \
+	gpu/opencl/opencl-context-map.c \
 	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
 	gpu/instrumentation/gtpin-correlation-id-map.c \
@@ -758,7 +759,9 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-kernel-map.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-context-map.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
@@ -1900,7 +1903,9 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-api.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-h2d-map.c 
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-h2d-map.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-kernel-map.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-context-map.c 
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
@@ -2865,6 +2870,12 @@ gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-kernel-map.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-context-map.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/instrumentation/$(am__dirstamp):
 	@$(MKDIR_P) gpu/instrumentation
 	@: > gpu/instrumentation/$(am__dirstamp)
@@ -3827,7 +3838,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_cilk_la-agent-cilk.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_pthread_la-agent-pthread.Plo@am__quote@
@@ -5528,6 +5541,20 @@ gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/opencl-h2d-map.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-h2d-map.lo `test -f 'gpu/opencl/opencl-h2d-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-h2d-map.c
 
+gpu/opencl/libhpcrun_la-opencl-kernel-map.lo: gpu/opencl/opencl-kernel-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-kernel-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-kernel-map.lo `test -f 'gpu/opencl/opencl-kernel-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-kernel-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-kernel-map.c' object='gpu/opencl/libhpcrun_la-opencl-kernel-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-kernel-map.lo `test -f 'gpu/opencl/opencl-kernel-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-kernel-map.c
+
+gpu/opencl/libhpcrun_la-opencl-context-map.lo: gpu/opencl/opencl-context-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-context-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-context-map.lo `test -f 'gpu/opencl/opencl-context-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-context-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-context-map.c' object='gpu/opencl/libhpcrun_la-opencl-context-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-context-map.lo `test -f 'gpu/opencl/opencl-context-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-context-map.c
+
 gpu/instrumentation/libhpcrun_la-kernel-data-map.lo: gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index b8e09b6b90..e2b4b22425 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -139,6 +139,8 @@ opencl_clSetKernelArg_activity_translate
 (
 	gpu_activity_t *ga,
 	uint64_t correlation_id,
+	uint32_t context_id,
+	uint32_t stream_id,
 	size_t size,
 	uint64_t start_time,
 	uint64_t end_time
@@ -147,6 +149,9 @@ opencl_clSetKernelArg_activity_translate
   ga->details.memcpy.correlation_id = correlation_id;
   ga->details.memcpy.bytes = size;
   ga->details.memcpy.copyKind = GPU_MEMCPY_H2D;
+  ga->details.memcpy.context_id = context_id;
+  ga->details.memcpy.stream_id = stream_id;
+  
   ga->kind = GPU_ACTIVITY_MEMCPY;
   set_gpu_interval(&ga->details.interval, start_time, end_time);
   cstack_ptr_set(&(ga->next), 0);
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index c52f240279..43aafa3a80 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -77,6 +77,8 @@ opencl_clSetKernelArg_activity_translate
 (
 	gpu_activity_t *,
 	uint64_t,
+	uint32_t,
+	uint32_t,
 	size_t,
 	uint64_t,
 	uint64_t
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 003c27a8fc..45cc3edfa1 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -83,6 +83,8 @@
 #include "opencl-activity-translate.h"
 #include "opencl-memory-manager.h"
 #include "opencl-h2d-map.h"
+#include "opencl-context-map.h"
+#include "opencl-kernel-map.h"
 
 
 
@@ -365,10 +367,31 @@ initializeKernelCallBackInfo
 static void
 initializeMemoryCallBackInfo
 (
-opencl_object_t *mem_info,
-gpu_memcpy_type_t type,
-size_t size,
-uint64_t correlation_id
+  opencl_object_t *mem_info,
+  gpu_memcpy_type_t type,
+  size_t size,
+  uint64_t correlation_id
+)
+{
+  mem_info->kind = GPU_ACTIVITY_MEMCPY;
+  mem_info->details.mem_cb.type = type;
+  mem_info->details.mem_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
+  mem_info->details.mem_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
+  mem_info->details.mem_cb.size = size;
+
+  mem_info->details.mem_cb.correlation_id = correlation_id;
+}
+
+
+static void
+initializeClSetKernelArgMemoryCallBackInfo
+(
+  opencl_object_t *mem_info,
+  gpu_memcpy_type_t type,
+  size_t size,
+  uint64_t correlation_id,
+  uint32_t context_id,
+  uint32_t stream_id
 )
 {
   mem_info->kind = GPU_ACTIVITY_MEMCPY;
@@ -378,6 +401,8 @@ uint64_t correlation_id
   mem_info->details.mem_cb.size = size;
 
   mem_info->details.mem_cb.correlation_id = correlation_id;
+  mem_info->details.context_id = context_id;
+  mem_info->details.stream_id = stream_id;
 }
 
 
@@ -441,16 +466,24 @@ opencl_activity_process
 static void
 opencl_clSetKernelArg_activity_process
 (
- uint64_t correlation_id,
- opencl_h2d_map_entry_t *entry
+  opencl_h2d_map_entry_t *entry,
+  opencl_object_t *cb_data
 )
 {
   gpu_activity_t gpu_activity;
+  uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
 	size_t size = opencl_h2d_map_entry_size_get(entry); 
 	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
 	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
-  opencl_clSetKernelArg_activity_translate(&gpu_activity, correlation_id, size, start_time, end_time);
-  gpu_activity_process(&gpu_activity);
+  opencl_clSetKernelArg_activity_translate(&gpu_activity, correlation_id, cb_data->details.context_id,
+      cb_data->details.stream_id, size, start_time, end_time);
+  
+  if (gpu_activity_multiplexer_my_channel_initialized() == false){
+    gpu_activity_multiplexer_my_channel_init();
+  }
+  ETMSG(OPENCL, "cb_data->details.initiator_channel: %p", cb_data->details.initiator_channel);
+  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  //gpu_activity_process(&gpu_activity);
 }
 
 
@@ -502,29 +535,32 @@ add_H2D_metrics_to_cct_node
 	void *arg
 )
 {
-	uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
-	gpu_correlation_id_map_entry_t *cid_map_entry = 
-		gpu_correlation_id_map_lookup(correlation_id);
-	if (cid_map_entry == NULL) {
-		ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
-		return;
-	}
-	ETMSG(OPENCL, "completion type: %s, Correlation id: %"PRIu64 "", "memcpy_H2D", correlation_id);
+	// uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
+	// gpu_correlation_id_map_entry_t *cid_map_entry = 
+	// 	gpu_correlation_id_map_lookup(correlation_id);
+	// if (cid_map_entry == NULL) {
+	// 	ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
+	// 	return;
+	// }
 
 	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
-	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
+	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry);
 	ETMSG(OPENCL, "duration [%"PRIu64", %"PRIu64"]",start_time, end_time); 
-	opencl_activity_completion_notify();
-	opencl_clSetKernelArg_activity_process(correlation_id, entry);
+	//opencl_activity_completion_notify();
+  opencl_object_t *cb_data = opencl_h2d_map_entry_callback_info_get(entry);
+  cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
+  opencl_cb_basic_print(cb_basic, "Completion_Callback");
+
+	opencl_clSetKernelArg_activity_process(entry, cb_data);
 	uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);
-	opencl_h2d_map_delete(buffer_id);
+	//opencl_h2d_map_delete(buffer_id);
   opencl_h2d_pending_operations_adjust(-1);
   opencl_pending_operations_adjust(-1);
 }
 
 
 static void
-opencl_add_ccts_for_setClKernelArg
+opencl_update_ccts_for_setClKernelArg
 (
 	void
 )
@@ -651,7 +687,7 @@ opencl_subscriber_callback
 
     case GPU_ACTIVITY_MEMCPY:
       cb_info->details.mem_cb.correlation_id = correlation_id;
-      if (cb_info->details.mem_cb.type == GPU_MEMCPY_H2D){
+      if (cb_info->details.mem_cb.type == GPU_MEMCPY_H2D){ 
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
                                        gpu_placeholder_type_copyin);
 
@@ -681,7 +717,7 @@ opencl_subscriber_callback
   }
 
   cct_node_t *api_node =
-  gpu_application_thread_correlation_callback(correlation_id);
+    gpu_application_thread_correlation_callback(correlation_id);
   gpu_op_ccts_t gpu_op_ccts;
 
   hpcrun_safe_enter();
@@ -880,8 +916,10 @@ clCreateCommandQueue
   // enabling profiling
   properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE; 
 
-  return HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
-        properties,errcode_ret));  
+  cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
+        properties,errcode_ret));
+  opencl_cl_context_map_insert((uint64_t)context, (uint32_t)queue);
+  return queue;
 }
 
 
@@ -938,6 +976,7 @@ clCreateCommandQueueWithProperties
     // The property is created by us
     free(queue_properties);
   }
+  opencl_cl_context_map_insert((uint64_t)context, (uint32_t)queue);
   return queue;
 }
 
@@ -1160,7 +1199,7 @@ clCreateBuffer
     HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
   uint64_t buffer_id = (uint64_t)buffer; 
   //ETMSG(OPENCL, "inside clCreateBuffer wrapper. cl_mem buffer: %p. buffer_id: %"PRIu64"", buffer, buffer_id);
-	opencl_h2d_map_insert(buffer_id, correlation_id, size, 0, 0);
+	opencl_h2d_map_insert(buffer_id, correlation_id, size, 0, 0, NULL);
   
   return buffer;
 }
@@ -1185,6 +1224,18 @@ clSetKernelArg
 		return return_status;	
 	}
   uint64_t end_time = hpcrun_nanotime();
+
+  size_t context_size;
+  cl_int STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, 0, NULL, &context_size);
+  cl_context *context = malloc(sizeof(context_size));
+  STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, context_size, (void*)context, NULL);
+  
+  //opencl_cl_kernel_map_insert((uint64_t)ocl_kernel, (uint32_t)(*context), command_queue);
+
+  uint32_t context_id = (uint32_t)(*context);
+  opencl_context_map_entry_t *ce = opencl_cl_context_map_lookup((uint64_t)(*context));
+  uint32_t stream_id = (*ce).stream_id;
+
   uint64_t buffer_id = opencl_get_buffer_id(arg_value);
 	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
 	if (entry) {
@@ -1192,10 +1243,10 @@ clSetKernelArg
 
 		uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
     opencl_object_t *mem_info = opencl_malloc();
-    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, correlation_id);
+    initializeClSetKernelArgMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, correlation_id, context_id, stream_id);
     opencl_subscriber_callback(mem_info);
 
-  	opencl_h2d_map_insert(buffer_id,correlation_id, size, start_time, end_time);
+  	opencl_h2d_map_insert(buffer_id,correlation_id, size, start_time, end_time, mem_info);
 	}
   return return_status;
 }
@@ -1218,11 +1269,12 @@ opencl_api_thread_finalize
 )
 {
 	opencl_wait_for_non_clSetKernelArg_pending_operations();
-	opencl_add_ccts_for_setClKernelArg();
+	opencl_update_ccts_for_setClKernelArg();
   opencl_wait_for_all_pending_operations();
   gpu_application_thread_process_activities();
 }
 
+
 void
 opencl_api_process_finalize
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-context-map.c b/src/tool/hpcrun/gpu/opencl/opencl-context-map.c
new file mode 100644
index 0000000000..6843e4f535
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-context-map.c
@@ -0,0 +1,248 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <assert.h>
+#include <string.h>
+
+
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+#include "opencl-context-map.h"
+
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define DEBUG 0
+
+#include "../gpu-print.h"
+
+
+#define st_insert				\
+  typed_splay_insert(correlation_id)  // all tree ops here are named via the "correlation_id" tag, although this map's key is a cl_context id
+
+#define st_lookup				\
+  typed_splay_lookup(correlation_id)
+
+#define st_delete				\
+  typed_splay_delete(correlation_id)
+
+#define st_forall				\
+  typed_splay_forall(correlation_id)  // defined for completeness; not referenced in this file
+
+#define st_count				\
+  typed_splay_count(correlation_id)
+
+#define st_alloc(free_list)			\
+  typed_splay_alloc(free_list, opencl_context_map_entry_t)  // node type is this map's entry struct
+
+#define st_free(free_list, node)		\
+  typed_splay_free(free_list, node)
+
+
+
+//*****************************************************************************
+// type declarations
+//*****************************************************************************
+
+#undef typed_splay_node
+#define typed_splay_node(correlation_id) opencl_context_map_entry_t
+
+typedef struct typed_splay_node(correlation_id) {
+  struct typed_splay_node(correlation_id) *left;  // child links; presumably maintained by the typed splay routines -- confirm
+  struct typed_splay_node(correlation_id) *right;
+  uint64_t cl_context_id; // key: looked up, inserted and deleted by this value
+
+  uint32_t stream_id; // we save queue id as the stream id; overwritten when the same context is inserted again
+} typed_splay_node(correlation_id); 
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static opencl_context_map_entry_t *map_root = NULL;  // root of the context map's splay tree
+
+static opencl_context_map_entry_t *free_list = NULL;  // node free list handed to st_alloc / st_free
+
+static spinlock_t opencl_context_map_lock = SPINLOCK_UNLOCKED;  // held by lookup, insert and delete around tree access
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
+typed_splay_impl(correlation_id)  // presumably emits the typed splay routines that the st_* macros name -- confirm in splay-uint64.h
+
+
+static opencl_context_map_entry_t *
+opencl_cl_context_map_entry_alloc()
+{
+  return st_alloc(&free_list);  // obtain a node through the typed splay allocator, using this map's free list
+}
+
+
+static opencl_context_map_entry_t *
+opencl_cl_context_map_entry_new
+(
+ uint64_t cl_context_id,
+ uint32_t stream_id
+)
+{
+  // take a node from the allocator, then fill in the key and its payload;
+  // the left/right links are not written here
+  opencl_context_map_entry_t *fresh_entry = opencl_cl_context_map_entry_alloc();
+  fresh_entry->stream_id = stream_id;
+  fresh_entry->cl_context_id = cl_context_id;
+  return fresh_entry;
+}
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_context_map_entry_t *
+opencl_cl_context_map_lookup
+(
+ uint64_t cl_context_id
+)
+{
+  // st_lookup receives &map_root and so may update the root; the search is
+  // therefore done under the map lock. Result is NULL-or-entry as returned.
+  opencl_context_map_entry_t *found;
+  spinlock_lock(&opencl_context_map_lock);
+  found = st_lookup(&map_root, cl_context_id);
+  spinlock_unlock(&opencl_context_map_lock);
+
+  return found;
+}
+
+
+void
+opencl_cl_context_map_insert
+(
+ uint64_t cl_context_id,
+ uint32_t stream_id
+)
+{
+  // unknown context: link a new node; known context: overwrite its fields
+  spinlock_lock(&opencl_context_map_lock);
+
+  opencl_context_map_entry_t *existing = st_lookup(&map_root, cl_context_id);
+  if (existing == NULL) {
+    opencl_context_map_entry_t *fresh =
+      opencl_cl_context_map_entry_new(cl_context_id, stream_id);
+    st_insert(&map_root, fresh);
+  } else {
+    existing->cl_context_id = cl_context_id;
+    existing->stream_id = stream_id;
+  }
+
+  spinlock_unlock(&opencl_context_map_lock);
+}
+
+
+void
+opencl_cl_context_map_delete
+(
+ uint64_t cl_context_id
+)
+{
+  spinlock_lock(&opencl_context_map_lock);
+  opencl_context_map_entry_t *node = st_delete(&map_root, cl_context_id);
+  // NOTE(review): assumes st_delete yields NULL for an unknown key -- confirm;
+  // in that case nothing is recycled, so NULL never reaches the free list
+  if (node) { st_free(&free_list, node); }
+  spinlock_unlock(&opencl_context_map_lock);
+}
+
+
+uint64_t
+opencl_cl_context_map_entry_cl_context_id_get
+(
+ opencl_context_map_entry_t *entry
+)
+{
+  return entry->cl_context_id;  // entry is dereferenced unconditionally: callers must pass a non-NULL entry
+}
+
+
+uint32_t
+opencl_cl_context_map_entry_stream_get
+(
+ opencl_context_map_entry_t *entry
+)
+{
+  return entry->stream_id;  // entry is dereferenced unconditionally: callers must pass a non-NULL entry
+}
+
+
+
+//*****************************************************************************
+// debugging code
+//*****************************************************************************
+
+uint64_t
+opencl_cl_context_map_count
+(
+ void
+)
+{
+  return st_count(map_root);  // debugging aid; map_root is read here without taking opencl_context_map_lock
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-context-map.h b/src/tool/hpcrun/gpu/opencl/opencl-context-map.h
new file mode 100644
index 0000000000..a4c4a09561
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-context-map.h
@@ -0,0 +1,111 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef opencl_context_map_h
+#define opencl_context_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct opencl_context_map_entry_t opencl_context_map_entry_t;
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_context_map_entry_t *
+opencl_cl_context_map_lookup
+(
+ uint64_t
+);
+
+
+void
+opencl_cl_context_map_insert
+(
+ uint64_t, 
+ uint32_t
+);
+
+
+void
+opencl_cl_context_map_delete
+(
+ uint64_t
+);
+
+
+uint64_t
+opencl_cl_context_map_entry_cl_context_id_get
+(
+ opencl_context_map_entry_t *entry
+);
+
+
+uint32_t
+opencl_cl_context_map_entry_stream_get
+(
+ opencl_context_map_entry_t *
+);
+
+// debugging: reports st_count() of the context map's tree
+uint64_t
+opencl_cl_context_map_count
+(
+ void
+);
+
+#endif
+
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
index cfcbfe6830..49aa3c1f94 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
@@ -56,12 +56,13 @@
 
 #include <lib/prof-lean/splay-uint64.h>
 #include <lib/prof-lean/spinlock.h>
-
-#include "opencl-h2d-map.h"
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-splay-allocator.h>
 #include <hpcrun/gpu/gpu-op-placeholders.h>
 
+#include "opencl-h2d-map.h"
+
+
 
 //*****************************************************************************
 // macros
@@ -111,6 +112,7 @@ typedef struct typed_splay_node(correlation_id) {
   size_t size;
   uint64_t start_time;
   uint64_t end_time;
+  opencl_object_t *cb_info; // set from the cb_info argument on insert
 } typed_splay_node(correlation_id); 
 
 
@@ -145,7 +147,8 @@ opencl_h2d_map_entry_new
  uint64_t correlation_id,
  size_t size,
  uint64_t start_time,
- uint64_t end_time
+ uint64_t end_time,
+ opencl_object_t *cb_info
 )
 {
   opencl_h2d_map_entry_t *e = opencl_h2d_map_entry_alloc();
@@ -155,6 +158,7 @@ opencl_h2d_map_entry_new
   e->size = size;
   e->start_time = start_time;
   e->end_time = end_time;
+  e->cb_info = cb_info;
 
   return e;
 }
@@ -188,7 +192,8 @@ opencl_h2d_map_insert
  uint64_t correlation_id, 
  size_t size,
  uint64_t start_time,
- uint64_t end_time
+ uint64_t end_time,
+ opencl_object_t *cb_info
 )
 {
   spinlock_lock(&opencl_h2d_map_lock);
@@ -199,9 +204,10 @@ opencl_h2d_map_insert
     entry->size = size;
     entry->start_time = start_time;
     entry->end_time = end_time;
+    entry->cb_info = cb_info;
   } else {
     opencl_h2d_map_entry_t *entry = 
-      opencl_h2d_map_entry_new(buffer_id, correlation_id, size, start_time, end_time);
+      opencl_h2d_map_entry_new(buffer_id, correlation_id, size, start_time, end_time, cb_info);
 
     st_insert(&map_root, entry);
   }
@@ -275,6 +281,16 @@ opencl_h2d_map_entry_end_time_get
 }
 
 
+opencl_object_t *
+opencl_h2d_map_entry_callback_info_get
+(
+ opencl_h2d_map_entry_t *entry
+)
+{
+  return entry->cb_info; // accessor; entry is dereferenced unchecked
+}
+
+
 void
 opencl_update_ccts_for_h2d_nodes
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
index 68d9ecdc38..58e72b8f63 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
@@ -52,6 +52,15 @@
 #include <stdint.h>
 
 
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include "opencl-memory-manager.h"
+
+
+
 //*****************************************************************************
 // type definitions 
 //*****************************************************************************
@@ -86,7 +95,8 @@ opencl_h2d_map_insert
  uint64_t, 
  size_t,
  uint64_t,
- uint64_t
+ uint64_t,
+ opencl_object_t *
 );
 
 
@@ -132,6 +142,13 @@ opencl_h2d_map_entry_end_time_get
 );
 
 
+opencl_object_t *
+opencl_h2d_map_entry_callback_info_get
+(
+ opencl_h2d_map_entry_t *entry
+);
+
+
 void
 opencl_update_ccts_for_h2d_nodes
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c b/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c
new file mode 100644
index 0000000000..930283fa08
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c
@@ -0,0 +1,262 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <assert.h>
+#include <string.h>
+
+
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+#include "opencl-kernel-map.h"
+
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define DEBUG 0
+
+#include "../gpu-print.h"
+
+
+#define st_insert				\
+  typed_splay_insert(correlation_id)
+
+#define st_lookup				\
+  typed_splay_lookup(correlation_id)
+
+#define st_delete				\
+  typed_splay_delete(correlation_id)
+
+#define st_forall				\
+  typed_splay_forall(correlation_id)
+
+#define st_count				\
+  typed_splay_count(correlation_id)
+
+#define st_alloc(free_list)			\
+  typed_splay_alloc(free_list, opencl_kernel_map_entry_t)
+
+#define st_free(free_list, node)		\
+  typed_splay_free(free_list, node)
+
+
+
+//*****************************************************************************
+// type declarations
+//*****************************************************************************
+
+#undef typed_splay_node
+#define typed_splay_node(correlation_id) opencl_kernel_map_entry_t
+// Map node; the macro above makes this name expand to opencl_kernel_map_entry_t.
+typedef struct typed_splay_node(correlation_id) {
+  struct typed_splay_node(correlation_id) *left;
+  struct typed_splay_node(correlation_id) *right;
+  uint64_t cl_kernel_id; // key
+  // values recorded for the kernel id by opencl_cl_kernel_map_insert
+  uint32_t context_id;
+  uint32_t stream_id;
+} typed_splay_node(correlation_id); 
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static opencl_kernel_map_entry_t *map_root = NULL;
+
+static opencl_kernel_map_entry_t *free_list = NULL;
+
+static spinlock_t opencl_kernel_map_lock = SPINLOCK_UNLOCKED;
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
+typed_splay_impl(correlation_id)
+
+// Obtain a node via st_alloc, which is handed the file-local free_list.
+static opencl_kernel_map_entry_t *
+opencl_cl_kernel_map_entry_alloc(void)
+{
+  return st_alloc(&free_list);
+}
+
+
+static opencl_kernel_map_entry_t *
+opencl_cl_kernel_map_entry_new
+(
+ uint64_t cl_kernel_id,
+ uint32_t context_id,
+ uint32_t stream_id
+)
+{
+  // take a node from the allocator, then fill in the key and its payload
+  opencl_kernel_map_entry_t *node = opencl_cl_kernel_map_entry_alloc();
+
+  node->stream_id = stream_id;
+  node->context_id = context_id;
+  node->cl_kernel_id = cl_kernel_id;
+  return node;
+}
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_kernel_map_entry_t *
+opencl_cl_kernel_map_lookup
+(
+ uint64_t cl_kernel_id
+)
+{
+  opencl_kernel_map_entry_t *found;
+
+  // search while holding the map lock; st_lookup is handed &map_root
+  spinlock_lock(&opencl_kernel_map_lock);
+  found = st_lookup(&map_root, cl_kernel_id);
+  spinlock_unlock(&opencl_kernel_map_lock);
+
+  return found;
+}
+
+
+void
+opencl_cl_kernel_map_insert
+(
+ uint64_t cl_kernel_id,
+ uint32_t context_id,
+ uint32_t stream_id
+)
+{
+  // update-or-insert, performed entirely under the map lock
+  spinlock_lock(&opencl_kernel_map_lock);
+  opencl_kernel_map_entry_t *existing = st_lookup(&map_root, cl_kernel_id);
+  if (!existing) {
+    // nothing stored for this id yet: build an entry and add it to the tree
+    opencl_kernel_map_entry_t *fresh =
+      opencl_cl_kernel_map_entry_new(cl_kernel_id, context_id, stream_id);
+    st_insert(&map_root, fresh);
+  } else {
+    // an entry is already present: refresh its payload in place
+    existing->context_id = context_id;
+    existing->stream_id = stream_id;
+  }
+  spinlock_unlock(&opencl_kernel_map_lock);
+}
+
+
+void
+opencl_cl_kernel_map_delete
+(
+ uint64_t cl_kernel_id
+)
+{
+  spinlock_lock(&opencl_kernel_map_lock);
+
+  // NOTE(review): st_delete presumably yields NULL for an unknown id -- confirm
+  opencl_kernel_map_entry_t *node = st_delete(&map_root, cl_kernel_id);
+  if (node) { st_free(&free_list, node); }  // only recycle a node we removed
+  spinlock_unlock(&opencl_kernel_map_lock);
+}
+
+// Accessor: returns entry->cl_kernel_id; entry is dereferenced unchecked.
+uint64_t
+opencl_cl_kernel_map_entry_cl_kernel_id_get
+(
+ opencl_kernel_map_entry_t *entry
+)
+{
+  return entry->cl_kernel_id;
+}
+
+// Accessor: returns entry->context_id; entry is dereferenced unchecked.
+uint32_t
+opencl_cl_kernel_map_entry_context_get
+(
+ opencl_kernel_map_entry_t *entry
+)
+{
+  return entry->context_id;
+}
+
+// Accessor: returns entry->stream_id; entry is dereferenced unchecked.
+uint32_t
+opencl_cl_kernel_map_entry_stream_get
+(
+ opencl_kernel_map_entry_t *entry
+)
+{
+  return entry->stream_id;
+}
+
+
+
+//*****************************************************************************
+// debugging code
+//*****************************************************************************
+
+uint64_t
+opencl_cl_kernel_map_count
+(
+ void
+)
+{
+  return st_count(map_root); // map_root is read here without taking the map lock
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h b/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h
new file mode 100644
index 0000000000..0edcd209cb
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h
@@ -0,0 +1,119 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef opencl_kernel_map_h
+#define opencl_kernel_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct opencl_kernel_map_entry_t opencl_kernel_map_entry_t;
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_kernel_map_entry_t *
+opencl_cl_kernel_map_lookup
+(
+ uint64_t
+);
+
+
+void
+opencl_cl_kernel_map_insert
+(
+ uint64_t, 
+ uint32_t,
+ uint32_t
+);
+
+
+void
+opencl_cl_kernel_map_delete
+(
+ uint64_t
+);
+
+
+uint64_t
+opencl_cl_kernel_map_entry_cl_kernel_id_get
+(
+ opencl_kernel_map_entry_t *entry
+);
+
+
+uint32_t
+opencl_cl_kernel_map_entry_context_get
+(
+ opencl_kernel_map_entry_t *
+);
+
+
+uint32_t
+opencl_cl_kernel_map_entry_stream_get
+(
+ opencl_kernel_map_entry_t *
+);
+
+// debugging: reports st_count() of the kernel map's tree
+uint64_t
+opencl_cl_kernel_map_count
+(
+ void
+);
+
+#endif
+
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index 2ea1919726..1c3b43022a 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -104,6 +104,8 @@ typedef struct opencl_object_details_t {
     cl_memory_callback_t mem_cb;
   };
   gpu_activity_channel_t *initiator_channel;
+  uint32_t context_id;
+  uint32_t stream_id;
   cct_node_t *cct_node;
   uint64_t submit_time;
 } opencl_object_details_t;

From a72260217f6fedaaec98c9267a72dd30ab5edc4d Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Thu, 8 Oct 2020 23:39:51 -0500
Subject: [PATCH 088/177] fixing error with stream id in opencl-api.c

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 45cc3edfa1..b6d674cf56 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -1234,7 +1234,7 @@ clSetKernelArg
 
   uint32_t context_id = (uint32_t)(*context);
   opencl_context_map_entry_t *ce = opencl_cl_context_map_lookup((uint64_t)(*context));
-  uint32_t stream_id = (*ce).stream_id;
+  uint32_t stream_id = opencl_cl_context_map_entry_stream_get(ce); // ce: context-map entry
 
   uint64_t buffer_id = opencl_get_buffer_id(arg_value);
 	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);

From a17e84c73b5880ba94e443eee1e7d6310daf7165 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Fri, 9 Oct 2020 14:05:51 -0500
Subject: [PATCH 089/177] fix opencl_translate

---
 .../hpcrun/gpu/gpu-operation-item-process.c   |  2 +-
 .../gpu/opencl/opencl-activity-translate.c    | 88 ++++++++++++++-----
 .../gpu/opencl/opencl-activity-translate.h    | 15 ++--
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  9 +-
 4 files changed, 78 insertions(+), 36 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index 8b26ab71a5..666ca73483 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -87,7 +87,7 @@ gpu_context_stream_trace
  gpu_trace_item_t *ti
 )
 {
-  gpu_context_id_map_stream_process(context_id, stream_id, gpu_trace_produce, ti);
+//  gpu_context_id_map_stream_process(context_id, stream_id, gpu_trace_produce, ti);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index e2b4b22425..0ae2aa7fc7 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -70,7 +70,28 @@ convert_kernel_launch
 (
   gpu_activity_t *ga,
   opencl_object_t *cb_data,
-  cl_event event
+  uint64_t start_time,
+  uint64_t end_time
+)
+{
+  memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
+  set_gpu_interval(&ga->details.interval, start_time, end_time);
+
+
+  ga->kind     = cb_data->kind;
+  ga->cct_node = cb_data->details.cct_node;
+
+  ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
+  ga->details.kernel.submit_time    = cb_data->details.submit_time;
+}
+
+
+static void
+convert_kernel_launch_event
+(
+gpu_activity_t *ga,
+opencl_object_t *cb_data,
+cl_event event
 )
 {
   memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
@@ -89,11 +110,12 @@ convert_memcpy
 (
   gpu_activity_t *ga,
   opencl_object_t *cb_data,
-  cl_event event
+  uint64_t start_time,
+  uint64_t end_time
 )
 {
   memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
-  opencl_timing_info_get(&ga->details.interval, event);
+  set_gpu_interval(&ga->details.interval, start_time, end_time);
 
   ga->kind     = cb_data->kind;
   ga->cct_node = cb_data->details.cct_node;
@@ -105,6 +127,25 @@ convert_memcpy
 }
 
 
+static void
+convert_memcpy_event
+(
+gpu_activity_t *ga,
+opencl_object_t *cb_data,
+cl_event event
+)
+{
+  memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
+  opencl_timing_info_get(&ga->details.interval, event);
+
+  ga->kind     = cb_data->kind;
+  ga->cct_node = cb_data->details.cct_node;
+
+  ga->details.memcpy.correlation_id  = cb_data->details.mem_cb.correlation_id;
+  ga->details.memcpy.submit_time     = cb_data->details.submit_time;
+  ga->details.memcpy.bytes           = cb_data->details.mem_cb.size;
+  ga->details.memcpy.copyKind        = cb_data->details.mem_cb.type;
+}
 
 //******************************************************************************
 // interface operations
@@ -114,17 +155,18 @@ void
 opencl_activity_translate
 (
   gpu_activity_t *ga,
-  cl_event event,
-  opencl_object_t *cb_data
+  opencl_object_t *cb_data,
+  uint64_t start_time,
+  uint64_t end_time
 )
 {
   switch (cb_data->kind) {
     case GPU_ACTIVITY_MEMCPY:
-      convert_memcpy(ga, cb_data, event);
+      convert_memcpy(ga, cb_data, start_time, end_time);
       break;
 
     case GPU_ACTIVITY_KERNEL:
-      convert_kernel_launch(ga, cb_data, event);
+      convert_kernel_launch(ga, cb_data, start_time, end_time);
       break;
 
     default:
@@ -135,24 +177,24 @@ opencl_activity_translate
 
 
 void
-opencl_clSetKernelArg_activity_translate
+opencl_activity_translate_event
 (
-	gpu_activity_t *ga,
-	uint64_t correlation_id,
-	uint32_t context_id,
-	uint32_t stream_id,
-	size_t size,
-	uint64_t start_time,
-	uint64_t end_time
+gpu_activity_t *ga,
+cl_event event,
+opencl_object_t *cb_data
 )
 {
-  ga->details.memcpy.correlation_id = correlation_id;
-  ga->details.memcpy.bytes = size;
-  ga->details.memcpy.copyKind = GPU_MEMCPY_H2D;
-  ga->details.memcpy.context_id = context_id;
-  ga->details.memcpy.stream_id = stream_id;
-  
-  ga->kind = GPU_ACTIVITY_MEMCPY;
-  set_gpu_interval(&ga->details.interval, start_time, end_time);
+  switch (cb_data->kind) {
+    case GPU_ACTIVITY_MEMCPY:
+      convert_memcpy_event(ga, cb_data, event);
+      break;
+
+    case GPU_ACTIVITY_KERNEL:
+      convert_kernel_launch_event(ga, cb_data, event);
+      break;
+
+    default:
+      assert(0);
+  }
   cstack_ptr_set(&(ga->next), 0);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index 43aafa3a80..c92aa64213 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -73,14 +73,13 @@ opencl_activity_translate
 
 
 void
-opencl_clSetKernelArg_activity_translate
+opencl_activity_translate_event
 (
-	gpu_activity_t *,
-	uint64_t,
-	uint32_t,
-	uint32_t,
-	size_t,
-	uint64_t,
-	uint64_t
+ gpu_activity_t *ga,
+ cl_event event,
+ opencl_object_t *cb_data
 );
+
+
+
 #endif  //_OPENCL_ACTIVITY_TRANSLATE_H_
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index b6d674cf56..744e3afd6a 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -454,7 +454,7 @@ opencl_activity_process
 )
 {
   gpu_activity_t gpu_activity;
-  opencl_activity_translate(&gpu_activity, event, cb_data);
+  opencl_activity_translate_event(&gpu_activity, event, cb_data);
 
   if (gpu_activity_multiplexer_my_channel_initialized() == false){
     gpu_activity_multiplexer_my_channel_init();
@@ -474,9 +474,10 @@ opencl_clSetKernelArg_activity_process
   uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
 	size_t size = opencl_h2d_map_entry_size_get(entry); 
 	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
-	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry); 
-  opencl_clSetKernelArg_activity_translate(&gpu_activity, correlation_id, cb_data->details.context_id,
-      cb_data->details.stream_id, size, start_time, end_time);
+	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry);
+
+  cb_data->details.ker_cb.correlation_id = correlation_id;
+  opencl_activity_translate(&gpu_activity, cb_data, start_time, end_time);
   
   if (gpu_activity_multiplexer_my_channel_initialized() == false){
     gpu_activity_multiplexer_my_channel_init();

From 01cc72f7c61cf6851f74766ed81910b3701f0b5e Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Fri, 9 Oct 2020 14:18:53 -0500
Subject: [PATCH 090/177] fix opencl_translate

---
 src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index c92aa64213..e38f1c4364 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -67,8 +67,9 @@ void
 opencl_activity_translate
 (
   gpu_activity_t *ga,
-  cl_event event,
-  opencl_object_t *cb_data
+  opencl_object_t *cb_data,
+  uint64_t start_time,
+  uint64_t end_time
 );
 
 

From d8fd4c67b193c7aecaaf4598201344c6490335dc Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Fri, 9 Oct 2020 18:18:04 -0500
Subject: [PATCH 091/177] working version: opencl_activity_translate:
 gpu_start_time = gpu_submit_time

---
 src/tool/hpcrun/gpu/gpu-operation-channel.c   |  2 +-
 .../hpcrun/gpu/gpu-operation-item-process.c   |  2 +-
 src/tool/hpcrun/gpu/gpu-trace-channel.c       |  3 -
 src/tool/hpcrun/gpu/gpu-trace.c               |  5 +-
 .../gpu/opencl/opencl-activity-translate.c    | 74 ++++++-------------
 .../gpu/opencl/opencl-activity-translate.h    |  4 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  4 +-
 7 files changed, 29 insertions(+), 65 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel.c b/src/tool/hpcrun/gpu/gpu-operation-channel.c
index bb760fa612..e933328464 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel.c
@@ -256,4 +256,4 @@ gpu_operation_channel_t *channel
 )
 {
   pthread_cond_signal(&channel->cond);
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index 666ca73483..8b26ab71a5 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -87,7 +87,7 @@ gpu_context_stream_trace
  gpu_trace_item_t *ti
 )
 {
-//  gpu_context_id_map_stream_process(context_id, stream_id, gpu_trace_produce, ti);
+  gpu_context_id_map_stream_process(context_id, stream_id, gpu_trace_produce, ti);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-channel.c b/src/tool/hpcrun/gpu/gpu-trace-channel.c
index 39472bfc43..26816756e1 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-channel.c
@@ -216,9 +216,6 @@ gpu_trace_channel_consume
  gpu_trace_channel_t *channel
 )
 {
-  PRINT("gpu_trace_channel_consume:: channel_count = %u, channel_td = %p, last_time = %lu\n", channel->count,
-        channel->td, channel->td->last_time_us);
-
 
   hpcrun_set_thread_data(channel->td);
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index fd2d8931a0..1633ad1e38 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -323,9 +323,6 @@ gpu_trace_record
     //getting data from a trace channel
     gpu_trace_channel_set_process(channel_set);
 
-    PRINT("TraceRecord_processed: thread: %ld, set_index = %d\n",
-           gpu_trace_channel_set_get_thread(channel_set),
-           gpu_trace_channel_set_get_channel_num(channel_set));
   }
 
   gpu_trace_channel_set_process(channel_set);
@@ -460,4 +457,4 @@ gpu_trace_stream_release
   hpcrun_write_profile_data(&td->core_profile_trace_data);
   hpcrun_trace_close(&td->core_profile_trace_data);
   atomic_fetch_add(&active_streams_counter, -1);
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 0ae2aa7fc7..3f6dc44a42 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -83,25 +83,7 @@ convert_kernel_launch
 
   ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
   ga->details.kernel.submit_time    = cb_data->details.submit_time;
-}
-
-
-static void
-convert_kernel_launch_event
-(
-gpu_activity_t *ga,
-opencl_object_t *cb_data,
-cl_event event
-)
-{
-  memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
-  opencl_timing_info_get(&ga->details.interval, event);
-
-  ga->kind     = cb_data->kind;
-  ga->cct_node = cb_data->details.cct_node;
 
-  ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
-  ga->details.kernel.submit_time    = cb_data->details.submit_time;
 }
 
 
@@ -127,26 +109,6 @@ convert_memcpy
 }
 
 
-static void
-convert_memcpy_event
-(
-gpu_activity_t *ga,
-opencl_object_t *cb_data,
-cl_event event
-)
-{
-  memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
-  opencl_timing_info_get(&ga->details.interval, event);
-
-  ga->kind     = cb_data->kind;
-  ga->cct_node = cb_data->details.cct_node;
-
-  ga->details.memcpy.correlation_id  = cb_data->details.mem_cb.correlation_id;
-  ga->details.memcpy.submit_time     = cb_data->details.submit_time;
-  ga->details.memcpy.bytes           = cb_data->details.mem_cb.size;
-  ga->details.memcpy.copyKind        = cb_data->details.mem_cb.type;
-}
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -160,6 +122,8 @@ opencl_activity_translate
   uint64_t end_time
 )
 {
+
+
   switch (cb_data->kind) {
     case GPU_ACTIVITY_MEMCPY:
       convert_memcpy(ga, cb_data, start_time, end_time);
@@ -172,6 +136,17 @@ opencl_activity_translate
     default:
       assert(0);
   }
+
+
+  uint64_t diff = end_time - start_time;
+  uint64_t gpu_time_offset = 0;
+
+  ga->details.interval.start = ga->details.kernel.submit_time + gpu_time_offset;
+  ga->details.interval.end = ga->details.kernel.start + diff;
+
+
+
+
   cstack_ptr_set(&(ga->next), 0);
 }
 
@@ -179,22 +154,17 @@ opencl_activity_translate
 void
 opencl_activity_translate_event
 (
-gpu_activity_t *ga,
-cl_event event,
-opencl_object_t *cb_data
+ gpu_activity_t *ga,
+ opencl_object_t *cb_data,
+ cl_event event
 )
 {
-  switch (cb_data->kind) {
-    case GPU_ACTIVITY_MEMCPY:
-      convert_memcpy_event(ga, cb_data, event);
-      break;
 
-    case GPU_ACTIVITY_KERNEL:
-      convert_kernel_launch_event(ga, cb_data, event);
-      break;
+  gpu_interval_t interval;
+  memset(&interval, 0, sizeof(gpu_interval_t));
+
+  opencl_timing_info_get(&interval, event);
+
+  opencl_activity_translate(ga, cb_data, interval.start, interval.end);
 
-    default:
-      assert(0);
-  }
-  cstack_ptr_set(&(ga->next), 0);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index e38f1c4364..b533544187 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -77,8 +77,8 @@ void
 opencl_activity_translate_event
 (
  gpu_activity_t *ga,
- cl_event event,
- opencl_object_t *cb_data
+ opencl_object_t *cb_data,
+ cl_event event
 );
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 744e3afd6a..3291e9329d 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -454,7 +454,7 @@ opencl_activity_process
 )
 {
   gpu_activity_t gpu_activity;
-  opencl_activity_translate_event(&gpu_activity, event, cb_data);
+  opencl_activity_translate_event(&gpu_activity, cb_data, event);
 
   if (gpu_activity_multiplexer_my_channel_initialized() == false){
     gpu_activity_multiplexer_my_channel_init();
@@ -1284,4 +1284,4 @@ void *args
 {
   opencl_api_thread_finalize(NULL);
   gpu_activity_multiplexer_fini();
-}
\ No newline at end of file
+}

From 020f32d18393ebca8ebb98b3b23c0d3dde225231 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Sun, 11 Oct 2020 19:01:59 -0500
Subject: [PATCH 092/177] refactored opencl-activity-translate. Duplicated
 function code has been removed

---
 .../gpu/opencl/opencl-activity-translate.c    | 38 ++-----------------
 .../gpu/opencl/opencl-activity-translate.h    | 14 +------
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 14 ++++++-
 3 files changed, 16 insertions(+), 50 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 3f6dc44a42..ffc5a91025 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -118,53 +118,21 @@ opencl_activity_translate
 (
   gpu_activity_t *ga,
   opencl_object_t *cb_data,
-  uint64_t start_time,
-  uint64_t end_time
+  gpu_interval_t interval
 )
 {
-
-
   switch (cb_data->kind) {
     case GPU_ACTIVITY_MEMCPY:
-      convert_memcpy(ga, cb_data, start_time, end_time);
+      convert_memcpy(ga, cb_data, interval.start, interval.end);
       break;
 
     case GPU_ACTIVITY_KERNEL:
-      convert_kernel_launch(ga, cb_data, start_time, end_time);
+      convert_kernel_launch(ga, cb_data, interval.start, interval.end);
       break;
 
     default:
       assert(0);
   }
 
-
-  uint64_t diff = end_time - start_time;
-  uint64_t gpu_time_offset = 0;
-
-  ga->details.interval.start = ga->details.kernel.submit_time + gpu_time_offset;
-  ga->details.interval.end = ga->details.kernel.start + diff;
-
-
-
-
   cstack_ptr_set(&(ga->next), 0);
 }
-
-
-void
-opencl_activity_translate_event
-(
- gpu_activity_t *ga,
- opencl_object_t *cb_data,
- cl_event event
-)
-{
-
-  gpu_interval_t interval;
-  memset(&interval, 0, sizeof(gpu_interval_t));
-
-  opencl_timing_info_get(&interval, event);
-
-  opencl_activity_translate(ga, cb_data, interval.start, interval.end);
-
-}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
index b533544187..1f8908f0f0 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.h
@@ -68,19 +68,7 @@ opencl_activity_translate
 (
   gpu_activity_t *ga,
   opencl_object_t *cb_data,
-  uint64_t start_time,
-  uint64_t end_time
+  gpu_interval_t interval
 );
 
-
-void
-opencl_activity_translate_event
-(
- gpu_activity_t *ga,
- opencl_object_t *cb_data,
- cl_event event
-);
-
-
-
 #endif  //_OPENCL_ACTIVITY_TRANSLATE_H_
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 3291e9329d..85e2062759 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -454,7 +454,12 @@ opencl_activity_process
 )
 {
   gpu_activity_t gpu_activity;
-  opencl_activity_translate_event(&gpu_activity, cb_data, event);
+
+  gpu_interval_t interval;
+  memset(&interval, 0, sizeof(gpu_interval_t));
+  opencl_timing_info_get(&interval, event);
+  
+  opencl_activity_translate(&gpu_activity, cb_data, interval);
 
   if (gpu_activity_multiplexer_my_channel_initialized() == false){
     gpu_activity_multiplexer_my_channel_init();
@@ -477,7 +482,12 @@ opencl_clSetKernelArg_activity_process
 	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry);
 
   cb_data->details.ker_cb.correlation_id = correlation_id;
-  opencl_activity_translate(&gpu_activity, cb_data, start_time, end_time);
+
+  gpu_interval_t interval;
+  memset(&interval, 0, sizeof(gpu_interval_t));
+  set_gpu_interval(&interval, start_time, end_time);
+
+  opencl_activity_translate(&gpu_activity, cb_data, interval);
   
   if (gpu_activity_multiplexer_my_channel_initialized() == false){
     gpu_activity_multiplexer_my_channel_init();

From d6680f6ef4e32fbffe05620f960e3f860a9e85d7 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Sun, 11 Oct 2020 19:59:12 -0500
Subject: [PATCH 093/177] removing start_time, end_time for the memory
 transfers that happen as part of opencl clSetKernelArg

---
 .../gpu/opencl/opencl-activity-translate.c    |  9 ++++--
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 19 +++++------
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c   | 32 +------------------
 src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h   | 16 ----------
 4 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index ffc5a91025..41a8d5a086 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -75,8 +75,9 @@ convert_kernel_launch
 )
 {
   memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
-  set_gpu_interval(&ga->details.interval, start_time, end_time);
-
+  if (start_time != 0 && end_time != 0) {
+    set_gpu_interval(&ga->details.interval, start_time, end_time);
+  }
 
   ga->kind     = cb_data->kind;
   ga->cct_node = cb_data->details.cct_node;
@@ -97,7 +98,9 @@ convert_memcpy
 )
 {
   memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
-  set_gpu_interval(&ga->details.interval, start_time, end_time);
+  if (start_time != 0 && end_time != 0) {
+    set_gpu_interval(&ga->details.interval, start_time, end_time);
+  }
 
   ga->kind     = cb_data->kind;
   ga->cct_node = cb_data->details.cct_node;
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 85e2062759..4ef913475c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -478,14 +478,10 @@ opencl_clSetKernelArg_activity_process
   gpu_activity_t gpu_activity;
   uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
 	size_t size = opencl_h2d_map_entry_size_get(entry); 
-	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
-	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry);
-
   cb_data->details.ker_cb.correlation_id = correlation_id;
 
   gpu_interval_t interval;
   memset(&interval, 0, sizeof(gpu_interval_t));
-  set_gpu_interval(&interval, start_time, end_time);
 
   opencl_activity_translate(&gpu_activity, cb_data, interval);
   
@@ -554,9 +550,6 @@ add_H2D_metrics_to_cct_node
 	// 	return;
 	// }
 
-	uint64_t start_time = opencl_h2d_map_entry_start_time_get(entry); 
-	uint64_t end_time = opencl_h2d_map_entry_end_time_get(entry);
-	ETMSG(OPENCL, "duration [%"PRIu64", %"PRIu64"]",start_time, end_time); 
 	//opencl_activity_completion_notify();
   opencl_object_t *cb_data = opencl_h2d_map_entry_callback_info_get(entry);
   cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
@@ -1210,7 +1203,7 @@ clCreateBuffer
     HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
   uint64_t buffer_id = (uint64_t)buffer; 
   //ETMSG(OPENCL, "inside clCreateBuffer wrapper. cl_mem buffer: %p. buffer_id: %"PRIu64"", buffer, buffer_id);
-	opencl_h2d_map_insert(buffer_id, correlation_id, size, 0, 0, NULL);
+	opencl_h2d_map_insert(buffer_id, correlation_id, size, NULL);
   
   return buffer;
 }
@@ -1227,14 +1220,12 @@ clSetKernelArg
 {
 	bool isClBuffer = opencl_isClArgBuffer(arg_value);
   //ETMSG(OPENCL, "inside clSetKernelArg wrapper."); //isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value
-	uint64_t start_time = hpcrun_nanotime();
 
   cl_int return_status = 
     HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value));
 	if (!isClBuffer) {
 		return return_status;	
 	}
-  uint64_t end_time = hpcrun_nanotime();
 
   size_t context_size;
   cl_int STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, 0, NULL, &context_size);
@@ -1257,7 +1248,13 @@ clSetKernelArg
     initializeClSetKernelArgMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, correlation_id, context_id, stream_id);
     opencl_subscriber_callback(mem_info);
 
-  	opencl_h2d_map_insert(buffer_id,correlation_id, size, start_time, end_time, mem_info);
+    /* There is no way to record start_time, end_time for the memory transfer that happens as part of clSetKernelArg
+      This is because clSetKernelArg sets argument for a kernel in a context. But in a context, there can be multiple
+      device-queue pairs and opencl does not provide events or listeners to the queue so that we can read the memory operations.
+      Since the memory transfer is async, there are no event handles and we dont know which device is the receiver;
+      the timing information cannot be calculated.
+    */
+  	opencl_h2d_map_insert(buffer_id,correlation_id, size, mem_info);
 	}
   return return_status;
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
index 49aa3c1f94..ec0bf8d665 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.c
@@ -110,8 +110,6 @@ typedef struct typed_splay_node(correlation_id) {
 
   uint64_t corr_id;
   size_t size;
-  uint64_t start_time;
-  uint64_t end_time;
   opencl_object_t *cb_info
 } typed_splay_node(correlation_id); 
 
@@ -146,8 +144,6 @@ opencl_h2d_map_entry_new
  uint64_t buffer_id,
  uint64_t correlation_id,
  size_t size,
- uint64_t start_time,
- uint64_t end_time,
  opencl_object_t *cb_info
 )
 {
@@ -156,8 +152,6 @@ opencl_h2d_map_entry_new
   e->buffer_id = buffer_id;
   e->corr_id = correlation_id;
   e->size = size;
-  e->start_time = start_time;
-  e->end_time = end_time;
   e->cb_info = cb_info;
 
   return e;
@@ -191,8 +185,6 @@ opencl_h2d_map_insert
  uint64_t buffer_id, 
  uint64_t correlation_id, 
  size_t size,
- uint64_t start_time,
- uint64_t end_time,
  opencl_object_t *cb_info
 )
 {
@@ -202,12 +194,10 @@ opencl_h2d_map_insert
   if (entry) {
     entry->corr_id = correlation_id;
     entry->size = size;
-    entry->start_time = start_time;
-    entry->end_time = end_time;
     entry->cb_info = cb_info;
   } else {
     opencl_h2d_map_entry_t *entry = 
-      opencl_h2d_map_entry_new(buffer_id, correlation_id, size, start_time, end_time, cb_info);
+      opencl_h2d_map_entry_new(buffer_id, correlation_id, size, cb_info);
 
     st_insert(&map_root, entry);
   }
@@ -261,26 +251,6 @@ opencl_h2d_map_entry_size_get
 }
 
 
-uint64_t
-opencl_h2d_map_entry_start_time_get
-(
- opencl_h2d_map_entry_t *entry
-)
-{
-  return entry->start_time;
-}
-
-
-uint64_t
-opencl_h2d_map_entry_end_time_get
-(
- opencl_h2d_map_entry_t *entry
-)
-{
-  return entry->end_time;
-}
-
-
 opencl_object_t *
 opencl_h2d_map_entry_callback_info_get
 (
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
index 58e72b8f63..8be708ebc4 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-h2d-map.h
@@ -94,8 +94,6 @@ opencl_h2d_map_insert
  uint64_t, 
  uint64_t, 
  size_t,
- uint64_t,
- uint64_t,
  opencl_object_t *
 );
 
@@ -128,20 +126,6 @@ opencl_h2d_map_entry_size_get
 );
 
 
-uint64_t
-opencl_h2d_map_entry_start_time_get
-(
- opencl_h2d_map_entry_t *
-);
-
-
-uint64_t
-opencl_h2d_map_entry_end_time_get
-(
- opencl_h2d_map_entry_t *
-);
-
-
 opencl_object_t *
 opencl_h2d_map_entry_callback_info_get
 (

From f86ebe3441c8d82f71ba5c17d8700e4ca3a4ce12 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Mon, 12 Oct 2020 14:33:15 -0500
Subject: [PATCH 094/177] adding context_id and stream_id for opencl memory
 transfers and kernel executions. hpctraceviewer now shows call stack in depth
 view

---
 src/tool/hpcrun/Makefile.am                   |   3 +-
 src/tool/hpcrun/Makefile.in                   |  34 +--
 .../gpu/opencl/opencl-activity-translate.c    |   5 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       |  70 +++--
 .../hpcrun/gpu/opencl/opencl-context-map.c    | 248 -----------------
 .../hpcrun/gpu/opencl/opencl-context-map.h    | 111 --------
 .../hpcrun/gpu/opencl/opencl-kernel-map.c     | 262 ------------------
 .../hpcrun/gpu/opencl/opencl-kernel-map.h     | 119 --------
 8 files changed, 56 insertions(+), 796 deletions(-)
 delete mode 100644 src/tool/hpcrun/gpu/opencl/opencl-context-map.c
 delete mode 100644 src/tool/hpcrun/gpu/opencl/opencl-context-map.h
 delete mode 100644 src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c
 delete mode 100644 src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index ce8247caa1..94556dcf9e 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -517,8 +517,7 @@ MY_OPENCL_FILES = sample-sources/opencl.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/opencl/opencl-h2d-map.c \
-	gpu/opencl/opencl-kernel-map.c \
-	gpu/opencl/opencl-context-map.c 
+	gpu/opencl/opencl-queue-map.c 
 endif
 
 if OPT_ENABLE_GTPIN
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 4d24288890..c71920caa9 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -544,8 +544,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/level0/level0-handle-map.c sample-sources/opencl.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
-	gpu/opencl/opencl-h2d-map.c gpu/opencl/opencl-kernel-map.c \
-	gpu/opencl/opencl-context-map.c \
+	gpu/opencl/opencl-h2d-map.c gpu/opencl/opencl-queue-map.c \
 	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
 	gpu/instrumentation/gtpin-correlation-id-map.c \
@@ -760,8 +759,7 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-kernel-map.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-context-map.lo
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-queue-map.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
@@ -1904,8 +1902,7 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-h2d-map.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-kernel-map.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-context-map.c 
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-queue-map.c 
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
@@ -2870,10 +2867,7 @@ gpu/opencl/libhpcrun_la-opencl-activity-translate.lo:  \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-kernel-map.lo:  \
-	gpu/opencl/$(am__dirstamp) \
-	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-gpu/opencl/libhpcrun_la-opencl-context-map.lo:  \
+gpu/opencl/libhpcrun_la-opencl-queue-map.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/instrumentation/$(am__dirstamp):
@@ -3838,10 +3832,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-queue-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_cilk_la-agent-cilk.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_pthread_la-agent-pthread.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lush-agents/$(DEPDIR)/libagent_tbb_la-agent-tbb.Plo@am__quote@
@@ -5541,19 +5534,12 @@ gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/opencl-h2d-map.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-h2d-map.lo `test -f 'gpu/opencl/opencl-h2d-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-h2d-map.c
 
-gpu/opencl/libhpcrun_la-opencl-kernel-map.lo: gpu/opencl/opencl-kernel-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-kernel-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-kernel-map.lo `test -f 'gpu/opencl/opencl-kernel-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-kernel-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-kernel-map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-kernel-map.c' object='gpu/opencl/libhpcrun_la-opencl-kernel-map.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-kernel-map.lo `test -f 'gpu/opencl/opencl-kernel-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-kernel-map.c
-
-gpu/opencl/libhpcrun_la-opencl-context-map.lo: gpu/opencl/opencl-context-map.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-context-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-context-map.lo `test -f 'gpu/opencl/opencl-context-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-context-map.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-context-map.c' object='gpu/opencl/libhpcrun_la-opencl-context-map.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/opencl/libhpcrun_la-opencl-queue-map.lo: gpu/opencl/opencl-queue-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-queue-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-queue-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-queue-map.lo `test -f 'gpu/opencl/opencl-queue-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-queue-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-queue-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-queue-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-queue-map.c' object='gpu/opencl/libhpcrun_la-opencl-queue-map.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-context-map.lo `test -f 'gpu/opencl/opencl-context-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-context-map.c
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-queue-map.lo `test -f 'gpu/opencl/opencl-queue-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-queue-map.c
 
 gpu/instrumentation/libhpcrun_la-kernel-data-map.lo: gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 41a8d5a086..be9c6d2834 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -84,7 +84,8 @@ convert_kernel_launch
 
   ga->details.kernel.correlation_id = cb_data->details.ker_cb.correlation_id;
   ga->details.kernel.submit_time    = cb_data->details.submit_time;
-
+  ga->details.kernel.context_id    = cb_data->details.context_id;
+  ga->details.kernel.stream_id    = cb_data->details.stream_id;
 }
 
 
@@ -107,6 +108,8 @@ convert_memcpy
 
   ga->details.memcpy.correlation_id  = cb_data->details.mem_cb.correlation_id;
   ga->details.memcpy.submit_time     = cb_data->details.submit_time;
+  ga->details.memcpy.context_id      = cb_data->details.context_id;
+  ga->details.memcpy.stream_id       = cb_data->details.stream_id;
   ga->details.memcpy.bytes           = cb_data->details.mem_cb.size;
   ga->details.memcpy.copyKind        = cb_data->details.mem_cb.type;
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 4ef913475c..7ef7e7237c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -83,8 +83,7 @@
 #include "opencl-activity-translate.h"
 #include "opencl-memory-manager.h"
 #include "opencl-h2d-map.h"
-#include "opencl-context-map.h"
-#include "opencl-kernel-map.h"
+#include "opencl-queue-map.h"
 
 
 
@@ -136,6 +135,8 @@
 
 #define BUFFER_ID_INVALID -1
 
+#define DEFAULT_OPENCL_STREAM_ID 0
+
 
 
 //******************************************************************************
@@ -355,12 +356,16 @@ getCorrelationId
 static void
 initializeKernelCallBackInfo
 (
- opencl_object_t *ker_info,
- uint64_t correlation_id
+  opencl_object_t *ker_info,
+  uint64_t correlation_id,
+  uint32_t context_id,
+  uint32_t stream_id
 )
 {
   ker_info->kind = GPU_ACTIVITY_KERNEL;
   ker_info->details.ker_cb.correlation_id = correlation_id;
+  ker_info->details.context_id = context_id;
+  ker_info->details.stream_id = stream_id;
 }
 
 
@@ -370,7 +375,9 @@ initializeMemoryCallBackInfo
   opencl_object_t *mem_info,
   gpu_memcpy_type_t type,
   size_t size,
-  uint64_t correlation_id
+  uint64_t correlation_id,
+  uint32_t context_id,
+  uint32_t stream_id
 )
 {
   mem_info->kind = GPU_ACTIVITY_MEMCPY;
@@ -380,6 +387,8 @@ initializeMemoryCallBackInfo
   mem_info->details.mem_cb.size = size;
 
   mem_info->details.mem_cb.correlation_id = correlation_id;
+  mem_info->details.context_id = context_id;
+  mem_info->details.stream_id = stream_id;
 }
 
 
@@ -490,7 +499,6 @@ opencl_clSetKernelArg_activity_process
   }
   ETMSG(OPENCL, "cb_data->details.initiator_channel: %p", cb_data->details.initiator_channel);
   gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
-  //gpu_activity_process(&gpu_activity);
 }
 
 
@@ -542,22 +550,12 @@ add_H2D_metrics_to_cct_node
 	void *arg
 )
 {
-	// uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
-	// gpu_correlation_id_map_entry_t *cid_map_entry = 
-	// 	gpu_correlation_id_map_lookup(correlation_id);
-	// if (cid_map_entry == NULL) {
-	// 	ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
-	// 	return;
-	// }
-
-	//opencl_activity_completion_notify();
   opencl_object_t *cb_data = opencl_h2d_map_entry_callback_info_get(entry);
   cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
   opencl_cb_basic_print(cb_basic, "Completion_Callback");
 
 	opencl_clSetKernelArg_activity_process(entry, cb_data);
 	uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);
-	//opencl_h2d_map_delete(buffer_id);
   opencl_h2d_pending_operations_adjust(-1);
   opencl_pending_operations_adjust(-1);
 }
@@ -867,7 +865,8 @@ clCreateProgramWithSource
       // what if a single file has multiple kernels?
       // we need to add logic to get filenames by reading the strings contents
       char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
-      // using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+
+      // TO-DO: AARON using malloc instead of hpcrun_malloc gives extra garbage characters in file name
       char *filename = (char *)hpcrun_malloc(sizeof(fileno) + 1);
       *filename = fileno + '\0';
       f_ptr = fopen(filename, "w");
@@ -922,7 +921,7 @@ clCreateCommandQueue
 
   cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
         properties,errcode_ret));
-  opencl_cl_context_map_insert((uint64_t)context, (uint32_t)queue);
+  opencl_cl_queue_map_insert((uint64_t)queue, (uint32_t)context);
   return queue;
 }
 
@@ -980,7 +979,7 @@ clCreateCommandQueueWithProperties
     // The property is created by us
     free(queue_properties);
   }
-  opencl_cl_context_map_insert((uint64_t)context, (uint32_t)queue);
+  opencl_cl_queue_map_insert((uint64_t)queue, (uint32_t)context);
   return queue;
 }
 
@@ -1000,7 +999,10 @@ clEnqueueNDRangeKernel
 )
 {
   opencl_object_t *kernel_info = opencl_malloc();
-  initializeKernelCallBackInfo(kernel_info, CORRELATION_ID_INVALID);
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t stream_id = (uint32_t)command_queue;
+  initializeKernelCallBackInfo(kernel_info, CORRELATION_ID_INVALID, context_id, stream_id);
 
   opencl_subscriber_callback(kernel_info);
 
@@ -1044,7 +1046,10 @@ clEnqueueReadBuffer
   ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper");
 
   opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID);
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t stream_id = (uint32_t)command_queue;
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID, context_id, stream_id);
   opencl_subscriber_callback(mem_info);
 
   cl_event my_event;
@@ -1094,7 +1099,12 @@ clEnqueueWriteBuffer
 	opencl_h2d_pending_operations_adjust(-1);
   //opencl_pending_operations_adjust(-1);
   opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID);
+
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t stream_id = (uint32_t)command_queue;
+
+  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID, context_id, stream_id);
   opencl_subscriber_callback(mem_info);
 
   cl_event my_event;
@@ -1143,11 +1153,15 @@ clEnqueueMapBuffer
   ETMSG(OPENCL, "inside clEnqueueMapBuffer wrapper");
 
   opencl_object_t *mem_info = opencl_malloc();
+
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t stream_id = (uint32_t)command_queue;
   if (map_flags == CL_MAP_READ) {
-    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, size, CORRELATION_ID_INVALID);
+    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, size, CORRELATION_ID_INVALID, context_id, stream_id);
   } else {
     //map_flags == CL_MAP_WRITE || map_flags == CL_MAP_WRITE_INVALIDATE_REGION
-    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, CORRELATION_ID_INVALID);
+    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, CORRELATION_ID_INVALID, context_id, stream_id);
   }
   
   opencl_subscriber_callback(mem_info);
@@ -1231,12 +1245,9 @@ clSetKernelArg
   cl_int STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, 0, NULL, &context_size);
   cl_context *context = malloc(sizeof(context_size));
   STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, context_size, (void*)context, NULL);
-  
-  //opencl_cl_kernel_map_insert((uint64_t)ocl_kernel, (uint32_t)(*context), command_queue);
-
   uint32_t context_id = (uint32_t)(*context);
-  opencl_context_map_entry_t *ce = opencl_cl_context_map_lookup((uint64_t)(*context));
-  uint32_t stream_id = opencl_cl_kernel_map_entry_stream_get(ce);
+  free(context);
+  uint32_t stream_id = DEFAULT_OPENCL_STREAM_ID;
 
   uint64_t buffer_id = opencl_get_buffer_id(arg_value);
 	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
@@ -1246,6 +1257,7 @@ clSetKernelArg
 		uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
     opencl_object_t *mem_info = opencl_malloc();
     initializeClSetKernelArgMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, correlation_id, context_id, stream_id);
+    ETMSG(OPENCL, "%ld(bytes) of data being transferred from host to device", (long)size); // %ld matches the (long) cast
     opencl_subscriber_callback(mem_info);
 
     /* There is no way to record start_time, end_time for the memory transfer that happens as part of clSetKernelArg
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-context-map.c b/src/tool/hpcrun/gpu/opencl/opencl-context-map.c
deleted file mode 100644
index 6843e4f535..0000000000
--- a/src/tool/hpcrun/gpu/opencl/opencl-context-map.c
+++ /dev/null
@@ -1,248 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//*****************************************************************************
-// system includes
-//*****************************************************************************
-
-#include <assert.h>
-#include <string.h>
-
-
-
-//*****************************************************************************
-// local includes
-//*****************************************************************************
-
-#include <lib/prof-lean/splay-uint64.h>
-#include <lib/prof-lean/spinlock.h>
-#include <hpcrun/gpu/gpu-activity-channel.h>
-#include <hpcrun/gpu/gpu-splay-allocator.h>
-#include <hpcrun/gpu/gpu-op-placeholders.h>
-
-#include "opencl-context-map.h"
-
-
-
-//*****************************************************************************
-// macros
-//*****************************************************************************
-
-#define DEBUG 0
-
-#include "../gpu-print.h"
-
-
-#define st_insert				\
-  typed_splay_insert(correlation_id)
-
-#define st_lookup				\
-  typed_splay_lookup(correlation_id)
-
-#define st_delete				\
-  typed_splay_delete(correlation_id)
-
-#define st_forall				\
-  typed_splay_forall(correlation_id)
-
-#define st_count				\
-  typed_splay_count(correlation_id)
-
-#define st_alloc(free_list)			\
-  typed_splay_alloc(free_list, opencl_context_map_entry_t)
-
-#define st_free(free_list, node)		\
-  typed_splay_free(free_list, node)
-
-
-
-//*****************************************************************************
-// type declarations
-//*****************************************************************************
-
-#undef typed_splay_node
-#define typed_splay_node(correlation_id) opencl_context_map_entry_t
-
-typedef struct typed_splay_node(correlation_id) {
-  struct typed_splay_node(correlation_id) *left;
-  struct typed_splay_node(correlation_id) *right;
-  uint64_t cl_context_id; // key
-
-  uint32_t stream_id; // we save queue id as the stream id
-} typed_splay_node(correlation_id); 
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-static opencl_context_map_entry_t *map_root = NULL;
-
-static opencl_context_map_entry_t *free_list = NULL;
-
-static spinlock_t opencl_context_map_lock = SPINLOCK_UNLOCKED;
-
-//*****************************************************************************
-// private operations
-//*****************************************************************************
-
-typed_splay_impl(correlation_id)
-
-
-static opencl_context_map_entry_t *
-opencl_cl_context_map_entry_alloc()
-{
-  return st_alloc(&free_list);
-}
-
-
-static opencl_context_map_entry_t *
-opencl_cl_context_map_entry_new
-(
- uint64_t cl_context_id,
- uint32_t stream_id
-)
-{
-  opencl_context_map_entry_t *e = opencl_cl_context_map_entry_alloc();
-
-  e->cl_context_id = cl_context_id;
-  e->stream_id = stream_id;
-  
-  return e;
-}
-
-
-
-//*****************************************************************************
-// interface operations
-//*****************************************************************************
-
-opencl_context_map_entry_t *
-opencl_cl_context_map_lookup
-(
- uint64_t cl_context_id
-)
-{
-  spinlock_lock(&opencl_context_map_lock);
-
-  uint64_t id = cl_context_id;
-  opencl_context_map_entry_t *result = st_lookup(&map_root, id);
-
-  spinlock_unlock(&opencl_context_map_lock);
-
-  return result;
-}
-
-
-void
-opencl_cl_context_map_insert
-(
- uint64_t cl_context_id, 
- uint32_t stream_id
-)
-{
-  spinlock_lock(&opencl_context_map_lock);
-
-  opencl_context_map_entry_t *entry = st_lookup(&map_root, cl_context_id);
-  if (entry) {
-    entry->cl_context_id = cl_context_id;
-    entry->stream_id = stream_id;
-  } else {
-    opencl_context_map_entry_t *entry = 
-      opencl_cl_context_map_entry_new(cl_context_id, stream_id);
-
-    st_insert(&map_root, entry);
-  }
-
-  spinlock_unlock(&opencl_context_map_lock);
-}
-
-
-void
-opencl_cl_context_map_delete
-(
- uint64_t cl_context_id
-)
-{
-  spinlock_lock(&opencl_context_map_lock);
-
-  opencl_context_map_entry_t *node = st_delete(&map_root, cl_context_id);
-  st_free(&free_list, node);
-
-  spinlock_unlock(&opencl_context_map_lock);
-}
-
-
-uint64_t
-opencl_cl_context_map_entry_cl_context_id_get
-(
- opencl_context_map_entry_t *entry
-)
-{
-  return entry->cl_context_id;
-}
-
-
-uint32_t
-opencl_cl_context_map_entry_stream_get
-(
- opencl_context_map_entry_t *entry
-)
-{
-  return entry->stream_id;
-}
-
-
-
-//*****************************************************************************
-// debugging code
-//*****************************************************************************
-
-uint64_t
-opencl_cl_context_map_count
-(
- void
-)
-{
-  return st_count(map_root);
-}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-context-map.h b/src/tool/hpcrun/gpu/opencl/opencl-context-map.h
deleted file mode 100644
index a4c4a09561..0000000000
--- a/src/tool/hpcrun/gpu/opencl/opencl-context-map.h
+++ /dev/null
@@ -1,111 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-#ifndef opencl_context_map_h
-#define opencl_context_map_h
-
-
-//*****************************************************************************
-// system includes
-//*****************************************************************************
-
-#include <stdint.h>
-
-
-
-//*****************************************************************************
-// type definitions 
-//*****************************************************************************
-
-typedef struct opencl_context_map_entry_t opencl_context_map_entry_t;
-
-
-
-//*****************************************************************************
-// interface operations
-//*****************************************************************************
-
-opencl_context_map_entry_t *
-opencl_cl_context_map_lookup
-(
- uint64_t
-);
-
-
-void
-opencl_cl_context_map_insert
-(
- uint64_t, 
- uint32_t
-);
-
-
-void
-opencl_cl_context_map_delete
-(
- uint64_t
-);
-
-
-uint64_t
-opencl_cl_context_map_entry_cl_context_id_get
-(
- opencl_context_map_entry_t *entry
-);
-
-
-uint32_t
-opencl_cl_context_map_entry_stream_get
-(
- opencl_context_map_entry_t *
-);
-
-
-uint64_t
-opencl_h2d_map_count
-(
- void
-);
-
-#endif
-
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c b/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c
deleted file mode 100644
index 930283fa08..0000000000
--- a/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.c
+++ /dev/null
@@ -1,262 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//*****************************************************************************
-// system includes
-//*****************************************************************************
-
-#include <assert.h>
-#include <string.h>
-
-
-
-//*****************************************************************************
-// local includes
-//*****************************************************************************
-
-#include <lib/prof-lean/splay-uint64.h>
-#include <lib/prof-lean/spinlock.h>
-#include <hpcrun/gpu/gpu-activity-channel.h>
-#include <hpcrun/gpu/gpu-splay-allocator.h>
-#include <hpcrun/gpu/gpu-op-placeholders.h>
-
-#include "opencl-kernel-map.h"
-
-
-
-//*****************************************************************************
-// macros
-//*****************************************************************************
-
-#define DEBUG 0
-
-#include "../gpu-print.h"
-
-
-#define st_insert				\
-  typed_splay_insert(correlation_id)
-
-#define st_lookup				\
-  typed_splay_lookup(correlation_id)
-
-#define st_delete				\
-  typed_splay_delete(correlation_id)
-
-#define st_forall				\
-  typed_splay_forall(correlation_id)
-
-#define st_count				\
-  typed_splay_count(correlation_id)
-
-#define st_alloc(free_list)			\
-  typed_splay_alloc(free_list, opencl_kernel_map_entry_t)
-
-#define st_free(free_list, node)		\
-  typed_splay_free(free_list, node)
-
-
-
-//*****************************************************************************
-// type declarations
-//*****************************************************************************
-
-#undef typed_splay_node
-#define typed_splay_node(correlation_id) opencl_kernel_map_entry_t
-
-typedef struct typed_splay_node(correlation_id) {
-  struct typed_splay_node(correlation_id) *left;
-  struct typed_splay_node(correlation_id) *right;
-  uint64_t cl_kernel_id; // key
-
-  uint32_t context_id;
-  uint32_t stream_id;
-} typed_splay_node(correlation_id); 
-
-
-//******************************************************************************
-// local data
-//******************************************************************************
-
-static opencl_kernel_map_entry_t *map_root = NULL;
-
-static opencl_kernel_map_entry_t *free_list = NULL;
-
-static spinlock_t opencl_kernel_map_lock = SPINLOCK_UNLOCKED;
-
-//*****************************************************************************
-// private operations
-//*****************************************************************************
-
-typed_splay_impl(correlation_id)
-
-
-static opencl_kernel_map_entry_t *
-opencl_cl_kernel_map_entry_alloc()
-{
-  return st_alloc(&free_list);
-}
-
-
-static opencl_kernel_map_entry_t *
-opencl_cl_kernel_map_entry_new
-(
- uint64_t cl_kernel_id,
- uint32_t context_id,
- uint32_t stream_id
-)
-{
-  opencl_kernel_map_entry_t *e = opencl_cl_kernel_map_entry_alloc();
-
-  e->cl_kernel_id = cl_kernel_id;
-  e->context_id = context_id;
-  e->stream_id = stream_id;
-  
-  return e;
-}
-
-
-
-//*****************************************************************************
-// interface operations
-//*****************************************************************************
-
-opencl_kernel_map_entry_t *
-opencl_cl_kernel_map_lookup
-(
- uint64_t cl_kernel_id
-)
-{
-  spinlock_lock(&opencl_kernel_map_lock);
-
-  uint64_t id = cl_kernel_id;
-  opencl_kernel_map_entry_t *result = st_lookup(&map_root, id);
-
-  spinlock_unlock(&opencl_kernel_map_lock);
-
-  return result;
-}
-
-
-void
-opencl_cl_kernel_map_insert
-(
- uint64_t cl_kernel_id, 
- uint32_t context_id,
- uint32_t stream_id
-)
-{
-  spinlock_lock(&opencl_kernel_map_lock);
-
-  opencl_kernel_map_entry_t *entry = st_lookup(&map_root, cl_kernel_id);
-  if (entry) {
-    entry->context_id = context_id;
-    entry->stream_id = stream_id;
-  } else {
-    opencl_kernel_map_entry_t *entry = 
-      opencl_cl_kernel_map_entry_new(cl_kernel_id, context_id, stream_id);
-
-    st_insert(&map_root, entry);
-  }
-
-  spinlock_unlock(&opencl_kernel_map_lock);
-}
-
-
-void
-opencl_cl_kernel_map_delete
-(
- uint64_t cl_kernel_id
-)
-{
-  spinlock_lock(&opencl_kernel_map_lock);
-
-  opencl_kernel_map_entry_t *node = st_delete(&map_root, cl_kernel_id);
-  st_free(&free_list, node);
-
-  spinlock_unlock(&opencl_kernel_map_lock);
-}
-
-
-uint64_t
-opencl_cl_kernel_map_entry_cl_kernel_id_get
-(
- opencl_kernel_map_entry_t *entry
-)
-{
-  return entry->cl_kernel_id;
-}
-
-
-uint32_t
-opencl_cl_kernel_map_entry_context_get
-(
- opencl_kernel_map_entry_t *entry
-)
-{
-  return entry->context_id;
-}
-
-
-uint32_t
-opencl_cl_kernel_map_entry_stream_get
-(
- opencl_kernel_map_entry_t *entry
-)
-{
-  return entry->stream_id;
-}
-
-
-
-//*****************************************************************************
-// debugging code
-//*****************************************************************************
-
-uint64_t
-opencl_cl_kernel_map_count
-(
- void
-)
-{
-  return st_count(map_root);
-}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h b/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h
deleted file mode 100644
index 0edcd209cb..0000000000
--- a/src/tool/hpcrun/gpu/opencl/opencl-kernel-map.h
+++ /dev/null
@@ -1,119 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2020, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-#ifndef opencl_kernel_map_h
-#define opencl_kernel_map_h
-
-
-//*****************************************************************************
-// system includes
-//*****************************************************************************
-
-#include <stdint.h>
-
-
-
-//*****************************************************************************
-// type definitions 
-//*****************************************************************************
-
-typedef struct opencl_kernel_map_entry_t opencl_kernel_map_entry_t;
-
-
-
-//*****************************************************************************
-// interface operations
-//*****************************************************************************
-
-opencl_kernel_map_entry_t *
-opencl_cl_kernel_map_lookup
-(
- uint64_t
-);
-
-
-void
-opencl_cl_kernel_map_insert
-(
- uint64_t, 
- uint32_t,
- uint32_t
-);
-
-
-void
-opencl_cl_kernel_map_delete
-(
- uint64_t
-);
-
-
-uint64_t
-opencl_cl_kernel_map_entry_cl_kernel_id_get
-(
- opencl_kernel_map_entry_t *entry
-);
-
-
-uint32_t
-opencl_cl_kernel_map_entry_context_get
-(
- opencl_kernel_map_entry_t *
-);
-
-
-uint32_t
-opencl_cl_kernel_map_entry_stream_get
-(
- opencl_kernel_map_entry_t *
-);
-
-
-uint64_t
-opencl_h2d_map_count
-(
- void
-);
-
-#endif
-

From 286dc2bb5bb4a81872a625b8f320dec680209b4f Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Mon, 12 Oct 2020 19:45:46 -0500
Subject: [PATCH 095/177] adding support for monitoring kernel executions from
 clEnqueueTask API

---
 src/tool/hpcrun/gpu/opencl/opencl-api.c | 55 +++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 7ef7e7237c..a65b129800 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -101,6 +101,7 @@
   macro(clCreateCommandQueue)  \
   macro(clCreateCommandQueueWithProperties)  \
   macro(clEnqueueNDRangeKernel)  \
+  macro(clEnqueueTask)  \
   macro(clEnqueueReadBuffer)  \
   macro(clEnqueueWriteBuffer)  \
   macro(clEnqueueMapBuffer) \
@@ -226,6 +227,19 @@ OPENCL_FN
 );
 
 
+OPENCL_FN
+(
+  clEnqueueTask, 
+  (
+   cl_command_queue,
+   cl_kernel,
+   cl_uint,
+   const cl_event *,
+   cl_event *
+  )
+);
+
+
 OPENCL_FN
 (
   clEnqueueReadBuffer, 
@@ -1029,6 +1043,47 @@ clEnqueueNDRangeKernel
 }
 
 
+// clEnqueueTask wrapper: a simplified version of the clEnqueueNDRangeKernel wrapper. TODO: check if code duplication can be avoided
+cl_int
+clEnqueueTask
+(
+  cl_command_queue command_queue,
+  cl_kernel kernel,
+  cl_uint num_events_in_wait_list,
+  const cl_event* event_wait_list,
+  cl_event* event  // may be NULL; a local event is substituted below
+)
+{
+  opencl_object_t *kernel_info = opencl_malloc();
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);  // NOTE(review): qe is dereferenced on the next line without a NULL check -- confirm the queue is always registered
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t stream_id = (uint32_t)command_queue;  // the queue handle, cast to 32 bits, serves as the stream id
+  initializeKernelCallBackInfo(kernel_info, CORRELATION_ID_INVALID, context_id, stream_id);
+
+  opencl_subscriber_callback(kernel_info);
+
+  cl_event my_event;
+  cl_event *eventp;
+  if (!event) {  // caller supplied no event: use the local my_event so a callback can still be attached
+    kernel_info->isInternalClEvent = true;
+    eventp = &my_event;
+  } else {
+    eventp = event;
+    kernel_info->isInternalClEvent = false;
+  }
+  cl_int return_status =
+            HPCRUN_OPENCL_CALL(clEnqueueTask, (command_queue, kernel,
+                                num_events_in_wait_list, event_wait_list, eventp));
+
+  ETMSG(OPENCL, "Registering callback for kind: Kernel. "
+                "Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id);
+  // NOTE(review): return_status is not checked before *eventp is used; my_event has no initial value -- confirm this is safe on failure
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                             &opencl_activity_completion_callback, kernel_info);
+  return return_status;  // status of the underlying clEnqueueTask call, passed through unchanged
+}
+
+
 cl_int
 clEnqueueReadBuffer
 (

From 0532ec04390e8d09c615dcc5e5ad37f3d5252669 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Tue, 13 Oct 2020 12:00:40 -0500
Subject: [PATCH 096/177] changed the order of bytes and submit time in
 gpu_memcpy_t to match with gpu_mem_t. Opencl H2D and D2H bytes metrics now
 correct in .hpcrun files. (Note for code review: ensure this code does not
 break existing behaviors)

---
 src/tool/hpcrun/gpu/gpu-activity.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 7f20cdd054..328d49dd41 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -210,8 +210,8 @@ typedef struct gpu_mem_t {
 typedef struct gpu_memcpy_t {
   uint64_t start;
   uint64_t end;
-  uint64_t submit_time;
   uint64_t bytes;
+  uint64_t submit_time;
   uint32_t correlation_id;
   uint32_t context_id;
   uint32_t stream_id;

From 8bd55e2d12a8896d0c4cad5d14bd7f242473e1a3 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Tue, 13 Oct 2020 12:01:36 -0500
Subject: [PATCH 097/177] adding opencl-queue-map files to remote

---
 src/tool/hpcrun/gpu/opencl/opencl-queue-map.c | 248 ++++++++++++++++++
 src/tool/hpcrun/gpu/opencl/opencl-queue-map.h | 111 ++++++++
 2 files changed, 359 insertions(+)
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-queue-map.h

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
new file mode 100644
index 0000000000..b1e1e0fd35
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
@@ -0,0 +1,248 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <assert.h>
+#include <string.h>
+
+
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+#include "opencl-queue-map.h"
+
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define DEBUG 0
+
+#include "../gpu-print.h"
+
+
+#define st_insert				\
+  typed_splay_insert(correlation_id)
+
+#define st_lookup				\
+  typed_splay_lookup(correlation_id)
+
+#define st_delete				\
+  typed_splay_delete(correlation_id)
+
+#define st_forall				\
+  typed_splay_forall(correlation_id)
+
+#define st_count				\
+  typed_splay_count(correlation_id)
+
+#define st_alloc(free_list)			\
+  typed_splay_alloc(free_list, opencl_queue_map_entry_t)
+
+#define st_free(free_list, node)		\
+  typed_splay_free(free_list, node)
+
+
+
+//*****************************************************************************
+// type declarations
+//*****************************************************************************
+
+#undef typed_splay_node
+#define typed_splay_node(correlation_id) opencl_queue_map_entry_t
+
+typedef struct typed_splay_node(correlation_id) {
+  struct typed_splay_node(correlation_id) *left;
+  struct typed_splay_node(correlation_id) *right;
+  uint64_t queue_id; // key: callers pass the cl_command_queue handle cast to uint64_t
+
+  uint32_t context_id; // context id recorded for this queue; callers use the queue id itself as the stream id
+} typed_splay_node(correlation_id); 
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+
+static opencl_queue_map_entry_t *map_root = NULL; // root handed to st_lookup/st_insert/st_delete/st_count
+
+static opencl_queue_map_entry_t *free_list = NULL; // list handed to st_alloc/st_free
+
+static spinlock_t opencl_queue_map_lock = SPINLOCK_UNLOCKED; // held by lookup, insert and delete around their map operations
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
+typed_splay_impl(correlation_id)
+
+
+static opencl_queue_map_entry_t *
+opencl_cl_queue_map_entry_alloc() // obtain an entry via the file-local free list
+{
+  return st_alloc(&free_list); // expands to typed_splay_alloc(&free_list, opencl_queue_map_entry_t)
+}
+
+
+static opencl_queue_map_entry_t *
+opencl_cl_queue_map_entry_new // allocate an entry and fill in its key and context id
+(
+ uint64_t queue_id,
+ uint32_t context_id
+)
+{
+  opencl_queue_map_entry_t *e = opencl_cl_queue_map_entry_alloc();
+
+  e->queue_id = queue_id;
+  e->context_id = context_id; // left/right are not assigned here
+  
+  return e;
+}
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_queue_map_entry_t *
+opencl_cl_queue_map_lookup // return the st_lookup result for queue_id, searched while holding the map lock
+(
+ uint64_t queue_id
+)
+{
+  spinlock_lock(&opencl_queue_map_lock);
+
+  uint64_t id = queue_id;
+  opencl_queue_map_entry_t *result = st_lookup(&map_root, id); // NOTE(review): presumably NULL when the key is absent -- callers should check
+
+  spinlock_unlock(&opencl_queue_map_lock);
+
+  return result; // the entry is handed back after the lock has been released
+}
+
+
+void
+opencl_cl_queue_map_insert // update the entry for queue_id if one is found, otherwise create and insert one
+(
+ uint64_t queue_id, 
+ uint32_t context_id
+)
+{
+  spinlock_lock(&opencl_queue_map_lock);
+
+  opencl_queue_map_entry_t *entry = st_lookup(&map_root, queue_id);
+  if (entry) {
+    entry->queue_id = queue_id; // NOTE(review): presumably redundant, the entry was found under this key -- confirm
+    entry->context_id = context_id;
+  } else {
+    opencl_queue_map_entry_t *entry = // NOTE(review): shadows the outer entry declared above
+      opencl_cl_queue_map_entry_new(queue_id, context_id);
+
+    st_insert(&map_root, entry);
+  }
+
+  spinlock_unlock(&opencl_queue_map_lock);
+}
+
+
+void
+opencl_cl_queue_map_delete // remove the entry for queue_id and hand it to the free list, under the map lock
+(
+ uint64_t queue_id
+)
+{
+  spinlock_lock(&opencl_queue_map_lock);
+
+  opencl_queue_map_entry_t *node = st_delete(&map_root, queue_id);
+  st_free(&free_list, node); // NOTE(review): node is passed on without a check -- confirm st_free tolerates a missing key
+
+  spinlock_unlock(&opencl_queue_map_lock);
+}
+
+
+uint64_t
+opencl_cl_queue_map_entry_queue_id_get // accessor for the entry's key
+(
+ opencl_queue_map_entry_t *entry
+)
+{
+  return entry->queue_id; // entry is dereferenced without a NULL check
+}
+
+
+uint32_t
+opencl_cl_queue_map_entry_context_id_get // accessor for the context id stored in the entry
+(
+ opencl_queue_map_entry_t *entry
+)
+{
+  return entry->context_id; // entry is dereferenced without a NULL check
+}
+
+
+
+//*****************************************************************************
+// debugging code
+//*****************************************************************************
+
+uint64_t
+opencl_cl_queue_map_count // debugging helper: returns st_count for the current map
+(
+ void
+)
+{
+  return st_count(map_root); // map_root is read here without taking opencl_queue_map_lock
+}
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
new file mode 100644
index 0000000000..1d9b6db2d6
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
@@ -0,0 +1,111 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef opencl_queue_map_h
+#define opencl_queue_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct opencl_queue_map_entry_t opencl_queue_map_entry_t;
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+opencl_queue_map_entry_t *
+opencl_cl_queue_map_lookup
+(
+ uint64_t
+);
+
+
+void
+opencl_cl_queue_map_insert
+(
+ uint64_t, 
+ uint32_t
+);
+
+
+void
+opencl_cl_queue_map_delete
+(
+ uint64_t
+);
+
+
+uint64_t
+opencl_cl_queue_map_entry_queue_id_get
+(
+ opencl_queue_map_entry_t *entry
+);
+
+
+uint32_t
+opencl_cl_queue_map_entry_context_id_get
+(
+ opencl_queue_map_entry_t *
+);
+
+
+uint64_t
+opencl_cl_queue_map_count  // debugging helper; name now matches the definition for this map
+(
+ void
+);
+
+#endif
+

From 9afb68b03f86f8d3c28ff4c050578f0285bb6d83 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Tue, 13 Oct 2020 21:20:53 +0000
Subject: [PATCH 098/177] Fix trace data collection and flush activity data
 when using operation channels

---
 src/tool/hpcrun/gpu/214002.cobaltlog          |  25 ++
 src/tool/hpcrun/gpu/214002.error              |   1 +
 src/tool/hpcrun/gpu/214002.output             |   0
 .../hpcrun/gpu/gpu-activity-multiplexer.c     |  24 +-
 src/tool/hpcrun/gpu/gpu-activity-process.c    |  27 +-
 src/tool/hpcrun/gpu/gpu-activity.h            |  15 +-
 .../hpcrun/gpu/gpu-operation-item-process.c   |  51 ++-
 src/tool/hpcrun/gpu/gpu-operation-item.h      |   3 +-
 .../gpu/opencl/opencl-activity-translate.c    |  33 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 400 ++++++++++--------
 .../hpcrun/gpu/opencl/opencl-memory-manager.h |  18 +-
 src/tool/hpcrun/sample-sources/opencl.c       |   1 +
 12 files changed, 365 insertions(+), 233 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/214002.cobaltlog
 create mode 100644 src/tool/hpcrun/gpu/214002.error
 create mode 100644 src/tool/hpcrun/gpu/214002.output

diff --git a/src/tool/hpcrun/gpu/214002.cobaltlog b/src/tool/hpcrun/gpu/214002.cobaltlog
new file mode 100644
index 0000000000..17ac3cc0e8
--- /dev/null
+++ b/src/tool/hpcrun/gpu/214002.cobaltlog
@@ -0,0 +1,25 @@
+Jobid: 214002
+qsub -I -t 30 -n 1 -q iris
+Thu Oct 01 23:59:10 2020 +0000 (UTC) submitted with cwd set to: /home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu
+jobid 214002 submitted from terminal /dev/pts/206
+Thu Oct 01 23:59:28 2020 +0000 (UTC) 
+Thu Oct 01 23:59:28 2020 +0000 (UTC) Command: '/usr/bin/ssh' 'iris09' '/usr/libexec/cobalt/cobalt-launcher.py' '--nf' '/var/tmp/cobalt.214002' '--jobid' '214002' '--cwd' '/home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu' '--env' 'COBALT_JOBID=214002' '/bin/sleep' '1800'
+Thu Oct 01 23:59:28 2020 +0000 (UTC) 
+Thu Oct 01 23:59:28 2020 +0000 (UTC) Environment:
+Thu Oct 01 23:59:28 2020 +0000 (UTC) SHELL=/bin/bash
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_NODEFILE=/tmp/tmpR3QupI
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_PARTNAME=iris09
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_JOBID=214002
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_STARTTIME=1601596758
+Thu Oct 01 23:59:28 2020 +0000 (UTC) LOGNAME=jokeren
+Thu Oct 01 23:59:28 2020 +0000 (UTC) USER=jokeren
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_ENDTIME=1601598558
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_PARTSIZE=1
+Thu Oct 01 23:59:28 2020 +0000 (UTC) HOME=/home/jokeren
+Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_JOBSIZE=1
+Thu Oct 01 23:59:28 2020 +0000 (UTC) 
+Thu Oct 01 23:59:28 2020 +0000 (UTC) Info: stdin received from /dev/null
+Thu Oct 01 23:59:28 2020 +0000 (UTC) Info: stdout sent to /home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu/214002.output
+Thu Oct 01 23:59:28 2020 +0000 (UTC) Info: stderr sent to /home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu/214002.error
+Thu Oct 01 23:59:28 2020 +0000 (UTC) 
+Fri Oct 02 00:10:57 2020 +0000 (UTC) Info: user delete requested with signal SIGTERM by user jokeren
diff --git a/src/tool/hpcrun/gpu/214002.error b/src/tool/hpcrun/gpu/214002.error
new file mode 100644
index 0000000000..b3be75cc7e
--- /dev/null
+++ b/src/tool/hpcrun/gpu/214002.error
@@ -0,0 +1 @@
+Killed by signal 15.
diff --git a/src/tool/hpcrun/gpu/214002.output b/src/tool/hpcrun/gpu/214002.output
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index a5ab08a287..05ba10d26f 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -132,6 +132,7 @@ gpu_activity_multiplexer_create
   control_knob_value_get_int("MAX_COMPLETION_CALLBACK_THREADS", &max_completion_cb_threads);
 
   gpu_operation_channel_set_alloc(max_completion_cb_threads);
+
   // You are the first to create monitor thread
   pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_activity_record,
                  NULL);
@@ -190,26 +191,3 @@ gpu_activity_multiplexer_push
   gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
   gpu_operation_channel_produce(gpu_operation_channel, &item);
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 552e5ee13e..ffaa2738bb 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -72,7 +72,7 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 0
+#define DEBUG 1
 
 #include "gpu-print.h"
 
@@ -578,7 +578,30 @@ gpu_memory_process
  gpu_activity_t *activity
 )
 {
-  PRINT("Memory process not implemented\n");
+  uint32_t correlation_id = activity->details.memory.correlation_id;
+  gpu_correlation_id_map_entry_t *cid_map_entry =
+    gpu_correlation_id_map_lookup(correlation_id);
+  if (cid_map_entry != NULL) {
+    uint64_t external_id =
+      gpu_correlation_id_map_entry_external_id_get(cid_map_entry);
+    gpu_host_correlation_map_entry_t *host_op_entry =
+      gpu_host_correlation_map_lookup(external_id);
+    if (host_op_entry != NULL) {
+      gpu_placeholder_type_t ph = gpu_placeholder_type_alloc;
+      cct_node_t *host_op_node =
+        gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph);
+      assert(host_op_node != NULL);
+      // Memory allocation does not always happen on the device
+      // Do not send it to trace channels
+      attribute_activity(host_op_entry, activity, host_op_node);
+    }
+    gpu_correlation_id_map_delete(correlation_id);
+  } else {
+    PRINT("Memory correlation_id %u cannot be found\n", correlation_id);
+  }
+  PRINT("Memory CorrelationId %u\n", correlation_id);
+  PRINT("Memory kind %u\n", activity->details.memory.memKind);
+  PRINT("Memory bytes %lu\n", activity->details.memory.bytes);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 7f20cdd054..2c2f756a24 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -94,7 +94,8 @@ typedef enum {
   GPU_ACTIVITY_PC_SAMPLING_INFO        = 13, 
   GPU_ACTIVITY_EXTERNAL_CORRELATION    = 14,
   GPU_ACTIVITY_EVENT                   = 15,
-  GPU_ACTIVITY_FUNCTION                = 16
+  GPU_ACTIVITY_FUNCTION                = 16,
+  GPU_ACTIVITY_FLUSH                   = 17
 } gpu_activity_kind_t;
 
 
@@ -175,7 +176,7 @@ typedef enum {
   GPU_MEM_MANAGED_STATIC  = 6,
   GPU_MEM_UNKNOWN         = 7,
   GPU_MEM_COUNT           = 8
-} gpu_mem_kind_t;
+} gpu_mem_type_t;
 
 
 // pc sampling
@@ -196,6 +197,10 @@ typedef struct gpu_pc_sampling_info_t {
   uint64_t fullSMSamples;
 } gpu_pc_sampling_info_t;
 
+// a special flush record used to signal that all earlier operations have been consumed
+typedef struct gpu_flush_t {
+  atomic_bool *wait;  // set to false by gpu_flush_process once the flush record is handled
+} gpu_flush_t;
 
 // this type is prefix of all memory structures
 // gpu_interval_t is a prefix 
@@ -225,7 +230,8 @@ typedef struct gpu_memory_t {
   uint64_t start;
   uint64_t end;
   uint64_t bytes;
-  gpu_mem_kind_t memKind;
+  uint32_t correlation_id;
+  gpu_mem_type_t memKind;
 } gpu_memory_t;
 
 
@@ -237,7 +243,7 @@ typedef struct gpu_memset_t {
   uint32_t correlation_id;
   uint32_t context_id;
   uint32_t stream_id;
-  gpu_mem_kind_t memKind;
+  gpu_mem_type_t memKind;
 } gpu_memset_t;
 
 
@@ -370,6 +376,7 @@ typedef struct gpu_activity_details_t {
     gpu_branch_t branch;
     gpu_synchronization_t synchronization;
     gpu_host_correlation_t correlation;
+    gpu_flush_t flush;
 
     /* Access short cut for activitiy fields shared by multiple kinds */
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index 8b26ab71a5..2b11b42501 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -58,6 +58,7 @@
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-trace-item.h>
 #include <hpcrun/gpu/gpu-context-id-map.h>
+#include <lib/prof-lean/stdatomic.h>
 
 #include "gpu-operation-item.h"
 #include "gpu-operation-item-process.h"
@@ -99,7 +100,7 @@ gpu_context_stream_trace
 static void
 gpu_memcpy_process
 (
-gpu_operation_item_t *it
+ gpu_operation_item_t *it
 )
 {
   gpu_activity_t *activity = &it->activity;
@@ -130,7 +131,7 @@ gpu_operation_item_t *it
 static void
 gpu_kernel_process
 (
-gpu_operation_item_t *it
+ gpu_operation_item_t *it
 )
 {
   gpu_activity_t *activity = &it->activity;
@@ -156,15 +157,30 @@ gpu_operation_item_t *it
 
 
 static void
-gpu_unknown_process
+gpu_flush_process
 (
-gpu_operation_item_t *it
+ gpu_operation_item_t *it
 )
 {
-  PRINT("Unknown activity kind %d\n", it->activity->kind);
+  gpu_activity_t *activity = &it->activity;
+  // A special flush operation at the end of each thread
+  // Set it false to indicate all previous activities have been processed
+  if (atomic_load(activity->details.flush.wait)) {
+    atomic_store(activity->details.flush.wait, false);
+  }
 }
 
 
+static void
+gpu_unknown_process
+(
+ gpu_operation_item_t *it
+)
+{
+  // fallback for kinds without a dedicated handler: pass the item's
+  // activity on to the activity channel stored in the item
+  gpu_activity_channel_produce(it->channel, &it->activity);
+}
 
 //******************************************************************************
 // interface operations
@@ -173,23 +189,26 @@ gpu_operation_item_t *it
 void
 gpu_operation_item_process
 (
-gpu_operation_item_t *it
+ gpu_operation_item_t *it
 )
 {
-
   switch (it->activity.kind) {
 
-  case GPU_ACTIVITY_MEMCPY:
-    gpu_memcpy_process(it);
-    break;
+    case GPU_ACTIVITY_MEMCPY:
+      gpu_memcpy_process(it);
+      break;
+
+    case GPU_ACTIVITY_KERNEL:
+      gpu_kernel_process(it);
+      break;
 
-  case GPU_ACTIVITY_KERNEL:
-    gpu_kernel_process(it);
-    break;
+    case GPU_ACTIVITY_FLUSH:
+      gpu_flush_process(it);
+      break;
 
-  default:
-    gpu_unknown_process(it);
-    break;
+    default:
+      gpu_unknown_process(it);
+      break;
   }
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.h b/src/tool/hpcrun/gpu/gpu-operation-item.h
index a4e6e0be6f..db5a528bb4 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.h
@@ -77,10 +77,11 @@ typedef struct gpu_operation_channel_t gpu_operation_channel_t;
 // type declarations
 //******************************************************************************
 
-typedef struct gpu_operation_item_t{
+typedef struct gpu_operation_item_t {
   s_element_t next;
   gpu_activity_channel_t *channel;
   gpu_activity_t activity;
+  atomic_bool *flush;
 } gpu_operation_item_t;
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 41a8d5a086..0e13b6c2d9 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -105,10 +105,33 @@ convert_memcpy
   ga->kind     = cb_data->kind;
   ga->cct_node = cb_data->details.cct_node;
 
-  ga->details.memcpy.correlation_id  = cb_data->details.mem_cb.correlation_id;
+  ga->details.memcpy.correlation_id  = cb_data->details.cpy_cb.correlation_id;
   ga->details.memcpy.submit_time     = cb_data->details.submit_time;
-  ga->details.memcpy.bytes           = cb_data->details.mem_cb.size;
-  ga->details.memcpy.copyKind        = cb_data->details.mem_cb.type;
+  ga->details.memcpy.bytes           = cb_data->details.cpy_cb.size;
+  ga->details.memcpy.copyKind        = cb_data->details.cpy_cb.type;
+}
+
+
+static void
+convert_memory
+(
+  gpu_activity_t *ga,
+  opencl_object_t *cb_data,
+  uint64_t start_time,
+  uint64_t end_time
+)
+{
+  // zero the memory record, then set the interval only if both timestamps are non-zero
+  memset(&ga->details.memory, 0, sizeof(gpu_memory_t));
+  if ((start_time != 0) && (end_time != 0)) {
+    set_gpu_interval(&ga->details.interval, start_time, end_time);
+  }
+  ga->cct_node = cb_data->details.cct_node;
+  ga->kind     = cb_data->kind;
+
+  ga->details.memory.memKind         = cb_data->details.mem_cb.type;
+  ga->details.memory.bytes           = cb_data->details.mem_cb.size;
+  ga->details.memory.correlation_id  = cb_data->details.mem_cb.correlation_id;
+}
 
 
@@ -133,6 +156,10 @@ opencl_activity_translate
       convert_kernel_launch(ga, cb_data, interval.start, interval.end);
       break;
 
+    case GPU_ACTIVITY_MEMORY:
+      convert_memory(ga, cb_data, interval.start, interval.end);
+      break;
+
     default:
       assert(0);
   }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 4ef913475c..367f69cbc3 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -143,11 +143,14 @@
 //******************************************************************************
 
 static atomic_long correlation_id_counter;
-static atomic_ullong opencl_pending_operations;
 static atomic_ullong opencl_h2d_pending_operations;
 static spinlock_t opencl_h2d_lock = SPINLOCK_UNLOCKED;
 static bool instrumentation = false;
 
+static __thread atomic_int opencl_self_pending_operations = { 0 };
+static atomic_int opencl_pending_operations = { 0 };
+static __thread bool opencl_stop_flag = false;
+
 #define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
 #define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
@@ -342,7 +345,7 @@ OPENCL_FN
 // private operations
 //******************************************************************************
 
-static uint64_t
+static uint32_t
 getCorrelationId
 (
  void
@@ -356,53 +359,58 @@ static void
 initializeKernelCallBackInfo
 (
  opencl_object_t *ker_info,
- uint64_t correlation_id
+ uint32_t correlation_id
 )
 {
   ker_info->kind = GPU_ACTIVITY_KERNEL;
   ker_info->details.ker_cb.correlation_id = correlation_id;
+  ker_info->pending_operations = &opencl_self_pending_operations;
 }
 
 
 static void
-initializeMemoryCallBackInfo
+initializeMemcpyCallBackInfo
 (
-  opencl_object_t *mem_info,
-  gpu_memcpy_type_t type,
-  size_t size,
-  uint64_t correlation_id
+ opencl_object_t *cpy_info,
+ gpu_memcpy_type_t type,
+ size_t size,
+ uint32_t correlation_id
 )
 {
-  mem_info->kind = GPU_ACTIVITY_MEMCPY;
-  mem_info->details.mem_cb.type = type;
-  mem_info->details.mem_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
-  mem_info->details.mem_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
-  mem_info->details.mem_cb.size = size;
-
-  mem_info->details.mem_cb.correlation_id = correlation_id;
+  cpy_info->kind = GPU_ACTIVITY_MEMCPY;
+  cpy_info->details.cpy_cb.type = type;
+  cpy_info->details.cpy_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
+  cpy_info->details.cpy_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
+  cpy_info->details.cpy_cb.size = size;
+  cpy_info->details.cpy_cb.correlation_id = correlation_id;
+  cpy_info->pending_operations = &opencl_self_pending_operations;
 }
 
 
 static void
-initializeClSetKernelArgMemoryCallBackInfo
+initializeMemoryCallBackInfo
 (
-  opencl_object_t *mem_info,
-  gpu_memcpy_type_t type,
-  size_t size,
-  uint64_t correlation_id,
-  uint32_t context_id,
-  uint32_t stream_id
+ opencl_object_t *mem_info,
+ cl_mem_flags flags,
+ size_t size,
+ uint32_t correlation_id
 )
 {
-  mem_info->kind = GPU_ACTIVITY_MEMCPY;
-  mem_info->details.mem_cb.type = type;
-  mem_info->details.mem_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
-  mem_info->details.mem_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
-  mem_info->details.mem_cb.size = size;
+  mem_info->kind = GPU_ACTIVITY_MEMORY;
+  if (flags & CL_MEM_USE_HOST_PTR) {
+    // Managed by the host
+    mem_info->details.mem_cb.type = GPU_MEM_MANAGED;
+  } else if (flags & CL_MEM_ALLOC_HOST_PTR) {
+    // Use host memory
+    mem_info->details.mem_cb.type = GPU_MEM_PINNED;
+  } else {
+    // Normal
+    mem_info->details.mem_cb.type = GPU_MEM_DEVICE;
+  }
 
+  mem_info->details.mem_cb.size = size;
   mem_info->details.mem_cb.correlation_id = correlation_id;
-  mem_info->details.context_id = context_id;
-  mem_info->details.stream_id = stream_id;
+  mem_info->pending_operations = &opencl_self_pending_operations;
 }
 
 
@@ -437,34 +445,45 @@ opencl_h2d_pending_operations_adjust
 
 
 static void
-opencl_pending_operations_adjust
+opencl_activity_multiplexer_push
 (
-  int value
+ gpu_interval_t interval,
+ opencl_object_t *cb_data,
+ uint32_t correlation_id
 )
 {
-  atomic_fetch_add(&opencl_pending_operations, value);
+  if (gpu_activity_multiplexer_my_channel_initialized() == false){
+    gpu_activity_multiplexer_my_channel_init();
+  }
+
+  gpu_activity_t gpu_activity;
+  memset(&gpu_activity, 0, sizeof(gpu_activity_t));
+
+  // A pseudo host correlation entry
+  gpu_activity.kind = GPU_ACTIVITY_EXTERNAL_CORRELATION;
+  gpu_activity.details.correlation.correlation_id = correlation_id;
+  gpu_activity.details.correlation.host_correlation_id = correlation_id;
+  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  
+  // The actual entry
+  opencl_activity_translate(&gpu_activity, cb_data, interval);
+  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
 }
 
 
 static void
 opencl_activity_process
 (
-  cl_event event,
-  opencl_object_t *cb_data
+ cl_event event,
+ opencl_object_t *cb_data,
+ uint32_t correlation_id
 )
 {
-  gpu_activity_t gpu_activity;
-
   gpu_interval_t interval;
   memset(&interval, 0, sizeof(gpu_interval_t));
   opencl_timing_info_get(&interval, event);
-  
-  opencl_activity_translate(&gpu_activity, cb_data, interval);
 
-  if (gpu_activity_multiplexer_my_channel_initialized() == false){
-    gpu_activity_multiplexer_my_channel_init();
-  }
-  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  opencl_activity_multiplexer_push(interval, cb_data, correlation_id);
 }
 
 
@@ -476,8 +495,10 @@ opencl_clSetKernelArg_activity_process
 )
 {
   gpu_activity_t gpu_activity;
-  uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
-	size_t size = opencl_h2d_map_entry_size_get(entry); 
+  memset(&gpu_activity, 0, sizeof(gpu_activity_t));
+
+  uint32_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
+  size_t size = opencl_h2d_map_entry_size_get(entry); 
   cb_data->details.ker_cb.correlation_id = correlation_id;
 
   gpu_interval_t interval;
@@ -516,64 +537,63 @@ opencl_isClArgBuffer
   const void *arg
 )
 {
-	/*
-	 * There are 2 scenarios in which opencl_isClArgBuffer will return false
-	 * 1. When clCreateBuffer was not called for arg before calling clSetKernelArg
-	 * 2. clEnqueueWriteBuffer is being called for arg. We shouldnt be recording duplicate H2D calls
-	 * */
+  /*
+   * There are 2 scenarios in which opencl_isClArgBuffer will return false
+   * 1. When clCreateBuffer was not called for arg before calling clSetKernelArg
+   * 2. clEnqueueWriteBuffer is being called for arg. We shouldnt be recording duplicate H2D calls
+   * */
   uint64_t buffer_id = opencl_get_buffer_id(arg);
   bool isBuffer;
   if (buffer_id == BUFFER_ID_INVALID) {
     isBuffer = false;
   } else {
-	  opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
-	  isBuffer = entry ? true : false;
+    opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
+    isBuffer = entry ? true : false;
   }
-	//ETMSG(OPENCL, "opencl_isClArgBuffer. buffer_id: %"PRIu64". isBuffer: %d",	buffer_id, isBuffer);
-	return isBuffer;
+  //ETMSG(OPENCL, "opencl_isClArgBuffer. buffer_id: %"PRIu64". isBuffer: %d",  buffer_id, isBuffer);
+  return isBuffer;
 }
 
 
 static void
 add_H2D_metrics_to_cct_node
 (
-	opencl_h2d_map_entry_t *entry,
-	splay_visit_t visit_type,
-	void *arg
+  opencl_h2d_map_entry_t *entry,
+  splay_visit_t visit_type,
+  void *arg
 )
 {
-	// uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
-	// gpu_correlation_id_map_entry_t *cid_map_entry = 
-	// 	gpu_correlation_id_map_lookup(correlation_id);
-	// if (cid_map_entry == NULL) {
-	// 	ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
-	// 	return;
-	// }
-
-	//opencl_activity_completion_notify();
+  // uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry); 
+  // gpu_correlation_id_map_entry_t *cid_map_entry = 
+  //   gpu_correlation_id_map_lookup(correlation_id);
+  // if (cid_map_entry == NULL) {
+  //   ETMSG(OPENCL, "cid_map_entry for correlation_id: %"PRIu64 " (clSetKernelArg H2D) not found", correlation_id);
+  //   return;
+  // }
+
+  //opencl_activity_completion_notify();
   opencl_object_t *cb_data = opencl_h2d_map_entry_callback_info_get(entry);
   cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
   opencl_cb_basic_print(cb_basic, "Completion_Callback");
 
-	opencl_clSetKernelArg_activity_process(entry, cb_data);
-	uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);
-	//opencl_h2d_map_delete(buffer_id);
-  opencl_h2d_pending_operations_adjust(-1);
-  opencl_pending_operations_adjust(-1);
+  opencl_clSetKernelArg_activity_process(entry, cb_data);
+  uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);
+  //opencl_h2d_map_delete(buffer_id);
+  //opencl_pending_operations_adjust(-1);
 }
 
 
 static void
 opencl_update_ccts_for_setClKernelArg
 (
-	void
+  void
 )
 {
   spinlock_lock(&opencl_h2d_lock);
   uint64_t count = opencl_h2d_map_count();
-	if (atomic_load(&opencl_h2d_pending_operations) > 0) {
-		opencl_update_ccts_for_h2d_nodes(add_H2D_metrics_to_cct_node);
-	}
+  if (atomic_load(&opencl_h2d_pending_operations) > 0) {
+    opencl_update_ccts_for_h2d_nodes(add_H2D_metrics_to_cct_node);
+  }
   spinlock_unlock(&opencl_h2d_lock);
 }
 
@@ -589,6 +609,17 @@ opencl_wait_for_non_clSetKernelArg_pending_operations
 }
 
 
+static void
+opencl_wait_for_self_pending_operations
+(
+  void
+)
+{
+  ETMSG(OPENCL, "pending self operations: %d", atomic_load(&opencl_self_pending_operations));
+  while (atomic_load(&opencl_self_pending_operations) != 0);  // busy-wait until the thread-local counter reads zero
+}
+
+
 static void
 opencl_wait_for_all_pending_operations
 (
@@ -674,8 +705,10 @@ opencl_subscriber_callback
   opencl_object_t *cb_info
 )
 {
+  opencl_stop_flag = true;
+
   gpu_placeholder_type_t placeholder_type;
-  uint64_t correlation_id;
+  uint32_t correlation_id;
 
   if( get_corr_id(cb_info) == CORRELATION_ID_INVALID){
     correlation_id = getCorrelationId();
@@ -683,39 +716,54 @@ opencl_subscriber_callback
     correlation_id = get_corr_id(cb_info);
   }
 
-  opencl_pending_operations_adjust(1);
+  atomic_fetch_add(cb_info->pending_operations, 1);
+  atomic_fetch_add(&opencl_pending_operations, 1);
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
-  gpu_correlation_id_map_insert(correlation_id, correlation_id);
 
   switch (cb_info->kind) {
 
     case GPU_ACTIVITY_MEMCPY:
-      cb_info->details.mem_cb.correlation_id = correlation_id;
-      if (cb_info->details.mem_cb.type == GPU_MEMCPY_H2D){ 
-        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-                                       gpu_placeholder_type_copyin);
+      {
+        cb_info->details.cpy_cb.correlation_id = correlation_id;
+        if (cb_info->details.cpy_cb.type == GPU_MEMCPY_H2D){ 
+          gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
+            gpu_placeholder_type_copyin);
 
-        placeholder_type = gpu_placeholder_type_copyin;
+          placeholder_type = gpu_placeholder_type_copyin;
 
-      }else if (cb_info->details.mem_cb.type == GPU_MEMCPY_D2H){
-        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
-                                       gpu_placeholder_type_copyout);
+        } else if (cb_info->details.cpy_cb.type == GPU_MEMCPY_D2H){
+          gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
+            gpu_placeholder_type_copyout);
 
-        placeholder_type = gpu_placeholder_type_copyout;
+          placeholder_type = gpu_placeholder_type_copyout;
+        }
+        break;
       }
-      break;
 
     case GPU_ACTIVITY_KERNEL:
-      cb_info->details.ker_cb.correlation_id = correlation_id;
-      gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-           gpu_placeholder_type_kernel);
+      {
+        cb_info->details.ker_cb.correlation_id = correlation_id;
+        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
+          gpu_placeholder_type_kernel);
 
-      gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
-				   gpu_placeholder_type_trace);
+        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
+          gpu_placeholder_type_trace);
 
-      placeholder_type = gpu_placeholder_type_kernel;
+        placeholder_type = gpu_placeholder_type_kernel;
+
+        break;
+      }
+
+    case GPU_ACTIVITY_MEMORY:
+      {
+        cb_info->details.mem_cb.correlation_id = correlation_id;
+        gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
+          gpu_placeholder_type_alloc);
+
+        placeholder_type = gpu_placeholder_type_alloc;
+        break;
+      }
 
-      break;
     default:
       assert(0);
   }
@@ -754,16 +802,20 @@ opencl_activity_completion_callback
   cl_basic_callback_t cb_basic = opencl_cb_basic_get(cb_data);
 
   if (event_command_exec_status == CL_COMPLETE) {
-    opencl_in_correlation_map(cb_basic);
+    // TODO(Aaron): multiple threads can call completion callback
+    //opencl_in_correlation_map(cb_basic);
 
     opencl_cb_basic_print(cb_basic, "Completion_Callback");
-    opencl_activity_process(event, cb_data);
+    opencl_activity_process(event, cb_data, cb_basic.correlation_id);
   }
   if (cb_data->isInternalClEvent) {
     HPCRUN_OPENCL_CALL(clReleaseEvent, (event));
   }
+
+  atomic_fetch_add(cb_data->pending_operations, -1);
+  atomic_fetch_add(&opencl_pending_operations, -1);
+
   opencl_free(cb_data);
-  opencl_pending_operations_adjust(-1);
 }
 
 
@@ -861,6 +913,7 @@ clCreateProgramWithSource
 {
   ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
 
+#if 0
   if (strings != NULL && lengths != NULL) {
     FILE *f_ptr;
     for (int i = 0; i < (int)count; i++) {
@@ -875,6 +928,7 @@ clCreateProgramWithSource
     }
     fclose(f_ptr);
   }
+#endif
 
   return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
 }
@@ -1043,18 +1097,18 @@ clEnqueueReadBuffer
 {
   ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper");
 
-  opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID);
-  opencl_subscriber_callback(mem_info);
+  opencl_object_t *cpy_info = opencl_malloc();
+  initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID);
+  opencl_subscriber_callback(cpy_info);
 
   cl_event my_event;
   cl_event *eventp;
   if (!event) {
-    mem_info->isInternalClEvent = true;
+    cpy_info->isInternalClEvent = true;
     eventp = &my_event;
   } else {
     eventp = event;
-    mem_info->isInternalClEvent = false;
+    cpy_info->isInternalClEvent = false;
   }
 
   cl_int return_status =
@@ -1063,13 +1117,13 @@ clEnqueueReadBuffer
                      cb, ptr, num_events_in_wait_list, event_wait_list, eventp));
 
   ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
-                "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+                "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
   ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host",
         (long)cb);
 
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
-                             &opencl_activity_completion_callback, mem_info);
+                             &opencl_activity_completion_callback, cpy_info);
 
   return return_status;
 }
@@ -1090,21 +1144,19 @@ clEnqueueWriteBuffer
 )
 {
   ETMSG(OPENCL, "inside clEnqueueWriteBuffer wrapper. cl_mem buffer: %p", buffer);
-  opencl_h2d_map_delete((uint64_t)buffer);
-	opencl_h2d_pending_operations_adjust(-1);
   //opencl_pending_operations_adjust(-1);
-  opencl_object_t *mem_info = opencl_malloc();
-  initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID);
-  opencl_subscriber_callback(mem_info);
+  opencl_object_t *cpy_info = opencl_malloc();
+  initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID);
+  opencl_subscriber_callback(cpy_info);
 
   cl_event my_event;
   cl_event *eventp;
   if (!event) {
-    mem_info->isInternalClEvent = true;
+    cpy_info->isInternalClEvent = true;
     eventp = &my_event;
   } else {
     eventp = event;
-    mem_info->isInternalClEvent = false;
+    cpy_info->isInternalClEvent = false;
   }
 
   cl_int return_status =
@@ -1113,13 +1165,13 @@ clEnqueueWriteBuffer
                           num_events_in_wait_list, event_wait_list, eventp));
 
   ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
-                "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+                "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
   ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device",
         (long)cb);
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
                              &opencl_activity_completion_callback,
-                             (void*) mem_info);
+                             (void*) cpy_info);
 
   return return_status;
 }
@@ -1128,38 +1180,38 @@ clEnqueueWriteBuffer
 void*
 clEnqueueMapBuffer
 (
-  cl_command_queue command_queue,
-  cl_mem buffer,
-  cl_bool blocking_map,
-  cl_map_flags map_flags,
-  size_t offset,
-  size_t size,
-  cl_uint num_events_in_wait_list,
-  const cl_event* event_wait_list,
-  cl_event* event,
-  cl_int* errcode_ret
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event,
+ cl_int* errcode_ret
 )
 {
   ETMSG(OPENCL, "inside clEnqueueMapBuffer wrapper");
 
-  opencl_object_t *mem_info = opencl_malloc();
+  opencl_object_t *cpy_info = opencl_malloc();
   if (map_flags == CL_MAP_READ) {
-    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_D2H, size, CORRELATION_ID_INVALID);
+    initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_D2H, size, CORRELATION_ID_INVALID);
   } else {
     //map_flags == CL_MAP_WRITE || map_flags == CL_MAP_WRITE_INVALIDATE_REGION
-    initializeMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, CORRELATION_ID_INVALID);
+    initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_H2D, size, CORRELATION_ID_INVALID);
   }
   
-  opencl_subscriber_callback(mem_info);
+  opencl_subscriber_callback(cpy_info);
 
   cl_event my_event;
   cl_event *eventp;
   if (!event) {
-    mem_info->isInternalClEvent = true;
+    cpy_info->isInternalClEvent = true;
     eventp = &my_event;
   } else {
     eventp = event;
-    mem_info->isInternalClEvent = false;
+    cpy_info->isInternalClEvent = false;
   }
 
   void *map_ptr =
@@ -1169,19 +1221,19 @@ clEnqueueMapBuffer
 
   if (map_flags == CL_MAP_READ) {
     ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
-                  "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+                  "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
     ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host",
           (long)size);
   } else {
     ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
-                  "Correlation id: %"PRIu64 "", mem_info->details.mem_cb.correlation_id);
+                  "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
     ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device",
           (long)size);
   }
 
 
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
-                             &opencl_activity_completion_callback, mem_info);
+                             &opencl_activity_completion_callback, cpy_info);
 
   return map_ptr;
 }
@@ -1197,13 +1249,25 @@ clCreateBuffer
  cl_int* errcode_ret
 )
 {
-	uint64_t correlation_id = getCorrelationId();
-	opencl_h2d_pending_operations_adjust(1);
+  uint32_t correlation_id = getCorrelationId();
+
+  opencl_object_t mem_info;
+  initializeMemoryCallBackInfo(&mem_info, flags, size, correlation_id);
+  opencl_subscriber_callback(&mem_info);
+
+  gpu_interval_t interval;
+
+  interval.start = CPU_NANOTIME();
   cl_mem buffer = 
     HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
-  uint64_t buffer_id = (uint64_t)buffer; 
-  //ETMSG(OPENCL, "inside clCreateBuffer wrapper. cl_mem buffer: %p. buffer_id: %"PRIu64"", buffer, buffer_id);
-	opencl_h2d_map_insert(buffer_id, correlation_id, size, NULL);
+  interval.end = CPU_NANOTIME();
+
+  ETMSG(OPENCL, "clCreateBuffer correlation_id: %u, flags: %u, size: %"PRIu64 "", correlation_id, flags, size);
+
+  opencl_activity_multiplexer_push(interval, &mem_info, correlation_id);
+
+  atomic_fetch_add(&opencl_pending_operations, -1);
+  atomic_fetch_add(&opencl_self_pending_operations, -1);
   
   return buffer;
 }
@@ -1218,45 +1282,7 @@ clSetKernelArg
  const void* arg_value
 )
 {
-	bool isClBuffer = opencl_isClArgBuffer(arg_value);
-  //ETMSG(OPENCL, "inside clSetKernelArg wrapper."); //isClBuffer: %d. *(cl_mem*)arg_value: %p",isClBuffer, *(cl_mem*)arg_value
-
-  cl_int return_status = 
-    HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value));
-	if (!isClBuffer) {
-		return return_status;	
-	}
-
-  size_t context_size;
-  cl_int STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, 0, NULL, &context_size);
-  cl_context *context = malloc(sizeof(context_size));
-  STATUS = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, context_size, (void*)context, NULL);
-  
-  //opencl_cl_kernel_map_insert((uint64_t)ocl_kernel, (uint32_t)(*context), command_queue);
-
-  uint32_t context_id = (uint32_t)(*context);
-  opencl_context_map_entry_t *ce = opencl_cl_context_map_lookup((uint64_t)(*context));
-  uint32_t stream_id = opencl_cl_kernel_map_entry_stream_get(ce);
-
-  uint64_t buffer_id = opencl_get_buffer_id(arg_value);
-	opencl_h2d_map_entry_t *entry = opencl_h2d_map_lookup(buffer_id);
-	if (entry) {
-		size_t size = opencl_h2d_map_entry_size_get(entry);
-
-		uint64_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
-    opencl_object_t *mem_info = opencl_malloc();
-    initializeClSetKernelArgMemoryCallBackInfo(mem_info, GPU_MEMCPY_H2D, size, correlation_id, context_id, stream_id);
-    opencl_subscriber_callback(mem_info);
-
-    /* There is no way to record start_time, end_time for the memory transfer that happens as part of clSetKernelArg
-      This is because clSetKernelArg sets argument for a kernel in a context. But in a context, there can be multiple
-      device-queue pairs and opencl does not provide events or listeners to the queue so that we can read the memory operations.
-      Since the memory transfer is async, there are no event handles and we dont know which device is the receiver;
-      the timing information cannot be calculated.
-    */
-  	opencl_h2d_map_insert(buffer_id,correlation_id, size, mem_info);
-	}
-  return return_status;
+  return HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value));
 }
 
 
@@ -1276,10 +1302,24 @@ opencl_api_thread_finalize
  void *args
 )
 {
-	opencl_wait_for_non_clSetKernelArg_pending_operations();
-	opencl_update_ccts_for_setClKernelArg();
-  opencl_wait_for_all_pending_operations();
-  gpu_application_thread_process_activities();
+  if (opencl_stop_flag) {
+    opencl_stop_flag = false;
+
+    opencl_wait_for_self_pending_operations();
+    if (gpu_activity_multiplexer_my_channel_initialized() == false){
+      gpu_activity_multiplexer_my_channel_init();
+    }
+    atomic_bool wait;
+    atomic_store(&wait, true);
+    gpu_activity_t gpu_activity;
+    memset(&gpu_activity, 0, sizeof(gpu_activity_t));
+
+    gpu_activity.kind = GPU_ACTIVITY_FLUSH;
+    gpu_activity.details.flush.wait = &wait;
+    gpu_activity_multiplexer_push(gpu_activity_channel_get(), &gpu_activity);
+    while (atomic_load(&wait)) {}
+    gpu_application_thread_process_activities();
+  }
 }
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index 1c3b43022a..9dca1f1233 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -51,6 +51,7 @@
 //******************************************************************************
 
 #include <lib/prof-lean/bistack.h>
+#include <lib/prof-lean/stdatomic.h>
 #include <hpcrun/gpu/gpu-activity.h>
 
 
@@ -77,7 +78,7 @@ typedef struct opencl_object_channel_t opencl_object_channel_t;
 typedef struct gpu_activity_channel_t gpu_activity_channel_t;
 
 typedef struct cl_basic_callback_t {
-  uint64_t correlation_id;
+  uint32_t correlation_id;
   gpu_activity_kind_t kind;
   gpu_memcpy_type_t type;
   cct_node_t *cct_node;
@@ -85,22 +86,30 @@ typedef struct cl_basic_callback_t {
 
 
 typedef struct cl_kernel_callback_t {
-  uint64_t correlation_id;
+  uint32_t correlation_id;
 } cl_kernel_callback_t;
 
 
-typedef struct cl_memory_callback_t {
-  uint64_t correlation_id;
+typedef struct cl_memcpy_callback_t {
+  uint32_t correlation_id;
   gpu_memcpy_type_t type;
   bool fromHostToDevice;
   bool fromDeviceToHost;
   size_t size;
+} cl_memcpy_callback_t;
+
+
+typedef struct cl_memory_callback_t {
+  uint32_t correlation_id;
+  gpu_mem_type_t type;
+  size_t size;
 } cl_memory_callback_t;
 
 
 typedef struct opencl_object_details_t {
   union {
     cl_kernel_callback_t ker_cb;
+    cl_memcpy_callback_t cpy_cb;
     cl_memory_callback_t mem_cb;
   };
   gpu_activity_channel_t *initiator_channel;
@@ -117,6 +126,7 @@ typedef struct opencl_object_t {
   gpu_activity_kind_t kind;
   bool isInternalClEvent;
   opencl_object_details_t details;
+  atomic_int *pending_operations;
 } opencl_object_t;
 
 
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index 889788b87b..c23510d6fe 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -154,6 +154,7 @@ METHOD_FN(process_event_list, int lush_metrics)
   int nevents = (self->evl).nevents;
   TMSG(OPENCL,"nevents = %d", nevents);
   gpu_metrics_default_enable();
+  gpu_metrics_KINFO_enable();
 
   char* evlist = METHOD_CALL(self, get_event_str);
   char* event = start_tok(evlist);

From eae3adcee2268df8d991bebec62f149aa32349ae Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Wed, 14 Oct 2020 02:11:03 +0000
Subject: [PATCH 099/177] Code cleanup for opencl

---
 src/tool/hpcrun/Makefile.am                   |   3 +-
 src/tool/hpcrun/Makefile.in                   |  18 +-
 .../hpcrun/gpu/gpu-activity-multiplexer.c     |   4 +
 src/tool/hpcrun/gpu/gpu-activity.c            |  16 +-
 src/tool/hpcrun/gpu/gpu-activity.h            |   8 +-
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 332 ++++++++----------
 src/tool/hpcrun/gpu/opencl/opencl-api.h       |  16 +-
 .../hpcrun/gpu/opencl/opencl-context-map.c    | 244 +++++++++++++
 .../hpcrun/gpu/opencl/opencl-context-map.h    |  98 ++++++
 .../hpcrun/gpu/opencl/opencl-memory-manager.c |  33 +-
 .../hpcrun/gpu/opencl/opencl-memory-manager.h |  17 +-
 src/tool/hpcrun/gpu/opencl/opencl-queue-map.c |  64 ++--
 src/tool/hpcrun/gpu/opencl/opencl-queue-map.h |  12 +-
 src/tool/hpcrun/sample-sources/opencl.c       |  11 +-
 14 files changed, 600 insertions(+), 276 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-context-map.c
 create mode 100644 src/tool/hpcrun/gpu/opencl/opencl-context-map.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 94556dcf9e..76e128fc5b 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -517,7 +517,8 @@ MY_OPENCL_FILES = sample-sources/opencl.c \
 	gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/opencl/opencl-h2d-map.c \
-	gpu/opencl/opencl-queue-map.c 
+	gpu/opencl/opencl-queue-map.c \
+	gpu/opencl/opencl-context-map.c 
 endif
 
 if OPT_ENABLE_GTPIN
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index c71920caa9..0a44403079 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -545,6 +545,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/opencl/opencl-api.c gpu/opencl/opencl-memory-manager.c \
 	gpu/opencl/opencl-activity-translate.c \
 	gpu/opencl/opencl-h2d-map.c gpu/opencl/opencl-queue-map.c \
+	gpu/opencl/opencl-context-map.c \
 	gpu/instrumentation/kernel-data-map.c \
 	gpu/instrumentation/gtpin-instrumentation.c \
 	gpu/instrumentation/gtpin-correlation-id-map.c \
@@ -759,7 +760,8 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-activity-translate.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-queue-map.lo
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-queue-map.lo \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-context-map.lo
 @OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
 @OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
@@ -1902,7 +1904,8 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-memory-manager.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-activity-translate.c \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-h2d-map.c \
-@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-queue-map.c 
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-queue-map.c \
+@OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/opencl-context-map.c 
 
 @OPT_ENABLE_GTPIN_TRUE@MY_GTPIN_FILES = \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/kernel-data-map.c \
@@ -2870,6 +2873,9 @@ gpu/opencl/libhpcrun_la-opencl-h2d-map.lo: gpu/opencl/$(am__dirstamp) \
 gpu/opencl/libhpcrun_la-opencl-queue-map.lo:  \
 	gpu/opencl/$(am__dirstamp) \
 	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
+gpu/opencl/libhpcrun_la-opencl-context-map.lo:  \
+	gpu/opencl/$(am__dirstamp) \
+	gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 gpu/instrumentation/$(am__dirstamp):
 	@$(MKDIR_P) gpu/instrumentation
 	@: > gpu/instrumentation/$(am__dirstamp)
@@ -3832,6 +3838,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-h2d-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-memory-manager.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-queue-map.Plo@am__quote@
@@ -5541,6 +5548,13 @@ gpu/opencl/libhpcrun_la-opencl-queue-map.lo: gpu/opencl/opencl-queue-map.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-queue-map.lo `test -f 'gpu/opencl/opencl-queue-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-queue-map.c
 
+gpu/opencl/libhpcrun_la-opencl-context-map.lo: gpu/opencl/opencl-context-map.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/opencl/libhpcrun_la-opencl-context-map.lo -MD -MP -MF gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Tpo -c -o gpu/opencl/libhpcrun_la-opencl-context-map.lo `test -f 'gpu/opencl/opencl-context-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-context-map.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Tpo gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/opencl/opencl-context-map.c' object='gpu/opencl/libhpcrun_la-opencl-context-map.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/opencl/libhpcrun_la-opencl-context-map.lo `test -f 'gpu/opencl/opencl-context-map.c' || echo '$(srcdir)/'`gpu/opencl/opencl-context-map.c
+
 gpu/instrumentation/libhpcrun_la-kernel-data-map.lo: gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/instrumentation/libhpcrun_la-kernel-data-map.lo -MD -MP -MF gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo -c -o gpu/instrumentation/libhpcrun_la-kernel-data-map.lo `test -f 'gpu/instrumentation/kernel-data-map.c' || echo '$(srcdir)/'`gpu/instrumentation/kernel-data-map.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Tpo gpu/instrumentation/$(DEPDIR)/libhpcrun_la-kernel-data-map.Plo
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
index 05ba10d26f..241f8549ed 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
@@ -188,6 +188,10 @@ gpu_activity_multiplexer_push
  gpu_activity_t *gpu_activity
 )
 {
+  if (gpu_activity_multiplexer_my_channel_initialized() == false) {
+    gpu_activity_multiplexer_my_channel_init();
+  }
+
   gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
   gpu_operation_channel_produce(gpu_operation_channel, &item);
 }
diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c
index 03ab59bbe9..7f06d073f9 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.c
+++ b/src/tool/hpcrun/gpu/gpu-activity.c
@@ -141,21 +141,21 @@ gpu_activity_free
 }
 
 void
-set_gpu_instruction
+gpu_instruction_set
 (
-  gpu_instruction_t* insn, 
-  ip_normalized_t pc
+ gpu_instruction_t* insn, 
+ ip_normalized_t pc
 )
 {
   insn->pc = pc;
 }
 
 void
-set_gpu_interval
+gpu_interval_set
 (
-  gpu_interval_t* interval,
-  uint64_t start,
-  uint64_t end
+ gpu_interval_t* interval,
+ uint64_t start,
+ uint64_t end
 )
 {
   interval->start = start;
@@ -188,4 +188,4 @@ gpu_memcpy_type_t type
     FORALL_OPENCL_MEM_TYPES(CODE_TO_STRING)
     default: return "CL_unknown_type";
   }
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 0202060d36..d651fa0333 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -434,11 +434,11 @@ gpu_activity_free
 
 
 void
-set_gpu_interval
+gpu_interval_set
 (
-  gpu_interval_t* interval,
-  uint64_t start,
-  uint64_t end
+ gpu_interval_t* interval,
+ uint64_t start,
+ uint64_t end
 );
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 82c3e631bf..83d0b50112 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -84,7 +84,7 @@
 #include "opencl-memory-manager.h"
 #include "opencl-h2d-map.h"
 #include "opencl-queue-map.h"
-
+#include "opencl-context-map.h"
 
 
 //******************************************************************************
@@ -144,20 +144,24 @@
 // local data
 //******************************************************************************
 
-static atomic_long correlation_id_counter;
 static atomic_ullong opencl_h2d_pending_operations;
-static spinlock_t opencl_h2d_lock = SPINLOCK_UNLOCKED;
-static bool instrumentation = false;
+static atomic_uint correlation_id_counter = { 0 };
+// Global pending operation count for all threads
+static atomic_uint opencl_pending_operations = { 0 };
 
+// The thread itself how many pending operations
 static __thread atomic_int opencl_self_pending_operations = { 0 };
-static atomic_int opencl_pending_operations = { 0 };
+// Mark if a thread has invoked any opencl call
+// If yes, we can flush all opencl activities when the thread terminates
 static __thread bool opencl_stop_flag = false;
 
+static spinlock_t opencl_h2d_lock = SPINLOCK_UNLOCKED;
+static bool instrumentation = false;
+
 #define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
 #define CL_PROGRAM_DEBUG_INFO_INTEL       0x4100
 
 
-
 //----------------------------------------------------------
 // opencl function pointers for late binding
 //----------------------------------------------------------
@@ -281,16 +285,16 @@ OPENCL_ENQUEUEMAPBUFFER_FN
 (
   clEnqueueMapBuffer, 
   (
-    cl_command_queue,
-    cl_mem,
-    cl_bool,
-    cl_map_flags,
-    size_t,
-    size_t,
-    cl_uint,
-    const cl_event*,
-    cl_event*,
-    cl_int*
+   cl_command_queue,
+   cl_mem,
+   cl_bool,
+   cl_map_flags,
+   size_t,
+   size_t,
+   cl_uint,
+   const cl_event*,
+   cl_event*,
+   cl_int*
   )
 );
 
@@ -299,11 +303,11 @@ OPENCL_CREATEBUFFER_FN
 (
   clCreateBuffer,
   (
-    cl_context,
-    cl_mem_flags,
-    size_t,
-    void *,
-    cl_int *
+   cl_context,
+   cl_mem_flags,
+   size_t,
+   void *,
+   cl_int *
   )
 );
 
@@ -312,10 +316,10 @@ OPENCL_FN
 (
   clSetKernelArg,
   (
-    cl_kernel kernel,
-    cl_uint arg_index,
-    size_t arg_size,
-    const void* arg_value
+   cl_kernel kernel,
+   cl_uint arg_index,
+   size_t arg_size,
+   const void* arg_value
   )
 );
 
@@ -374,16 +378,16 @@ static void
 initializeKernelCallBackInfo
 (
  opencl_object_t *ker_info,
- uint64_t correlation_id,
- uint32_t context_id,
- uint32_t stream_id
+ cl_command_queue command_queue
 )
 {
-  ker_info->kind = GPU_ACTIVITY_KERNEL;
-  ker_info->details.ker_cb.correlation_id = correlation_id;
-  ker_info->pending_operations = &opencl_self_pending_operations;
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t queue_id = opencl_cl_queue_map_entry_queue_id_get(qe);
+
   ker_info->details.context_id = context_id;
-  ker_info->details.stream_id = stream_id;
+  ker_info->details.stream_id = queue_id;
+  ker_info->pending_operations = &opencl_self_pending_operations;
 }
 
 
@@ -393,19 +397,20 @@ initializeMemcpyCallBackInfo
  opencl_object_t *cpy_info,
  gpu_memcpy_type_t type,
  size_t size,
- uint32_t correlation_id,
- uint32_t context_id,
- uint32_t stream_id
+ cl_command_queue command_queue
 )
 {
+  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
+  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
+  uint32_t queue_id = opencl_cl_queue_map_entry_queue_id_get(qe);
+
   cpy_info->kind = GPU_ACTIVITY_MEMCPY;
   cpy_info->details.cpy_cb.type = type;
   cpy_info->details.cpy_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
   cpy_info->details.cpy_cb.fromDeviceToHost = (type == GPU_MEMCPY_D2H);
   cpy_info->details.cpy_cb.size = size;
-  cpy_info->details.cpy_cb.correlation_id = correlation_id;
   cpy_info->details.context_id = context_id;
-  cpy_info->details.stream_id = stream_id;
+  cpy_info->details.stream_id = queue_id;
   cpy_info->pending_operations = &opencl_self_pending_operations;
 }
 
@@ -415,8 +420,7 @@ initializeMemoryCallBackInfo
 (
  opencl_object_t *mem_info,
  cl_mem_flags flags,
- size_t size,
- uint32_t correlation_id
+ size_t size
 )
 {
   mem_info->kind = GPU_ACTIVITY_MEMORY;
@@ -432,18 +436,23 @@ initializeMemoryCallBackInfo
   }
 
   mem_info->details.mem_cb.size = size;
-  mem_info->details.mem_cb.correlation_id = correlation_id;
-  mem_info->pending_operations = &opencl_self_pending_operations;
 }
 
 
-static void opencl_activity_completion_notify
-(
- void
-)
-{
-  gpu_monitoring_thread_activities_ready();
-}
+#define INITIALIZE_CALLBACK_INFO(f, obj, args) \
+  f args; \
+  obj->pending_operations = &opencl_self_pending_operations;
+
+
+#define SET_EVENT_POINTER(eventp, event, obj) \
+  cl_event my_event; \
+  if (!event) { \
+    obj->internal_event = true; \
+    eventp = &my_event; \
+  } else { \
+    eventp = event; \
+    obj->internal_event = false; \
+  }
 
 
 // we are dumping the debuginfo since the binary does not have debugsection
@@ -457,7 +466,7 @@ clBuildProgramCallback
 }
 
 
-static void
+static void __attribute__((unused))
 opencl_h2d_pending_operations_adjust
 (
  int value
@@ -471,14 +480,10 @@ static void
 opencl_activity_multiplexer_push
 (
  gpu_interval_t interval,
- opencl_object_t *cb_data,
+ opencl_object_t *obj,
  uint32_t correlation_id
 )
 {
-  if (gpu_activity_multiplexer_my_channel_initialized() == false){
-    gpu_activity_multiplexer_my_channel_init();
-  }
-
   gpu_activity_t gpu_activity;
   memset(&gpu_activity, 0, sizeof(gpu_activity_t));
 
@@ -486,11 +491,11 @@ opencl_activity_multiplexer_push
   gpu_activity.kind = GPU_ACTIVITY_EXTERNAL_CORRELATION;
   gpu_activity.details.correlation.correlation_id = correlation_id;
   gpu_activity.details.correlation.host_correlation_id = correlation_id;
-  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  gpu_activity_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
   
   // The actual entry
-  opencl_activity_translate(&gpu_activity, cb_data, interval);
-  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  opencl_activity_translate(&gpu_activity, obj, interval);
+  gpu_activity_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
 }
 
 
@@ -498,7 +503,7 @@ static void
 opencl_activity_process
 (
  cl_event event,
- opencl_object_t *cb_data,
+ opencl_object_t *obj,
  uint32_t correlation_id
 )
 {
@@ -506,15 +511,15 @@ opencl_activity_process
   memset(&interval, 0, sizeof(gpu_interval_t));
   opencl_timing_info_get(&interval, event);
 
-  opencl_activity_multiplexer_push(interval, cb_data, correlation_id);
+  opencl_activity_multiplexer_push(interval, obj, correlation_id);
 }
 
 
-static void
+static void __attribute__((unused))
 opencl_clSetKernelArg_activity_process
 (
-  opencl_h2d_map_entry_t *entry,
-  opencl_object_t *cb_data
+ opencl_h2d_map_entry_t *entry,
+ opencl_object_t *cb_data
 )
 {
   gpu_activity_t gpu_activity;
@@ -537,10 +542,10 @@ opencl_clSetKernelArg_activity_process
 }
 
 
-static uint64_t
+static uint64_t __attribute__((unused))
 opencl_get_buffer_id
 (
-  const void *arg
+ const void *arg
 )
 {
   if (arg != NULL) {
@@ -549,14 +554,13 @@ opencl_get_buffer_id
   } else {
     return BUFFER_ID_INVALID;
   }
-
 }
 
 
-static bool
+static bool __attribute__((unused))
 opencl_isClArgBuffer
 (
-  const void *arg
+ const void *arg
 )
 {
   /*
@@ -577,7 +581,7 @@ opencl_isClArgBuffer
 }
 
 
-static void
+static void __attribute__((unused))
 add_H2D_metrics_to_cct_node
 (
  opencl_h2d_map_entry_t *entry,
@@ -596,7 +600,7 @@ add_H2D_metrics_to_cct_node
 }
 
 
-static void
+static void __attribute__((unused))
 opencl_update_ccts_for_setClKernelArg
 (
  void
@@ -611,7 +615,7 @@ opencl_update_ccts_for_setClKernelArg
 }
 
 
-static void
+static void __attribute__((unused))
 opencl_wait_for_non_clSetKernelArg_pending_operations
 (
  void
@@ -670,6 +674,7 @@ opencl_cb_basic_get
   return cb_basic;
 }
 
+
 void
 opencl_cb_basic_print
 (
@@ -677,46 +682,31 @@ opencl_cb_basic_print
  char *title
 )
 {
-
   ETMSG(OPENCL, " %s | Activity kind: %s | type: %s | correlation id: %"PRIu64 "| cct_node = %p",
         title,
         gpu_kind_to_string(cb_basic.kind),
         gpu_type_to_string(cb_basic.type),
         cb_basic.correlation_id,
         cb_basic.cct_node);
-
-}
-
-
-static uint64_t
-get_corr_id
-(
- opencl_object_t *cb_info
-){
-  return cb_info->details.ker_cb.correlation_id;
 }
 
 
-
 void
 opencl_subscriber_callback
 (
-  opencl_object_t *cb_info
+ opencl_object_t *cb_info
 )
 {
+  // We invoked an opencl operation
   opencl_stop_flag = true;
 
-  gpu_placeholder_type_t placeholder_type;
-  uint32_t correlation_id;
-
-  if( get_corr_id(cb_info) == CORRELATION_ID_INVALID){
-    correlation_id = getCorrelationId();
-  }else{
-    correlation_id = get_corr_id(cb_info);
-  }
+  uint32_t correlation_id = getCorrelationId();
 
+  // Init operations
   atomic_fetch_add(cb_info->pending_operations, 1);
   atomic_fetch_add(&opencl_pending_operations, 1);
+
+  gpu_placeholder_type_t placeholder_type;
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
 
   switch (cb_info->kind) {
@@ -804,10 +794,11 @@ opencl_activity_completion_callback
     opencl_cb_basic_print(cb_basic, "Completion_Callback");
     opencl_activity_process(event, cb_data, cb_basic.correlation_id);
   }
-  if (cb_data->isInternalClEvent) {
+  if (cb_data->internal_event) {
     HPCRUN_OPENCL_CALL(clReleaseEvent, (event));
   }
 
+  // Finish operations
   atomic_fetch_add(cb_data->pending_operations, -1);
   atomic_fetch_add(&opencl_pending_operations, -1);
 
@@ -835,7 +826,7 @@ opencl_timing_info_get
 
   ETMSG(OPENCL, "duration [%lu, %lu]", commandStart, commandEnd);
 
-  set_gpu_interval(interval, (uint64_t)commandStart, (uint64_t)commandEnd);
+  gpu_interval_set(interval, (uint64_t)commandStart, (uint64_t)commandEnd);
 }
 
 
@@ -862,7 +853,6 @@ opencl_api_initialize
 {
   ETMSG(OPENCL, "CL_TARGET_OPENCL_VERSION: %d", CL_TARGET_OPENCL_VERSION);
   if (instrumentation) {
-    gpu_metrics_GPU_INST_enable();
     gtpin_enable_profiling();
   }
   atomic_store(&correlation_id_counter, 0);
@@ -973,7 +963,9 @@ clCreateCommandQueue
 
   cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
         properties,errcode_ret));
-  opencl_cl_queue_map_insert((uint64_t)queue, (uint32_t)context);
+
+  uint32_t context_id = opencl_cl_context_map_update(context);
+  opencl_cl_queue_map_update((uint64_t)queue, context_id);
   return queue;
 }
 
@@ -1001,7 +993,6 @@ clCreateCommandQueueWithProperties
         queue_props_id = props_count;
         ++props_count;
       } else if (properties[props_count] == CL_QUEUE_SIZE) {
-        // TODO(Keren): A temporay hack
         ++props_count;
       }
       ++props_count;
@@ -1031,7 +1022,9 @@ clCreateCommandQueueWithProperties
     // The property is created by us
     free(queue_properties);
   }
-  opencl_cl_queue_map_insert((uint64_t)queue, (uint32_t)context);
+
+  uint32_t context_id = opencl_cl_context_map_update(context);
+  opencl_cl_queue_map_update(queue, context_id);
   return queue;
 }
 
@@ -1050,23 +1043,14 @@ clEnqueueNDRangeKernel
  cl_event *event
 )
 {
-  opencl_object_t *kernel_info = opencl_malloc();
-  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
-  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
-  uint32_t stream_id = (uint32_t)command_queue;
-  initializeKernelCallBackInfo(kernel_info, CORRELATION_ID_INVALID, context_id, stream_id);
+  opencl_object_t *kernel_info = opencl_malloc_kind(GPU_ACTIVITY_KERNEL);
+  INITIALIZE_CALLBACK_INFO(initializeKernelCallBackInfo, kernel_info, (kernel_info, command_queue))
 
   opencl_subscriber_callback(kernel_info);
 
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    kernel_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    kernel_info->isInternalClEvent = false;
-  }
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, kernel_info)
+
   cl_int return_status =
             HPCRUN_OPENCL_CALL(clEnqueueNDRangeKernel, (command_queue, ocl_kernel, work_dim,
                                 global_work_offset, global_work_size, local_work_size,
@@ -1085,30 +1069,21 @@ clEnqueueNDRangeKernel
 cl_int
 clEnqueueTask
 (
-  cl_command_queue command_queue,
-  cl_kernel kernel,
-  cl_uint num_events_in_wait_list,
-  const cl_event* event_wait_list,
-  cl_event* event
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event
 )
 {
-  opencl_object_t *kernel_info = opencl_malloc();
-  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
-  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
-  uint32_t stream_id = (uint32_t)command_queue;
-  initializeKernelCallBackInfo(kernel_info, CORRELATION_ID_INVALID, context_id, stream_id);
+  opencl_object_t *kernel_info = opencl_malloc_kind(GPU_ACTIVITY_KERNEL);
+  INITIALIZE_CALLBACK_INFO(initializeKernelCallBackInfo, kernel_info, (kernel_info, command_queue))
 
   opencl_subscriber_callback(kernel_info);
 
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    kernel_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    kernel_info->isInternalClEvent = false;
-  }
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, kernel_info);
+
   cl_int return_status =
             HPCRUN_OPENCL_CALL(clEnqueueTask, (command_queue, kernel,
                                 num_events_in_wait_list, event_wait_list, eventp));
@@ -1138,27 +1113,18 @@ clEnqueueReadBuffer
 {
   ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper");
 
-  opencl_object_t *cpy_info = opencl_malloc();
-  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
-  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
-  uint32_t stream_id = (uint32_t)command_queue;
-  initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_D2H, cb, CORRELATION_ID_INVALID, context_id, stream_id);
+  opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY);
+  INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_D2H, cb, command_queue))
+
   opencl_subscriber_callback(cpy_info);
 
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    cpy_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    cpy_info->isInternalClEvent = false;
-  }
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, cpy_info);
 
   cl_int return_status =
-  HPCRUN_OPENCL_CALL(clEnqueueReadBuffer,
-                     (command_queue, buffer, blocking_read, offset,
-                     cb, ptr, num_events_in_wait_list, event_wait_list, eventp));
+    HPCRUN_OPENCL_CALL(clEnqueueReadBuffer,
+      (command_queue, buffer, blocking_read, offset,
+       cb, ptr, num_events_in_wait_list, event_wait_list, eventp));
 
   ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
                 "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
@@ -1188,22 +1154,13 @@ clEnqueueWriteBuffer
 )
 {
   ETMSG(OPENCL, "inside clEnqueueWriteBuffer wrapper. cl_mem buffer: %p", buffer);
-  opencl_object_t *cpy_info = opencl_malloc();
-  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
-  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
-  uint32_t stream_id = (uint32_t)command_queue;
-  initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_H2D, cb, CORRELATION_ID_INVALID, context_id, stream_id);
+  opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY);
+  INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_H2D, cb, command_queue))
+
   opencl_subscriber_callback(cpy_info);
 
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    cpy_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    cpy_info->isInternalClEvent = false;
-  }
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, cpy_info);
 
   cl_int return_status =
   HPCRUN_OPENCL_CALL(clEnqueueWriteBuffer,
@@ -1240,28 +1197,18 @@ clEnqueueMapBuffer
 {
   ETMSG(OPENCL, "inside clEnqueueMapBuffer wrapper");
 
-  opencl_object_t *cpy_info = opencl_malloc();
-  opencl_queue_map_entry_t *qe = opencl_cl_queue_map_lookup((uint64_t)command_queue);
-  uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
-  uint32_t stream_id = (uint32_t)command_queue;
+  opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY);
   if (map_flags == CL_MAP_READ) {
-    initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_D2H, size, CORRELATION_ID_INVALID, context_id, stream_id);
+    INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_D2H, size, command_queue));
   } else {
     //map_flags == CL_MAP_WRITE || map_flags == CL_MAP_WRITE_INVALIDATE_REGION
-    initializeMemcpyCallBackInfo(cpy_info, GPU_MEMCPY_H2D, size, CORRELATION_ID_INVALID, context_id, stream_id);
+    INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_H2D, size, command_queue));
   }
   
   opencl_subscriber_callback(cpy_info);
 
-  cl_event my_event;
-  cl_event *eventp;
-  if (!event) {
-    cpy_info->isInternalClEvent = true;
-    eventp = &my_event;
-  } else {
-    eventp = event;
-    cpy_info->isInternalClEvent = false;
-  }
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, cpy_info);
 
   void *map_ptr =
   HPCRUN_OPENCL_CALL(clEnqueueMapBuffer,
@@ -1280,7 +1227,6 @@ clEnqueueMapBuffer
           (long)size);
   }
 
-
   clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
                              &opencl_activity_completion_callback, cpy_info);
 
@@ -1298,25 +1244,25 @@ clCreateBuffer
  cl_int* errcode_ret
 )
 {
-  uint32_t correlation_id = getCorrelationId();
+  ETMSG(OPENCL, "clCreateBuffer flags: %u, size: %"PRIu64 "", flags, size);
 
-  opencl_object_t mem_info;
-  initializeMemoryCallBackInfo(&mem_info, flags, size, correlation_id);
-  opencl_subscriber_callback(&mem_info);
+  opencl_object_t *mem_info = opencl_malloc_kind(GPU_ACTIVITY_MEMORY);
+  INITIALIZE_CALLBACK_INFO(initializeMemoryCallBackInfo, mem_info, (mem_info, flags, size))
 
-  gpu_interval_t interval;
+  opencl_subscriber_callback(mem_info);
 
+  gpu_interval_t interval;
   interval.start = CPU_NANOTIME();
   cl_mem buffer = 
     HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
   interval.end = CPU_NANOTIME();
 
-  ETMSG(OPENCL, "clCreateBuffer correlation_id: %u, flags: %u, size: %"PRIu64 "", correlation_id, flags, size);
-
-  opencl_activity_multiplexer_push(interval, &mem_info, correlation_id);
+  opencl_activity_multiplexer_push(interval, mem_info, mem_info->details.mem_cb.correlation_id);
 
   atomic_fetch_add(&opencl_pending_operations, -1);
   atomic_fetch_add(&opencl_self_pending_operations, -1);
+
+  opencl_free(mem_info);
   
   return buffer;
 }
@@ -1336,7 +1282,7 @@ clSetKernelArg
 
 
 void
-opencl_enable_instrumentation
+opencl_instrumentation_enable
 (
  void
 )
@@ -1352,12 +1298,10 @@ opencl_api_thread_finalize
 )
 {
   if (opencl_stop_flag) {
+    // If I have invoked any opencl api, I have to attribute all my activities to my ccts
     opencl_stop_flag = false;
-
     opencl_wait_for_self_pending_operations();
-    if (gpu_activity_multiplexer_my_channel_initialized() == false){
-      gpu_activity_multiplexer_my_channel_init();
-    }
+
     atomic_bool wait;
     atomic_store(&wait, true);
     gpu_activity_t gpu_activity;
@@ -1366,7 +1310,12 @@ opencl_api_thread_finalize
     gpu_activity.kind = GPU_ACTIVITY_FLUSH;
     gpu_activity.details.flush.wait = &wait;
     gpu_activity_multiplexer_push(gpu_activity_channel_get(), &gpu_activity);
+
+    // Wait until the activity is flushed
+    // Operation channel is FIFO
     while (atomic_load(&wait)) {}
+
+    // Now I can attribute activities
     gpu_application_thread_process_activities();
   }
 }
@@ -1375,9 +1324,8 @@ opencl_api_thread_finalize
 void
 opencl_api_process_finalize
 (
-void *args
+ void *args
 )
 {
-  opencl_api_thread_finalize(NULL);
   gpu_activity_multiplexer_fini();
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.h b/src/tool/hpcrun/gpu/opencl/opencl-api.h
index 14aa6c2e5a..0614f2ac19 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.h
@@ -67,29 +67,29 @@ typedef struct opencl_object_t opencl_object_t;
 cl_basic_callback_t
 opencl_cb_basic_get
 (
-  opencl_object_t *cb_data
+ opencl_object_t *cb_data
 );
 
 
 void
 opencl_cb_basic_print
 (
-  cl_basic_callback_t cb_basic,
-  char *title
+ cl_basic_callback_t cb_basic,
+ char *title
 );
 
 
 void
 opencl_initialize_correlation_id
 (
-  void
+ void
 );
 
 
 void
 opencl_subscriber_callback
 (
-  opencl_object_t *cb_info
+ opencl_object_t *cb_info
 );
 
 
@@ -142,9 +142,9 @@ opencl_bind
 
 
 void
-opencl_enable_instrumentation
+opencl_instrumentation_enable
 (
-	void
+ void
 );
 
 
@@ -158,7 +158,7 @@ opencl_api_thread_finalize
 void
 opencl_api_process_finalize
 (
-void *
+ void *
 );
 
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-context-map.c b/src/tool/hpcrun/gpu/opencl/opencl-context-map.c
new file mode 100644
index 0000000000..711523a93d
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-context-map.c
@@ -0,0 +1,244 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <assert.h>
+#include <string.h>
+
+
+
+//*****************************************************************************
+// local includes
+//*****************************************************************************
+
+#include <lib/prof-lean/splay-uint64.h>
+#include <lib/prof-lean/spinlock.h>
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-splay-allocator.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+#include "opencl-context-map.h"
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define DEBUG 0
+
+#include "../gpu-print.h"
+
+// Short names that bind the generic typed_splay_* macros to this file's `context` map and its entry type.
+#define st_insert				\
+  typed_splay_insert(context)
+
+#define st_lookup				\
+  typed_splay_lookup(context)
+
+#define st_delete				\
+  typed_splay_delete(context)
+
+#define st_forall				\
+  typed_splay_forall(context)
+
+#define st_count				\
+  typed_splay_count(context)
+
+#define st_alloc(free_list)			\
+  typed_splay_alloc(free_list, opencl_context_map_entry_t)
+
+#define st_free(free_list, node)		\
+  typed_splay_free(free_list, node)
+
+
+
+//*****************************************************************************
+// type declarations
+//*****************************************************************************
+
+#undef typed_splay_node
+#define typed_splay_node(context) opencl_context_map_entry_t
+// Map node: two child links, the 64-bit key, and the 32-bit id recorded for that key.
+typedef struct typed_splay_node(context) {
+  struct typed_splay_node(context) *left;
+  struct typed_splay_node(context) *right;
+  uint64_t context; // key: the value passed to st_lookup / st_delete
+
+  uint32_t context_id; // value of cl_context_id at the time the entry was last written
+} typed_splay_node(context); 
+
+
+//******************************************************************************
+// local data
+//******************************************************************************
+// root of the splay tree; passed to st_lookup/st_insert/st_delete/st_count
+static opencl_context_map_entry_t *map_root = NULL;
+// list handed to st_alloc and st_free for entry nodes
+static opencl_context_map_entry_t *free_list = NULL;
+// held by lookup, update, and delete while they touch the map
+static spinlock_t opencl_context_map_lock = SPINLOCK_UNLOCKED;
+// next context id to hand out; post-incremented in opencl_cl_context_map_update under the lock
+static uint32_t cl_context_id = 0;
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+// Expand typed_splay_impl for `context` (macro defined outside this file; presumably supplies the typed_splay_* operations used via st_* - confirm).
+typed_splay_impl(context)
+
+// Obtain a node for a new entry from st_alloc, using this file's free_list.
+static opencl_context_map_entry_t *
+opencl_cl_context_map_entry_alloc()
+{
+  return st_alloc(&free_list);
+}
+
+// Allocate an entry and store `context` (key) and `context_id` in it; no other field is written here.
+static opencl_context_map_entry_t *
+opencl_cl_context_map_entry_new
+(
+ uint64_t context,
+ uint32_t context_id
+)
+{
+  opencl_context_map_entry_t *e = opencl_cl_context_map_entry_alloc();
+
+  e->context = context;
+  e->context_id = context_id;
+  
+  return e;
+}
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+// Look up `context` while holding the map lock; the st_lookup result is returned after the lock is released.
+opencl_context_map_entry_t *
+opencl_cl_context_map_lookup
+(
+ uint64_t context
+)
+{
+  spinlock_lock(&opencl_context_map_lock);
+
+  uint64_t id = context;
+  opencl_context_map_entry_t *result = st_lookup(&map_root, id);
+
+  spinlock_unlock(&opencl_context_map_lock);
+
+  return result;
+}
+
+// Record the current cl_context_id for `context` (overwrite an existing entry, else insert a new one) and return that id.
+uint32_t
+opencl_cl_context_map_update
+(
+ uint64_t context
+)
+{
+  uint32_t ret_context_id = 0;
+  spinlock_lock(&opencl_context_map_lock);
+
+  opencl_context_map_entry_t *entry = st_lookup(&map_root, context);
+  if (entry) {
+    entry->context = context;
+    entry->context_id = cl_context_id;
+  } else { // key not mapped yet: the inner `entry` declared below shadows the (null) outer one
+    opencl_context_map_entry_t *entry = 
+      opencl_cl_context_map_entry_new(context, cl_context_id);
+
+    st_insert(&map_root, entry);
+  }
+    
+  // Return the id just recorded and advance the counter. NOTE(review): this also re-numbers an already-mapped context - confirm intended.
+  ret_context_id = cl_context_id++;
+
+  spinlock_unlock(&opencl_context_map_lock);
+
+  return ret_context_id;
+}
+
+// Remove the entry keyed by `context` and pass the removed node to st_free, all while holding the map lock.
+void
+opencl_cl_context_map_delete
+(
+ uint64_t context
+)
+{
+  spinlock_lock(&opencl_context_map_lock);
+
+  opencl_context_map_entry_t *node = st_delete(&map_root, context);
+  st_free(&free_list, node); // NOTE(review): node is used unchecked - confirm what st_delete yields for an unmapped key
+
+  spinlock_unlock(&opencl_context_map_lock);
+}
+
+// Accessor for an entry's context_id; `entry` is dereferenced without a NULL check.
+uint32_t
+opencl_cl_context_map_entry_context_id_get
+(
+ opencl_context_map_entry_t *entry
+)
+{
+  return entry->context_id;
+}
+
+
+
+//*****************************************************************************
+// debugging code
+//*****************************************************************************
+// Return st_count for the tree rooted at map_root; the map lock is not taken here.
+uint64_t
+opencl_cl_context_map_count
+(
+ void
+)
+{
+  return st_count(map_root);
+}
+
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-context-map.h b/src/tool/hpcrun/gpu/opencl/opencl-context-map.h
new file mode 100644
index 0000000000..d91d2a9471
--- /dev/null
+++ b/src/tool/hpcrun/gpu/opencl/opencl-context-map.h
@@ -0,0 +1,98 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef opencl_context_map_h
+#define opencl_context_map_h
+
+
+//*****************************************************************************
+// system includes
+//*****************************************************************************
+
+#include <stdint.h>
+
+
+
+//*****************************************************************************
+// type definitions 
+//*****************************************************************************
+
+typedef struct opencl_context_map_entry_t opencl_context_map_entry_t;
+
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+// Look up the map entry for a context key; the entry type is opaque in this header.
+opencl_context_map_entry_t *
+opencl_cl_context_map_lookup
+(
+ uint64_t
+);
+
+// Record the next context id for the given context key (inserting or overwriting its entry) and return that id.
+uint32_t
+opencl_cl_context_map_update
+(
+ uint64_t
+);
+
+// Remove the map entry for the given context key.
+void
+opencl_cl_context_map_delete
+(
+ uint64_t
+);
+
+// Return the context id stored in a map entry.
+uint32_t
+opencl_cl_context_map_entry_context_id_get
+(
+ opencl_context_map_entry_t *entry
+);
+
+
+#endif
+
+
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
index d7895cfd4f..87c161ff18 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
@@ -48,7 +48,6 @@
 #include <string.h>
 
 
-
 //******************************************************************************
 // local includes
 //******************************************************************************
@@ -58,8 +57,6 @@
 
 #include "opencl-memory-manager.h"
 
-
-
 //******************************************************************************
 // type declarations
 //******************************************************************************
@@ -68,16 +65,12 @@ struct opencl_object_channel_t {
   bistack_t bistacks[2];
 };
 
-
-
 //******************************************************************************
 // local data
 //******************************************************************************
 
 static __thread opencl_object_channel_t *opencl_object_channel;
 
-
-
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -105,31 +98,43 @@ opencl_object_channel_get
 }
 
 
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
 
-opencl_object_t*
+opencl_object_t *
 opencl_malloc
 (
-  void
+ void
 )
 {
   opencl_object_channel_t *c = opencl_object_channel_get();
   opencl_object_t *cl_obj = channel_item_alloc(c, opencl_object_t);
+  memset(cl_obj, 0, sizeof(opencl_object_t));
   cl_obj->channel = c;
   return cl_obj;
 }
 
 
+opencl_object_t *
+opencl_malloc_kind
+(
+ gpu_activity_kind_t kind
+)
+{
+  opencl_object_t *cl_obj = opencl_malloc();
+  cl_obj->kind = kind;
+  return cl_obj;
+}
+
+
 void
 opencl_free
 (
-  opencl_object_t *o
+ opencl_object_t *obj
 )
 {
-  memset(o, 0, sizeof(opencl_object_t));
-  opencl_object_channel_t *c = &(o->channel);
-  channel_item_free(c, o);
+  memset(obj, 0, sizeof(opencl_object_t));
+  opencl_object_channel_t *c = &(obj->channel);
+  channel_item_free(c, obj);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
index 9dca1f1233..6523197f15 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.h
@@ -124,7 +124,7 @@ typedef struct opencl_object_t {
   s_element_ptr_t next;
   opencl_object_channel_t *channel;
   gpu_activity_kind_t kind;
-  bool isInternalClEvent;
+  bool internal_event;
   opencl_object_details_t details;
   atomic_int *pending_operations;
 } opencl_object_t;
@@ -135,19 +135,24 @@ typedef struct opencl_object_t {
 // interface operations
 //******************************************************************************
 
-opencl_object_t*
+opencl_object_t *
 opencl_malloc
 (
-  void
+ void
 );
 
 
-void
-opencl_free
+opencl_object_t *
+opencl_malloc_kind
 (
-  opencl_object_t *
+ gpu_activity_kind_t kind
 );
 
 
+void
+opencl_free
+(
+ opencl_object_t *
+);
 
 #endif  //OPENCL_MEMORY_MANAGER_H
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
index b1e1e0fd35..60f7154131 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
@@ -63,7 +63,6 @@
 #include "opencl-queue-map.h"
 
 
-
 //*****************************************************************************
 // macros
 //*****************************************************************************
@@ -74,19 +73,19 @@
 
 
 #define st_insert				\
-  typed_splay_insert(correlation_id)
+  typed_splay_insert(queue)
 
 #define st_lookup				\
-  typed_splay_lookup(correlation_id)
+  typed_splay_lookup(queue)
 
 #define st_delete				\
-  typed_splay_delete(correlation_id)
+  typed_splay_delete(queue)
 
 #define st_forall				\
-  typed_splay_forall(correlation_id)
+  typed_splay_forall(queue)
 
 #define st_count				\
-  typed_splay_count(correlation_id)
+  typed_splay_count(queue)
 
 #define st_alloc(free_list)			\
   typed_splay_alloc(free_list, opencl_queue_map_entry_t)
@@ -101,15 +100,16 @@
 //*****************************************************************************
 
 #undef typed_splay_node
-#define typed_splay_node(correlation_id) opencl_queue_map_entry_t
+#define typed_splay_node(queue) opencl_queue_map_entry_t
 
-typedef struct typed_splay_node(correlation_id) {
-  struct typed_splay_node(correlation_id) *left;
-  struct typed_splay_node(correlation_id) *right;
-  uint64_t queue_id; // key
+typedef struct typed_splay_node(queue) {
+  struct typed_splay_node(queue) *left;
+  struct typed_splay_node(queue) *right;
+  uint64_t queue; // key
 
-  uint32_t context_id; // we save queue id as the stream id
-} typed_splay_node(correlation_id); 
+  uint32_t queue_id; // we save queue id as the stream id
+  uint32_t context_id;
+} typed_splay_node(queue); 
 
 
 //******************************************************************************
@@ -122,11 +122,13 @@ static opencl_queue_map_entry_t *free_list = NULL;
 
 static spinlock_t opencl_queue_map_lock = SPINLOCK_UNLOCKED;
 
+static uint32_t cl_queue_id = 0;
+
 //*****************************************************************************
 // private operations
 //*****************************************************************************
 
-typed_splay_impl(correlation_id)
+typed_splay_impl(queue)
 
 
 static opencl_queue_map_entry_t *
@@ -139,20 +141,20 @@ opencl_cl_queue_map_entry_alloc()
 static opencl_queue_map_entry_t *
 opencl_cl_queue_map_entry_new
 (
- uint64_t queue_id,
+ uint64_t queue,
  uint32_t context_id
 )
 {
   opencl_queue_map_entry_t *e = opencl_cl_queue_map_entry_alloc();
 
-  e->queue_id = queue_id;
+  e->queue = queue;
+  e->queue_id = cl_queue_id;
   e->context_id = context_id;
   
   return e;
 }
 
 
-
 //*****************************************************************************
 // interface operations
 //*****************************************************************************
@@ -160,12 +162,12 @@ opencl_cl_queue_map_entry_new
 opencl_queue_map_entry_t *
 opencl_cl_queue_map_lookup
 (
- uint64_t queue_id
+ uint64_t queue
 )
 {
   spinlock_lock(&opencl_queue_map_lock);
 
-  uint64_t id = queue_id;
+  uint64_t id = queue;
   opencl_queue_map_entry_t *result = st_lookup(&map_root, id);
 
   spinlock_unlock(&opencl_queue_map_lock);
@@ -175,45 +177,53 @@ opencl_cl_queue_map_lookup
 
 
 void
-opencl_cl_queue_map_insert
+opencl_cl_queue_map_update
 (
- uint64_t queue_id, 
+ uint64_t queue, 
  uint32_t context_id
 )
 {
+  uint32_t ret_queue_id = 0;
+
   spinlock_lock(&opencl_queue_map_lock);
 
-  opencl_queue_map_entry_t *entry = st_lookup(&map_root, queue_id);
+  opencl_queue_map_entry_t *entry = st_lookup(&map_root, queue);
   if (entry) {
-    entry->queue_id = queue_id;
+    entry->queue = queue;
+    entry->queue_id = cl_queue_id;
     entry->context_id = context_id;
   } else {
     opencl_queue_map_entry_t *entry = 
-      opencl_cl_queue_map_entry_new(queue_id, context_id);
+      opencl_cl_queue_map_entry_new(queue, context_id);
 
     st_insert(&map_root, entry);
   }
+    
+  // Update cl_queue_id
+  ret_queue_id = cl_queue_id++;
 
   spinlock_unlock(&opencl_queue_map_lock);
+
+  return ret_queue_id;
 }
 
 
 void
 opencl_cl_queue_map_delete
 (
- uint64_t queue_id
+ uint64_t queue
 )
 {
   spinlock_lock(&opencl_queue_map_lock);
 
-  opencl_queue_map_entry_t *node = st_delete(&map_root, queue_id);
+  opencl_queue_map_entry_t *node = st_delete(&map_root, queue);
   st_free(&free_list, node);
 
   spinlock_unlock(&opencl_queue_map_lock);
 }
 
 
-uint64_t
+uint32_t
 opencl_cl_queue_map_entry_queue_id_get
 (
  opencl_queue_map_entry_t *entry
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
index 1d9b6db2d6..ad3133d0ab 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
@@ -59,8 +59,6 @@
 
 typedef struct opencl_queue_map_entry_t opencl_queue_map_entry_t;
 
-
-
 //*****************************************************************************
 // interface operations
 //*****************************************************************************
@@ -73,7 +71,7 @@ opencl_cl_queue_map_lookup
 
 
 void
-opencl_cl_queue_map_insert
+opencl_cl_queue_map_update
 (
  uint64_t, 
  uint32_t
@@ -87,7 +85,7 @@ opencl_cl_queue_map_delete
 );
 
 
-uint64_t
+uint32_t
 opencl_cl_queue_map_entry_queue_id_get
 (
  opencl_queue_map_entry_t *entry
@@ -101,11 +99,5 @@ opencl_cl_queue_map_entry_context_id_get
 );
 
 
-uint64_t
-opencl_h2d_map_count
-(
- void
-);
-
 #endif
 
diff --git a/src/tool/hpcrun/sample-sources/opencl.c b/src/tool/hpcrun/sample-sources/opencl.c
index c23510d6fe..d9b9b17261 100644
--- a/src/tool/hpcrun/sample-sources/opencl.c
+++ b/src/tool/hpcrun/sample-sources/opencl.c
@@ -69,9 +69,9 @@
 #define GPU_STRING "gpu=opencl"
 #define ENABLE_INSTRUMENTATION "gpu=opencl,inst"
 #define NO_THRESHOLD  1L
-static device_finalizer_fn_entry_t device_finalizer_shutdown;
-static device_finalizer_fn_entry_t device_trace_finalizer_shutdown;
 
+static device_finalizer_fn_entry_t device_finalizer_flush;
+static device_finalizer_fn_entry_t device_finalizer_shutdown;
 
 
 //******************************************************************************
@@ -118,7 +118,6 @@ static void
 METHOD_FN(thread_fini_action)
 {
   TMSG(OPENCL, "thread_fini_action");
-  opencl_api_thread_finalize(NULL);
 }
 
 
@@ -164,7 +163,8 @@ METHOD_FN(process_event_list, int lush_metrics)
 
   if (hpcrun_ev_is(opencl_name, GPU_STRING)) {
   } else if (hpcrun_ev_is(opencl_name, ENABLE_INSTRUMENTATION)) {
-    opencl_enable_instrumentation();
+    gpu_metrics_GPU_INST_enable();
+    opencl_instrumentation_enable();
   }
 }
 
@@ -180,6 +180,9 @@ METHOD_FN(finalize_event_list)
   #endif
   opencl_api_initialize();
 
+  device_finalizer_flush.fn = opencl_api_thread_finalize;
+  device_finalizer_register(device_finalizer_type_flush, &device_finalizer_flush);
+
   device_finalizer_shutdown.fn = opencl_api_process_finalize;
   device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_shutdown);
 }

From 8c0447f4488bf0f8c40a403f4b36abaea89db70d Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Wed, 14 Oct 2020 03:39:37 +0000
Subject: [PATCH 100/177] Fix warnings and bugs

---
 .../gpu/opencl/opencl-activity-translate.c    |  6 +++---
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 20 +++++++++++--------
 .../hpcrun/gpu/opencl/opencl-memory-manager.c |  3 +--
 src/tool/hpcrun/gpu/opencl/opencl-queue-map.c |  2 +-
 src/tool/hpcrun/gpu/opencl/opencl-queue-map.h |  2 +-
 5 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
index 043791cfd5..6cd3339fc6 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-activity-translate.c
@@ -76,7 +76,7 @@ convert_kernel_launch
 {
   memset(&ga->details.kernel, 0, sizeof(gpu_kernel_t));
   if (start_time != 0 && end_time != 0) {
-    set_gpu_interval(&ga->details.interval, start_time, end_time);
+    gpu_interval_set(&ga->details.interval, start_time, end_time);
   }
 
   ga->kind     = cb_data->kind;
@@ -100,7 +100,7 @@ convert_memcpy
 {
   memset(&ga->details.memcpy, 0, sizeof(gpu_memcpy_t));
   if (start_time != 0 && end_time != 0) {
-    set_gpu_interval(&ga->details.interval, start_time, end_time);
+    gpu_interval_set(&ga->details.interval, start_time, end_time);
   }
 
   ga->kind     = cb_data->kind;
@@ -126,7 +126,7 @@ convert_memory
 {
   memset(&ga->details.memory, 0, sizeof(gpu_memory_t));
   if (start_time != 0 && end_time != 0) {
-    set_gpu_interval(&ga->details.interval, start_time, end_time);
+    gpu_interval_set(&ga->details.interval, start_time, end_time);
   }
 
   ga->kind     = cb_data->kind;
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 83d0b50112..a816a3228c 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -385,6 +385,8 @@ initializeKernelCallBackInfo
   uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
   uint32_t queue_id = opencl_cl_queue_map_entry_queue_id_get(qe);
 
+  ETMSG(OPENCL, "submit kernel to context %u queue %u\n", context_id, queue_id);
+
   ker_info->details.context_id = context_id;
   ker_info->details.stream_id = queue_id;
   ker_info->pending_operations = &opencl_self_pending_operations;
@@ -404,6 +406,8 @@ initializeMemcpyCallBackInfo
   uint32_t context_id = opencl_cl_queue_map_entry_context_id_get(qe);
   uint32_t queue_id = opencl_cl_queue_map_entry_queue_id_get(qe);
 
+  ETMSG(OPENCL, "submit memcpy to context %u queue %u\n", context_id, queue_id);
+
   cpy_info->kind = GPU_ACTIVITY_MEMCPY;
   cpy_info->details.cpy_cb.type = type;
   cpy_info->details.cpy_cb.fromHostToDevice = (type == GPU_MEMCPY_H2D);
@@ -526,7 +530,7 @@ opencl_clSetKernelArg_activity_process
   memset(&gpu_activity, 0, sizeof(gpu_activity_t));
 
   uint32_t correlation_id = opencl_h2d_map_entry_correlation_get(entry);
-  size_t size = opencl_h2d_map_entry_size_get(entry); 
+  opencl_h2d_map_entry_size_get(entry); 
   cb_data->details.ker_cb.correlation_id = correlation_id;
 
   gpu_interval_t interval;
@@ -594,9 +598,8 @@ add_H2D_metrics_to_cct_node
   opencl_cb_basic_print(cb_basic, "Completion_Callback");
 
   opencl_clSetKernelArg_activity_process(entry, cb_data);
-  uint64_t buffer_id = opencl_h2d_map_entry_buffer_id_get(entry);
+  opencl_h2d_map_entry_buffer_id_get(entry);
   opencl_h2d_pending_operations_adjust(-1);
-  opencl_pending_operations_adjust(-1);
 }
 
 
@@ -607,7 +610,7 @@ opencl_update_ccts_for_setClKernelArg
 )
 {
   spinlock_lock(&opencl_h2d_lock);
-  uint64_t count = opencl_h2d_map_count();
+  opencl_h2d_map_count();
   if (atomic_load(&opencl_h2d_pending_operations) > 0) {
     opencl_update_ccts_for_h2d_nodes(add_H2D_metrics_to_cct_node);
   }
@@ -637,7 +640,7 @@ opencl_wait_for_self_pending_operations
 }
 
 
-static void
+static void __attribute__((unused))
 opencl_wait_for_all_pending_operations
 (
  void
@@ -964,8 +967,9 @@ clCreateCommandQueue
   cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
         properties,errcode_ret));
 
-  uint32_t context_id = opencl_cl_context_map_update(context);
+  uint32_t context_id = opencl_cl_context_map_update((uint64_t)context);
   opencl_cl_queue_map_update((uint64_t)queue, context_id);
+
   return queue;
 }
 
@@ -1023,8 +1027,8 @@ clCreateCommandQueueWithProperties
     free(queue_properties);
   }
 
-  uint32_t context_id = opencl_cl_context_map_update(context);
-  opencl_cl_queue_map_update(queue, context_id);
+  uint32_t context_id = opencl_cl_context_map_update((uint64_t)context);
+  opencl_cl_queue_map_update((uint64_t)queue, context_id);
   return queue;
 }
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
index 87c161ff18..0e555b38be 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-memory-manager.c
@@ -134,7 +134,6 @@ opencl_free
  opencl_object_t *obj
 )
 {
-  memset(obj, 0, sizeof(opencl_object_t));
-  opencl_object_channel_t *c = &(obj->channel);
+  opencl_object_channel_t *c = obj->channel;
   channel_item_free(c, obj);
 }
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
index 60f7154131..099fc3064d 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.c
@@ -176,7 +176,7 @@ opencl_cl_queue_map_lookup
 }
 
 
-void
+uint32_t
 opencl_cl_queue_map_update
 (
  uint64_t queue, 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
index ad3133d0ab..63393888de 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
+++ b/src/tool/hpcrun/gpu/opencl/opencl-queue-map.h
@@ -70,7 +70,7 @@ opencl_cl_queue_map_lookup
 );
 
 
-void
+uint32_t
 opencl_cl_queue_map_update
 (
  uint64_t, 

From 9adb6d13e1fdf5d5f75ae1fb3547b4a36b598a0a Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Wed, 14 Oct 2020 03:46:06 +0000
Subject: [PATCH 101/177] Remove log files generated on jlse

---
 src/tool/hpcrun/gpu/214002.cobaltlog | 25 -------------------------
 src/tool/hpcrun/gpu/214002.error     |  1 -
 src/tool/hpcrun/gpu/214002.output    |  0
 3 files changed, 26 deletions(-)
 delete mode 100644 src/tool/hpcrun/gpu/214002.cobaltlog
 delete mode 100644 src/tool/hpcrun/gpu/214002.error
 delete mode 100644 src/tool/hpcrun/gpu/214002.output

diff --git a/src/tool/hpcrun/gpu/214002.cobaltlog b/src/tool/hpcrun/gpu/214002.cobaltlog
deleted file mode 100644
index 17ac3cc0e8..0000000000
--- a/src/tool/hpcrun/gpu/214002.cobaltlog
+++ /dev/null
@@ -1,25 +0,0 @@
-Jobid: 214002
-qsub -I -t 30 -n 1 -q iris
-Thu Oct 01 23:59:10 2020 +0000 (UTC) submitted with cwd set to: /home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu
-jobid 214002 submitted from terminal /dev/pts/206
-Thu Oct 01 23:59:28 2020 +0000 (UTC) 
-Thu Oct 01 23:59:28 2020 +0000 (UTC) Command: '/usr/bin/ssh' 'iris09' '/usr/libexec/cobalt/cobalt-launcher.py' '--nf' '/var/tmp/cobalt.214002' '--jobid' '214002' '--cwd' '/home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu' '--env' 'COBALT_JOBID=214002' '/bin/sleep' '1800'
-Thu Oct 01 23:59:28 2020 +0000 (UTC) 
-Thu Oct 01 23:59:28 2020 +0000 (UTC) Environment:
-Thu Oct 01 23:59:28 2020 +0000 (UTC) SHELL=/bin/bash
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_NODEFILE=/tmp/tmpR3QupI
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_PARTNAME=iris09
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_JOBID=214002
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_STARTTIME=1601596758
-Thu Oct 01 23:59:28 2020 +0000 (UTC) LOGNAME=jokeren
-Thu Oct 01 23:59:28 2020 +0000 (UTC) USER=jokeren
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_ENDTIME=1601598558
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_PARTSIZE=1
-Thu Oct 01 23:59:28 2020 +0000 (UTC) HOME=/home/jokeren
-Thu Oct 01 23:59:28 2020 +0000 (UTC) COBALT_JOBSIZE=1
-Thu Oct 01 23:59:28 2020 +0000 (UTC) 
-Thu Oct 01 23:59:28 2020 +0000 (UTC) Info: stdin received from /dev/null
-Thu Oct 01 23:59:28 2020 +0000 (UTC) Info: stdout sent to /home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu/214002.output
-Thu Oct 01 23:59:28 2020 +0000 (UTC) Info: stderr sent to /home/jokeren/Codes/hpctoolkit_aaron/src/tool/hpcrun/gpu/214002.error
-Thu Oct 01 23:59:28 2020 +0000 (UTC) 
-Fri Oct 02 00:10:57 2020 +0000 (UTC) Info: user delete requested with signal SIGTERM by user jokeren
diff --git a/src/tool/hpcrun/gpu/214002.error b/src/tool/hpcrun/gpu/214002.error
deleted file mode 100644
index b3be75cc7e..0000000000
--- a/src/tool/hpcrun/gpu/214002.error
+++ /dev/null
@@ -1 +0,0 @@
-Killed by signal 15.
diff --git a/src/tool/hpcrun/gpu/214002.output b/src/tool/hpcrun/gpu/214002.output
deleted file mode 100644
index e69de29bb2..0000000000

From 74cbfec897d78278d868f863847413bec27d2f08 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 15 Oct 2020 02:04:17 +0000
Subject: [PATCH 102/177] Rename activity-multiplexer to operation-multiplexer

---
 src/tool/hpcrun/Makefile.am                   |  2 +-
 src/tool/hpcrun/Makefile.in                   | 85 ++++++++++---------
 .../hpcrun/gpu/gpu-operation-channel-set.c    |  4 +-
 ...tiplexer.c => gpu-operation-multiplexer.c} | 32 +++----
 ...tiplexer.h => gpu-operation-multiplexer.h} | 27 ++----
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 15 ++--
 6 files changed, 75 insertions(+), 90 deletions(-)
 rename src/tool/hpcrun/gpu/{gpu-activity-multiplexer.c => gpu-operation-multiplexer.c} (87%)
 rename src/tool/hpcrun/gpu/{gpu-activity-multiplexer.h => gpu-operation-multiplexer.h} (93%)

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 76e128fc5b..bda36a2502 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -357,7 +357,6 @@ MY_BASE_FILES =				\
 	hpcrun-placeholders.c 		\
 	gpu/gpu-activity.c 		\
 	gpu/gpu-activity-channel.c 	\
-	gpu/gpu-activity-multiplexer.c 	\
 	gpu/gpu-activity-process.c 	\
 	gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c \
@@ -378,6 +377,7 @@ MY_BASE_FILES =				\
 	gpu/gpu-operation-item-process.c   \
 	gpu/gpu-operation-channel.c \
 	gpu/gpu-operation-channel-set.c \
+	gpu/gpu-operation-multiplexer.c 	\
 	gpu/gpu-splay-allocator.c	\
 	gpu/gpu-stream-id-map.c		\
 	gpu/gpu-trace.c			\
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 0a44403079..c0fdae4d20 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -479,8 +479,8 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	memory/mem.c memory/mmap.c messages/debug-flag.c \
 	messages/messages-sync.c messages/messages-async.c \
 	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-multiplexer.c \
-	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
+	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
+	gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -490,8 +490,9 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-monitoring-thread-api.c gpu/gpu-op-placeholders.c \
 	gpu/gpu-operation-item.c gpu/gpu-operation-item-process.c \
 	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
-	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
-	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
+	gpu/gpu-operation-multiplexer.c gpu/gpu-splay-allocator.c \
+	gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
+	gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
 	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
 	ompt/ompt-defer-write.c ompt/ompt-interface.c \
@@ -643,7 +644,6 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	libhpcrun_la-hpcrun-placeholders.lo \
 	gpu/libhpcrun_la-gpu-activity.lo \
 	gpu/libhpcrun_la-gpu-activity-channel.lo \
-	gpu/libhpcrun_la-gpu-activity-multiplexer.lo \
 	gpu/libhpcrun_la-gpu-activity-process.lo \
 	gpu/libhpcrun_la-gpu-application-thread-api.lo \
 	gpu/libhpcrun_la-gpu-channel-item-allocator.lo \
@@ -664,6 +664,7 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	gpu/libhpcrun_la-gpu-operation-item-process.lo \
 	gpu/libhpcrun_la-gpu-operation-channel.lo \
 	gpu/libhpcrun_la-gpu-operation-channel-set.lo \
+	gpu/libhpcrun_la-gpu-operation-multiplexer.lo \
 	gpu/libhpcrun_la-gpu-splay-allocator.lo \
 	gpu/libhpcrun_la-gpu-stream-id-map.lo \
 	gpu/libhpcrun_la-gpu-trace.lo \
@@ -912,8 +913,8 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	memory/mem.c memory/mmap.c messages/debug-flag.c \
 	messages/messages-sync.c messages/messages-async.c \
 	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-multiplexer.c \
-	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
+	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
+	gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -923,8 +924,9 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-monitoring-thread-api.c gpu/gpu-op-placeholders.c \
 	gpu/gpu-operation-item.c gpu/gpu-operation-item-process.c \
 	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
-	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
-	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
+	gpu/gpu-operation-multiplexer.c gpu/gpu-splay-allocator.c \
+	gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
+	gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
 	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
 	ompt/ompt-defer-write.c ompt/ompt-interface.c \
@@ -1072,7 +1074,6 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-hpcrun-placeholders.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-activity.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-activity-channel.$(OBJEXT) \
-	gpu/libhpcrun_o-gpu-activity-multiplexer.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-activity-process.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-application-thread-api.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-channel-item-allocator.$(OBJEXT) \
@@ -1093,6 +1094,7 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-operation-item-process.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-operation-channel.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-operation-channel-set.$(OBJEXT) \
+	gpu/libhpcrun_o-gpu-operation-multiplexer.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-splay-allocator.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-stream-id-map.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace.$(OBJEXT) \
@@ -1821,8 +1823,8 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	memory/mem.c memory/mmap.c messages/debug-flag.c \
 	messages/messages-sync.c messages/messages-async.c \
 	messages/fmt.c hpcrun-placeholders.c gpu/gpu-activity.c \
-	gpu/gpu-activity-channel.c gpu/gpu-activity-multiplexer.c \
-	gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
+	gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
+	gpu/gpu-application-thread-api.c \
 	gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
 	gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
 	gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -1832,8 +1834,9 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	gpu/gpu-monitoring-thread-api.c gpu/gpu-op-placeholders.c \
 	gpu/gpu-operation-item.c gpu/gpu-operation-item-process.c \
 	gpu/gpu-operation-channel.c gpu/gpu-operation-channel-set.c \
-	gpu/gpu-splay-allocator.c gpu/gpu-stream-id-map.c \
-	gpu/gpu-trace.c gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
+	gpu/gpu-operation-multiplexer.c gpu/gpu-splay-allocator.c \
+	gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
+	gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
 	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
 	ompt/ompt-defer-write.c ompt/ompt-interface.c \
@@ -2516,8 +2519,6 @@ gpu/libhpcrun_la-gpu-activity.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-activity-channel.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
-gpu/libhpcrun_la-gpu-activity-multiplexer.lo: gpu/$(am__dirstamp) \
-	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-activity-process.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-application-thread-api.lo: gpu/$(am__dirstamp) \
@@ -2558,6 +2559,8 @@ gpu/libhpcrun_la-gpu-operation-channel.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-operation-channel-set.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_la-gpu-operation-multiplexer.lo: gpu/$(am__dirstamp) \
+	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-splay-allocator.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-stream-id-map.lo: gpu/$(am__dirstamp) \
@@ -3200,8 +3203,6 @@ gpu/libhpcrun_o-gpu-activity.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-activity-channel.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
-gpu/libhpcrun_o-gpu-activity-multiplexer.$(OBJEXT):  \
-	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-activity-process.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-application-thread-api.$(OBJEXT):  \
@@ -3242,6 +3243,8 @@ gpu/libhpcrun_o-gpu-operation-channel.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-operation-channel-set.$(OBJEXT):  \
 	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/libhpcrun_o-gpu-operation-multiplexer.$(OBJEXT):  \
+	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-splay-allocator.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-stream-id-map.$(OBJEXT): gpu/$(am__dirstamp) \
@@ -3746,7 +3749,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_common.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@fnbounds/$(DEPDIR)/libhpcrun_o-fnbounds_static.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-channel.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-activity.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-application-thread-api.Plo@am__quote@
@@ -3768,6 +3770,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-channel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item-process.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-item.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-multiplexer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-stream-id-map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-channel-set.Plo@am__quote@
@@ -3776,7 +3779,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace-item.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_la-gpu-trace.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-channel.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-activity.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-application-thread-api.Po@am__quote@
@@ -3798,6 +3800,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-channel.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item-process.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-item.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-stream-id-map.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-channel-set.Po@am__quote@
@@ -4799,13 +4802,6 @@ gpu/libhpcrun_la-gpu-activity-channel.lo: gpu/gpu-activity-channel.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-activity-channel.lo `test -f 'gpu/gpu-activity-channel.c' || echo '$(srcdir)/'`gpu/gpu-activity-channel.c
 
-gpu/libhpcrun_la-gpu-activity-multiplexer.lo: gpu/gpu-activity-multiplexer.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-activity-multiplexer.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Tpo -c -o gpu/libhpcrun_la-gpu-activity-multiplexer.lo `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-multiplexer.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-activity-multiplexer.c' object='gpu/libhpcrun_la-gpu-activity-multiplexer.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-activity-multiplexer.lo `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
-
 gpu/libhpcrun_la-gpu-activity-process.lo: gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-activity-process.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Tpo -c -o gpu/libhpcrun_la-gpu-activity-process.lo `test -f 'gpu/gpu-activity-process.c' || echo '$(srcdir)/'`gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-activity-process.Plo
@@ -4946,6 +4942,13 @@ gpu/libhpcrun_la-gpu-operation-channel-set.lo: gpu/gpu-operation-channel-set.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-operation-channel-set.lo `test -f 'gpu/gpu-operation-channel-set.c' || echo '$(srcdir)/'`gpu/gpu-operation-channel-set.c
 
+gpu/libhpcrun_la-gpu-operation-multiplexer.lo: gpu/gpu-operation-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-operation-multiplexer.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-multiplexer.Tpo -c -o gpu/libhpcrun_la-gpu-operation-multiplexer.lo `test -f 'gpu/gpu-operation-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-operation-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-operation-multiplexer.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-multiplexer.c' object='gpu/libhpcrun_la-gpu-operation-multiplexer.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-operation-multiplexer.lo `test -f 'gpu/gpu-operation-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-operation-multiplexer.c
+
 gpu/libhpcrun_la-gpu-splay-allocator.lo: gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_la-gpu-splay-allocator.lo -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Tpo -c -o gpu/libhpcrun_la-gpu-splay-allocator.lo `test -f 'gpu/gpu-splay-allocator.c' || echo '$(srcdir)/'`gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Tpo gpu/$(DEPDIR)/libhpcrun_la-gpu-splay-allocator.Plo
@@ -6864,20 +6867,6 @@ gpu/libhpcrun_o-gpu-activity-channel.obj: gpu/gpu-activity-channel.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-activity-channel.obj `if test -f 'gpu/gpu-activity-channel.c'; then $(CYGPATH_W) 'gpu/gpu-activity-channel.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-activity-channel.c'; fi`
 
-gpu/libhpcrun_o-gpu-activity-multiplexer.o: gpu/gpu-activity-multiplexer.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-activity-multiplexer.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.o `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-activity-multiplexer.c' object='gpu/libhpcrun_o-gpu-activity-multiplexer.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.o `test -f 'gpu/gpu-activity-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-activity-multiplexer.c
-
-gpu/libhpcrun_o-gpu-activity-multiplexer.obj: gpu/gpu-activity-multiplexer.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-activity-multiplexer.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.obj `if test -f 'gpu/gpu-activity-multiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-activity-multiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-activity-multiplexer.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-multiplexer.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-activity-multiplexer.c' object='gpu/libhpcrun_o-gpu-activity-multiplexer.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-activity-multiplexer.obj `if test -f 'gpu/gpu-activity-multiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-activity-multiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-activity-multiplexer.c'; fi`
-
 gpu/libhpcrun_o-gpu-activity-process.o: gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-activity-process.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Tpo -c -o gpu/libhpcrun_o-gpu-activity-process.o `test -f 'gpu/gpu-activity-process.c' || echo '$(srcdir)/'`gpu/gpu-activity-process.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-activity-process.Po
@@ -7158,6 +7147,20 @@ gpu/libhpcrun_o-gpu-operation-channel-set.obj: gpu/gpu-operation-channel-set.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-channel-set.obj `if test -f 'gpu/gpu-operation-channel-set.c'; then $(CYGPATH_W) 'gpu/gpu-operation-channel-set.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-channel-set.c'; fi`
 
+gpu/libhpcrun_o-gpu-operation-multiplexer.o: gpu/gpu-operation-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-multiplexer.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-operation-multiplexer.o `test -f 'gpu/gpu-operation-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-operation-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-multiplexer.c' object='gpu/libhpcrun_o-gpu-operation-multiplexer.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-multiplexer.o `test -f 'gpu/gpu-operation-multiplexer.c' || echo '$(srcdir)/'`gpu/gpu-operation-multiplexer.c
+
+gpu/libhpcrun_o-gpu-operation-multiplexer.obj: gpu/gpu-operation-multiplexer.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-operation-multiplexer.obj -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Tpo -c -o gpu/libhpcrun_o-gpu-operation-multiplexer.obj `if test -f 'gpu/gpu-operation-multiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-operation-multiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-multiplexer.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-operation-multiplexer.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/gpu-operation-multiplexer.c' object='gpu/libhpcrun_o-gpu-operation-multiplexer.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-operation-multiplexer.obj `if test -f 'gpu/gpu-operation-multiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-operation-multiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-operation-multiplexer.c'; fi`
+
 gpu/libhpcrun_o-gpu-splay-allocator.o: gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/libhpcrun_o-gpu-splay-allocator.o -MD -MP -MF gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Tpo -c -o gpu/libhpcrun_o-gpu-splay-allocator.o `test -f 'gpu/gpu-splay-allocator.c' || echo '$(srcdir)/'`gpu/gpu-splay-allocator.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Tpo gpu/$(DEPDIR)/libhpcrun_o-gpu-splay-allocator.Po
diff --git a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
index 4c6cb6dd41..fb97449985 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-channel-set.c
@@ -53,7 +53,7 @@
 
 
 #include "gpu-activity-process.h"
-#include "gpu-activity-multiplexer.h"
+#include "gpu-operation-multiplexer.h"
 #include "gpu-operation-item.h"
 #include "gpu-operation-channel.h"
 #include "gpu-operation-channel-set.h"
@@ -215,4 +215,4 @@ gpu_operation_channel_set_notify
 )
 {
   gpu_operation_channel_set_apply(gpu_operation_channel_signal_consumer, channel_num);
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
similarity index 87%
rename from src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
rename to src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
index 241f8549ed..821476ede6 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
@@ -49,9 +49,9 @@
 
 #define DEBUG 0
 
+#include "gpu-operation-multiplexer.h"
 #include "gpu-activity.h"
 #include "gpu-activity-channel.h"
-#include "gpu-activity-multiplexer.h"
 #include "gpu-activity-process.h"
 #include "gpu-monitoring-thread-api.h"
 #include "gpu-operation-channel-set.h"
@@ -70,7 +70,7 @@ typedef void *(*pthread_start_routine_t)(void *);
 // local variables
 //******************************************************************************
 
-static _Atomic(bool) stop_activity_flag;
+static _Atomic(bool) stop_operation_flag;
 static _Atomic(bool) gpu_trace_finished;
 
 static atomic_uint operation_channels_count;
@@ -94,14 +94,14 @@ gpu_init_operation_channel(){
 
 
 static void *
-gpu_activity_record
+gpu_operation_record
 (
  void
 )
 {
   int current_operation_channels_count;
 
-  while (!atomic_load(&stop_activity_flag)){
+  while (!atomic_load(&stop_operation_flag)){
     current_operation_channels_count = atomic_load(&operation_channels_count);
     gpu_operation_channel_set_process(current_operation_channels_count);
   }
@@ -118,13 +118,13 @@ gpu_activity_record
 
 
 static void
-gpu_activity_multiplexer_create
+gpu_operation_multiplexer_create
 (
  void
 )
 {
   pthread_t thread;
-  atomic_store(&stop_activity_flag, false);
+  atomic_store(&stop_operation_flag, false);
   atomic_store(&gpu_trace_finished, false);
   atomic_store(&operation_channels_count, 0);
 
@@ -134,7 +134,7 @@ gpu_activity_multiplexer_create
   gpu_operation_channel_set_alloc(max_completion_cb_threads);
 
   // You are the first to create monitor thread
-  pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_activity_record,
+  pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_operation_record,
                  NULL);
 }
 
@@ -145,7 +145,7 @@ gpu_activity_multiplexer_create
 //******************************************************************************
 
 bool
-gpu_activity_multiplexer_my_channel_initialized
+gpu_operation_multiplexer_my_channel_initialized
 (
  void
 )
@@ -155,25 +155,25 @@ gpu_activity_multiplexer_my_channel_initialized
 
 
 void
-gpu_activity_multiplexer_my_channel_init
+gpu_operation_multiplexer_my_channel_init
 (
  void
 )
 {
-  pthread_once(&is_initialized, gpu_activity_multiplexer_create);
+  pthread_once(&is_initialized, gpu_operation_multiplexer_create);
   gpu_init_operation_channel();
 }
 
 
 void
-gpu_activity_multiplexer_fini
+gpu_operation_multiplexer_fini
 (
  void
 )
 {
-  PRINT("gpu_activity_multiplexer_fini called\n");
+  PRINT("gpu_operation_multiplexer_fini called\n");
 
-  atomic_store(&stop_activity_flag, true);
+  atomic_store(&stop_operation_flag, true);
 
   gpu_operation_channel_set_notify(atomic_load(&operation_channels_count));
 
@@ -182,14 +182,14 @@ gpu_activity_multiplexer_fini
 
 
 void
-gpu_activity_multiplexer_push
+gpu_operation_multiplexer_push
 (
  gpu_activity_channel_t *initiator_channel,
  gpu_activity_t *gpu_activity
 )
 {
-  if (gpu_activity_multiplexer_my_channel_initialized() == false) {
-    gpu_activity_multiplexer_my_channel_init();
+  if (gpu_operation_multiplexer_my_channel_initialized() == false) {
+    gpu_operation_multiplexer_my_channel_init();
   }
 
   gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
diff --git a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.h
similarity index 93%
rename from src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
rename to src/tool/hpcrun/gpu/gpu-operation-multiplexer.h
index cbd6db1136..7dacaf2fb3 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-multiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.h
@@ -42,8 +42,8 @@
 // ******************************************************* EndRiceCopyright *
 
 
-#ifndef gpu_activity_multiplexer_h
-#define gpu_activity_multiplexer_h
+#ifndef gpu_operation_multiplexer_h
+#define gpu_operation_multiplexer_h
 
 #include <hpcrun/thread_data.h>
 #include "gpu-operation-channel.h"
@@ -70,47 +70,32 @@ typedef struct gpu_activity_t gpu_activity_t;
 
 
 bool
-gpu_activity_multiplexer_my_channel_initialized
+gpu_operation_multiplexer_my_channel_initialized
 (
  void
 );
 
 
 void
-gpu_activity_multiplexer_my_channel_init
+gpu_operation_multiplexer_my_channel_init
 (
  void
 );
 
 
 void
-gpu_activity_multiplexer_fini
+gpu_operation_multiplexer_fini
 (
  void
 );
 
 
 void
-gpu_activity_multiplexer_push
+gpu_operation_multiplexer_push
 (
  gpu_activity_channel_t *initiator_channel,
  gpu_activity_t *gpu_activity
 );
 
 
-
 #endif
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index a816a3228c..7c3c3467f8 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -61,7 +61,7 @@
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-activity-process.h>
-#include <hpcrun/gpu/gpu-activity-multiplexer.h>
+#include <hpcrun/gpu/gpu-operation-multiplexer.h>
 #include <hpcrun/gpu/gpu-correlation-channel.h>
 #include <hpcrun/gpu/gpu-correlation-id-map.h>
 #include <hpcrun/gpu/gpu-application-thread-api.h>
@@ -495,11 +495,11 @@ opencl_activity_multiplexer_push
   gpu_activity.kind = GPU_ACTIVITY_EXTERNAL_CORRELATION;
   gpu_activity.details.correlation.correlation_id = correlation_id;
   gpu_activity.details.correlation.host_correlation_id = correlation_id;
-  gpu_activity_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
+  gpu_operation_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
   
   // The actual entry
   opencl_activity_translate(&gpu_activity, obj, interval);
-  gpu_activity_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
+  gpu_operation_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
 }
 
 
@@ -538,11 +538,8 @@ opencl_clSetKernelArg_activity_process
 
   opencl_activity_translate(&gpu_activity, cb_data, interval);
   
-  if (gpu_activity_multiplexer_my_channel_initialized() == false){
-    gpu_activity_multiplexer_my_channel_init();
-  }
   ETMSG(OPENCL, "cb_data->details.initiator_channel: %p", cb_data->details.initiator_channel);
-  gpu_activity_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  gpu_operation_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
 }
 
 
@@ -1313,7 +1310,7 @@ opencl_api_thread_finalize
 
     gpu_activity.kind = GPU_ACTIVITY_FLUSH;
     gpu_activity.details.flush.wait = &wait;
-    gpu_activity_multiplexer_push(gpu_activity_channel_get(), &gpu_activity);
+    gpu_operation_multiplexer_push(gpu_activity_channel_get(), &gpu_activity);
 
     // Wait until the activity is flushed
     // Operation channel is FIFO
@@ -1331,5 +1328,5 @@ opencl_api_process_finalize
  void *args
 )
 {
-  gpu_activity_multiplexer_fini();
+  gpu_operation_multiplexer_fini();
 }

From 6a68e40f90768389d0e56225f3291e3ae221a6aa Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 15 Oct 2020 02:21:21 +0000
Subject: [PATCH 103/177] Flush all activities to activity channels before a
 thread exits

---
 .../hpcrun/gpu/gpu-operation-item-process.c   |  4 ++
 src/tool/hpcrun/gpu/gpu-operation-item.h      |  1 +
 .../hpcrun/gpu/gpu-operation-multiplexer.c    |  4 +-
 .../hpcrun/gpu/gpu-operation-multiplexer.h    |  1 +
 src/tool/hpcrun/gpu/opencl/opencl-api.c       | 62 +++++++++----------
 5 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-operation-item-process.c b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
index 2b11b42501..0ebb1f91ec 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item-process.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-item-process.c
@@ -210,5 +210,9 @@ gpu_operation_item_process
       gpu_unknown_process(it);
       break;
   }
+
+  if (it->pending_operations) {
+    atomic_fetch_add(it->pending_operations, -1);
+  }
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-item.h b/src/tool/hpcrun/gpu/gpu-operation-item.h
index db5a528bb4..4439b95a24 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-item.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-item.h
@@ -81,6 +81,7 @@ typedef struct gpu_operation_item_t {
   s_element_t next;
   gpu_activity_channel_t *channel;
   gpu_activity_t activity;
+  atomic_uint *pending_operations;
   atomic_bool *flush;
 } gpu_operation_item_t;
 
diff --git a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
index 821476ede6..9c21f7ec08 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
@@ -185,6 +185,7 @@ void
 gpu_operation_multiplexer_push
 (
  gpu_activity_channel_t *initiator_channel,
+ atomic_uint *initiator_pending_operations,
  gpu_activity_t *gpu_activity
 )
 {
@@ -192,6 +193,7 @@ gpu_operation_multiplexer_push
     gpu_operation_multiplexer_my_channel_init();
   }
 
-  gpu_operation_item_t item = (gpu_operation_item_t){.channel=initiator_channel, .activity=*gpu_activity};
+  gpu_operation_item_t item = {.channel=initiator_channel,
+    .pending_operations=initiator_pending_operations, .activity=*gpu_activity};
   gpu_operation_channel_produce(gpu_operation_channel, &item);
 }
diff --git a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.h b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.h
index 7dacaf2fb3..e261d70d83 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.h
+++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.h
@@ -94,6 +94,7 @@ void
 gpu_operation_multiplexer_push
 (
  gpu_activity_channel_t *initiator_channel,
+ atomic_uint *initiator_pending_operations,
  gpu_activity_t *gpu_activity
 );
 
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 7c3c3467f8..81b8aaed12 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -153,7 +153,7 @@ static atomic_uint opencl_pending_operations = { 0 };
 static __thread atomic_int opencl_self_pending_operations = { 0 };
 // Mark if a thread has invoked any opencl call
 // If yes, we can flush all opencl activities when the thread terminates
-static __thread bool opencl_stop_flag = false;
+static __thread bool opencl_api_flag = false;
 
 static spinlock_t opencl_h2d_lock = SPINLOCK_UNLOCKED;
 static bool instrumentation = false;
@@ -481,7 +481,7 @@ opencl_h2d_pending_operations_adjust
 
 
 static void
-opencl_activity_multiplexer_push
+opencl_operation_multiplexer_push
 (
  gpu_interval_t interval,
  opencl_object_t *obj,
@@ -495,11 +495,13 @@ opencl_activity_multiplexer_push
   gpu_activity.kind = GPU_ACTIVITY_EXTERNAL_CORRELATION;
   gpu_activity.details.correlation.correlation_id = correlation_id;
   gpu_activity.details.correlation.host_correlation_id = correlation_id;
-  gpu_operation_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
+  gpu_operation_multiplexer_push(obj->details.initiator_channel,
+    NULL, &gpu_activity);
   
   // The actual entry
   opencl_activity_translate(&gpu_activity, obj, interval);
-  gpu_operation_multiplexer_push(obj->details.initiator_channel, &gpu_activity);
+  gpu_operation_multiplexer_push(obj->details.initiator_channel, 
+    obj->pending_operations, &gpu_activity);
 }
 
 
@@ -515,7 +517,7 @@ opencl_activity_process
   memset(&interval, 0, sizeof(gpu_interval_t));
   opencl_timing_info_get(&interval, event);
 
-  opencl_activity_multiplexer_push(interval, obj, correlation_id);
+  opencl_operation_multiplexer_push(interval, obj, correlation_id);
 }
 
 
@@ -539,7 +541,8 @@ opencl_clSetKernelArg_activity_process
   opencl_activity_translate(&gpu_activity, cb_data, interval);
   
   ETMSG(OPENCL, "cb_data->details.initiator_channel: %p", cb_data->details.initiator_channel);
-  gpu_operation_multiplexer_push(cb_data->details.initiator_channel, &gpu_activity);
+  gpu_operation_multiplexer_push(cb_data->details.initiator_channel,
+    cb_data->pending_operations, &gpu_activity);
 }
 
 
@@ -694,33 +697,32 @@ opencl_cb_basic_print
 void
 opencl_subscriber_callback
 (
- opencl_object_t *cb_info
+ opencl_object_t *obj
 )
 {
   // We invoked an opencl operation
-  opencl_stop_flag = true;
+  opencl_api_flag = true;
 
   uint32_t correlation_id = getCorrelationId();
 
   // Init operations
-  atomic_fetch_add(cb_info->pending_operations, 1);
-  atomic_fetch_add(&opencl_pending_operations, 1);
+  atomic_fetch_add(obj->pending_operations, 1);
 
   gpu_placeholder_type_t placeholder_type;
   gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
 
-  switch (cb_info->kind) {
+  switch (obj->kind) {
 
     case GPU_ACTIVITY_MEMCPY:
       {
-        cb_info->details.cpy_cb.correlation_id = correlation_id;
-        if (cb_info->details.cpy_cb.type == GPU_MEMCPY_H2D){ 
+        obj->details.cpy_cb.correlation_id = correlation_id;
+        if (obj->details.cpy_cb.type == GPU_MEMCPY_H2D){ 
           gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
             gpu_placeholder_type_copyin);
 
           placeholder_type = gpu_placeholder_type_copyin;
 
-        } else if (cb_info->details.cpy_cb.type == GPU_MEMCPY_D2H){
+        } else if (obj->details.cpy_cb.type == GPU_MEMCPY_D2H){
           gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
             gpu_placeholder_type_copyout);
 
@@ -731,7 +733,7 @@ opencl_subscriber_callback
 
     case GPU_ACTIVITY_KERNEL:
       {
-        cb_info->details.ker_cb.correlation_id = correlation_id;
+        obj->details.ker_cb.correlation_id = correlation_id;
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, 
           gpu_placeholder_type_kernel);
 
@@ -745,7 +747,7 @@ opencl_subscriber_callback
 
     case GPU_ACTIVITY_MEMORY:
       {
-        cb_info->details.mem_cb.correlation_id = correlation_id;
+        obj->details.mem_cb.correlation_id = correlation_id;
         gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
           gpu_placeholder_type_alloc);
 
@@ -768,11 +770,11 @@ opencl_subscriber_callback
 
   gpu_activity_channel_consume(gpu_metrics_attribute);
 
-  cb_info->details.cct_node = cct_ph;
-  cb_info->details.initiator_channel = gpu_activity_channel_get();
-  cb_info->details.submit_time = CPU_NANOTIME();
+  obj->details.cct_node = cct_ph;
+  obj->details.initiator_channel = gpu_activity_channel_get();
+  obj->details.submit_time = CPU_NANOTIME();
 
-  if (cb_info->kind == GPU_ACTIVITY_KERNEL && instrumentation) {
+  if (obj->kind == GPU_ACTIVITY_KERNEL && instrumentation) {
     // Callback to produce gtpin correlation
     gtpin_produce_runtime_callstack(&gpu_op_ccts);
   }
@@ -799,9 +801,6 @@ opencl_activity_completion_callback
   }
 
   // Finish operations
-  atomic_fetch_add(cb_data->pending_operations, -1);
-  atomic_fetch_add(&opencl_pending_operations, -1);
-
   opencl_free(cb_data);
 }
 
@@ -1258,10 +1257,7 @@ clCreateBuffer
     HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
   interval.end = CPU_NANOTIME();
 
-  opencl_activity_multiplexer_push(interval, mem_info, mem_info->details.mem_cb.correlation_id);
-
-  atomic_fetch_add(&opencl_pending_operations, -1);
-  atomic_fetch_add(&opencl_self_pending_operations, -1);
+  opencl_operation_multiplexer_push(interval, mem_info, mem_info->details.mem_cb.correlation_id);
 
   opencl_free(mem_info);
   
@@ -1298,10 +1294,9 @@ opencl_api_thread_finalize
  void *args
 )
 {
-  if (opencl_stop_flag) {
+  if (opencl_api_flag) {
     // If I have invoked any opencl api, I have to attribute all my activities to my ccts
-    opencl_stop_flag = false;
-    opencl_wait_for_self_pending_operations();
+    opencl_api_flag = false;
 
     atomic_bool wait;
     atomic_store(&wait, true);
@@ -1310,12 +1305,15 @@ opencl_api_thread_finalize
 
     gpu_activity.kind = GPU_ACTIVITY_FLUSH;
     gpu_activity.details.flush.wait = &wait;
-    gpu_operation_multiplexer_push(gpu_activity_channel_get(), &gpu_activity);
+    gpu_operation_multiplexer_push(gpu_activity_channel_get(), NULL, &gpu_activity);
 
-    // Wait until the activity is flushed
+    // Wait until operations are drained
     // Operation channel is FIFO
     while (atomic_load(&wait)) {}
 
+    // Wait until my activities are drained
+    opencl_wait_for_self_pending_operations();
+
     // Now I can attribute activities
     gpu_application_thread_process_activities();
   }

From 3ab858be964b18c8718bb1c0adb2c0f86c532290 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Thu, 15 Oct 2020 16:14:09 -0500
Subject: [PATCH 104/177] printf refactoring

---
 src/tool/hpcrun/gpu/nvidia/cupti-api.c  |  8 ++--
 src/tool/hpcrun/sample-sources/papi-c.c | 55 ++++++++++++++-----------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 367c763e96..85ffbf04f4 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -112,9 +112,11 @@
 //******************************************************************************
 // macros
 //******************************************************************************
-#define DEBUG 1
+
+#define DEBUG 0
 #include <hpcrun/gpu/gpu-print.h>
 
+
 #define CUPTI_LIBRARY_LOCATION "/lib64/libcupti.so"
 #define CUPTI_PATH_FROM_CUDA "extras/CUPTI"
 
@@ -800,7 +802,7 @@ cupti_subscriber_callback
     cupti_stop_flag_set();
 
     const CUpti_CallbackData *cd = (const CUpti_CallbackData *) cb_info;
-		printf("\nDriver API:  -----------------%s\n", cd->functionName );
+		PRINT("\nDriver API:  -----------------%s\n", cd->functionName );
 
     bool ompt_runtime_api_flag = ompt_runtime_status_get();
 
@@ -1015,7 +1017,7 @@ cupti_subscriber_callback
     cupti_stop_flag_set();
 
     const CUpti_CallbackData *cd = (const CUpti_CallbackData *)cb_info;
-		printf("\nRuntime API:  -----------------%s\n", cd->functionName );
+		PRINT("\nRuntime API:  -----------------%s\n", cd->functionName );
 
     bool is_valid_op = false;
     bool is_kernel_op __attribute__((unused)) = false; // used only by PRINT when debugging
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 45decffc0d..ad3d48ef21 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -99,17 +99,22 @@
 #include <lush/lush-backtrace.h>
 #include <lib/prof-lean/hpcrun-fmt.h>
 
+#include "papi-c.h"
 #include "tool_state.h"
 
+
 /******************************************************************************
  * macros
  *****************************************************************************/
 
+#define DEBUG 0
+
+#include <hpcrun/gpu/gpu-print.h>
+
 #define OVERFLOW_MODE 0
 #define WEIGHT_METRIC 0
 #define DEFAULT_THRESHOLD  2000000L
 
-#include "papi-c.h"
 
 /******************************************************************************
  * forward declarations 
@@ -754,7 +759,7 @@ METHOD_FN(display_events)
   int num_components, cidx;
 
   if (papi_unavail) {
-    printf("PAPI is not available.  Probably, the kernel doesn't support PAPI,\n"
+    printf("PAPI is not available.  Probably, the kernel doesn't support PAPI,\n"
      "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
     goto finish;
   }
@@ -762,11 +767,11 @@ METHOD_FN(display_events)
   cidx = 0; // CPU component
   {
     const PAPI_component_info_t *component = PAPI_get_component_info(cidx);
-    printf("===========================================================================\n");
-    printf("Available PAPI preset events in component %s\n", component->name);
-    printf("\n");
-    printf("Name\t    Profilable\tDescription\n");
-    printf("===========================================================================\n");
+    printf("===========================================================================\n");
+    printf("Available PAPI preset events in component %s\n", component->name);
+    printf("\n");
+    printf("Name\t    Profilable\tDescription\n");
+    printf("===========================================================================\n");
 
     num_total = 0;
     num_prof = 0;
@@ -783,13 +788,13 @@ METHOD_FN(display_events)
     num_prof++;
   }
   num_total++;
-  printf("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr);
+  printf("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr);
       }
       ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx);
     }
-    printf("---------------------------------------------------------------------------\n");
-    printf("Total PAPI events: %d, able to profile: %d\n", num_total, num_prof);
-    printf("\n\n");
+    printf("---------------------------------------------------------------------------\n");
+    printf("Total PAPI events: %d, able to profile: %d\n", num_total, num_prof);
+    printf("\n\n");
   }
 
   num_components = PAPI_num_components();
@@ -799,11 +804,11 @@ METHOD_FN(display_events)
 
     if (component->disabled) continue;
 
-    printf("===========================================================================\n");
-    printf("Native events in component %s\n", component->name);
-    printf("\n");
-    printf("Name  Description\n");
-    printf("===========================================================================\n");
+    printf("===========================================================================\n");
+    printf("Native events in component %s\n", component->name);
+    printf("\n");
+    printf("Name  Description\n");
+    printf("===========================================================================\n");
 
     ev = 0 | PAPI_NATIVE_MASK;
     ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_FIRST, cidx);
@@ -812,17 +817,17 @@ METHOD_FN(display_events)
       if (PAPI_get_event_info(ev, &info) == PAPI_OK) {
   cmp_event_count++;
         display_event_info(stdout, info.symbol, info.long_descr);
-        printf("---------------------------------------------------------------------------\n");
+        printf("---------------------------------------------------------------------------\n");
       }
       ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx);
     }
-    printf("Total native events for component %s: %d\n", component->name, cmp_event_count);
-    printf("\n\n");
+    printf("Total native events for component %s: %d\n", component->name, cmp_event_count);
+    printf("\n\n");
     num_total += cmp_event_count;
   }
 
-  printf( "Total events reported: %d\n", num_total);
-  printf("\n\n");
+  printf( "Total events reported: %d\n", num_total);
+  printf("\n\n");
 finish:
   tool_exit();
 }
@@ -1043,7 +1048,7 @@ papi_monitor_enter(void *reg_info, void *args_in)
   sample_source_t *self = &obj_name(); /// just for debug
   int ret;
 
-  printf("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
+  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
@@ -1057,7 +1062,7 @@ papi_monitor_enter(void *reg_info, void *args_in)
   for (int cid = 0; cid < psi->num_components; ++cid) {
     papi_component_info_t *ci = &(psi->component_info[cid]);
     if (ci->inUse) {
-      printf("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
+      PRINT("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
 
       ret = PAPI_read(ci->eventSet, prev_values);
       //      ret = PAPI_start(ci->eventSet);
@@ -1088,7 +1093,7 @@ papi_monitor_exit(void *reg_info, void *args_in)
   int my_event_count = MAX_EVENTS;
   int ret;
 
-  printf("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
+  PRINT("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
 
   if (args->gpu_sync_ptr)
     args->gpu_sync_ptr();
@@ -1118,7 +1123,7 @@ papi_monitor_exit(void *reg_info, void *args_in)
         int event_index = get_event_index(self, my_event_codes[eid]);
         int metric_id = hpcrun_event2metric(self, event_index);
 
-        printf("%d Event = %x, event_index = %d, metric_id = %d || value = %llu ---> %llu\n",
+        PRINT("%d Event = %x, event_index = %d, metric_id = %d || value = %llu ---> %llu\n",
                eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
 
         blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);

From 1975715bd97e5d1de7add94198df6eba292e6142 Mon Sep 17 00:00:00 2001
From: Aaron Cherian <atc8@iris.cs.rice.edu>
Date: Thu, 15 Oct 2020 18:18:02 -0500
Subject: [PATCH 105/177] merge fixes with develop

---
 src/tool/hpcrun/gpu/gpu-trace.c | 33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 51921fa917..55f09af6d3 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -309,7 +309,7 @@ gpu_trace_stream_release
 
   hpcrun_write_profile_data(&td->core_profile_trace_data);
   hpcrun_trace_close(&td->core_profile_trace_data);
-  atomic_fetch_add(&stream_counter, -1);
+  atomic_fetch_add(&active_streams_counter, -1);
 
 }
 
@@ -462,34 +462,3 @@ consume_one_trace_item
     PRINT("%p Append trace activity [%lu, %lu]\n", td, start, end);
   }
 }
-
-
-thread_data_t *
-gpu_trace_stream_acquire
-(
- void
-)
-{
-  thread_data_t *td = NULL;
-
-  int id = gpu_trace_stream_id();
-
-  // XXX(Keren): This API calls allocate_and_init_thread_data to bind td with the current thread
-  hpcrun_threadMgr_data_get_safe(id, NULL, &td, true);
-
-  return td;
-}
-
-
-void
-gpu_trace_stream_release
-(
- gpu_trace_channel_t *channel
-)
-{
-  thread_data_t *td = gpu_trace_channel_get_td(channel);
-
-  hpcrun_write_profile_data(&td->core_profile_trace_data);
-  hpcrun_trace_close(&td->core_profile_trace_data);
-  atomic_fetch_add(&active_streams_counter, -1);
-}

From 79abee8607d8445ed739111cf2c7815b31c2a82a Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Fri, 16 Oct 2020 18:41:26 +0000
Subject: [PATCH 106/177] Use operation channel to attribute gtpin activities

---
 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 37d539c807..e2a5cf2e3a 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -70,6 +70,7 @@
 #include <hpcrun/gpu/gpu-application-thread-api.h>
 #include <hpcrun/gpu/gpu-correlation.h>
 #include <hpcrun/gpu/gpu-correlation-channel.h>
+#include <hpcrun/gpu/gpu-operation-multiplexer.h>
 #include <hpcrun/gpu/gpu-host-correlation-map.h>
 #include <hpcrun/gpu/gpu-op-placeholders.h>
 #include <hpcrun/gpu/gpu-metrics.h>
@@ -327,7 +328,7 @@ kernelBlockActivityProcess
   cct_node_t *cct_child = hpcrun_cct_insert_ip_norm(host_op_node, ip); // how to set the ip_norm
   if (cct_child) {
     ga.cct_node = cct_child;
-    gpu_activity_channel_produce(activity_channel, &ga);
+    gpu_operation_multiplexer_push(activity_channel, NULL, &ga);
   }
 }
 
@@ -509,8 +510,6 @@ gtpin_enable_profiling
   }
 #endif
 
-  gpu_metrics_GPU_INST_enable();
-
   // Use opencl/level zero runtime stack
   gtpin_use_runtime_callstack = true;
 

From ac0fbe0d88a7025cfbd11b67cc3fc803eca2b520 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 22 Oct 2020 04:44:20 +0000
Subject: [PATCH 107/177] Adjust gtpin callstack to handle both runtime first
 and gtpin first cases

---
 .../instrumentation/gtpin-instrumentation.c   | 26 ++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index e2a5cf2e3a..263047c2e6 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -100,6 +100,8 @@ static bool gtpin_use_runtime_callstack = false;
 
 static __thread uint64_t gtpin_correlation_id = 0;
 static __thread uint64_t gtpin_cpu_submit_time = 0;
+static __thread gpu_op_ccts_t gtpin_gpu_op_ccts;
+static __thread bool gtpin_first = true;
 
 //******************************************************************************
 // private operations
@@ -153,8 +155,15 @@ createKernelNode
   if (gtpin_use_runtime_callstack) {
     // XXX(Keren): gtpin's call stack is a mass, better to use opencl's call path
     // onKernelRun->clEnqueueNDRangeKernel_wrapper->opencl_subscriber_callback
-    gtpin_correlation_id = correlation_id;
-    gtpin_cpu_submit_time = cpu_submit_time;
+    if (gtpin_first) {
+      // gtpin callback->runtime callback
+      gtpin_correlation_id = correlation_id;
+      gtpin_cpu_submit_time = cpu_submit_time;
+    } else {
+      // runtime callback->gtpin callback
+      gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+      gtpin_correlation_id_map_insert(correlation_id, &gtpin_gpu_op_ccts, activity_channel, cpu_submit_time);
+    }
   } else {
     cct_node_t *api_node = gpu_application_thread_correlation_callback(correlation_id);
 
@@ -528,7 +537,16 @@ gtpin_produce_runtime_callstack
 )
 {
   if (gtpin_use_runtime_callstack) {
-    gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
-    gtpin_correlation_id_map_insert(gtpin_correlation_id, gpu_op_ccts, activity_channel, gtpin_cpu_submit_time);
+    if (gtpin_correlation_id != 0) {
+      // gtpin callback->opencl callback
+      gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+      gtpin_correlation_id_map_insert(gtpin_correlation_id, gpu_op_ccts, activity_channel, gtpin_cpu_submit_time);
+      gtpin_correlation_id = 0;
+      gtpin_first = true;
+    } else {
+      // opencl callback->gtpin callback;
+      gtpin_gpu_op_ccts = *gpu_op_ccts;      
+      gtpin_first = false;
+    }
   }
 }

From 4b53905646f028be9634e686d0cda66de6e59543 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Thu, 22 Oct 2020 22:32:41 +0000
Subject: [PATCH 108/177] Remove kernel suffix as gtpin writes an ELF per kernel

---
 src/lib/binutils/intel/IntelGPUBinutils.cpp          | 10 ++++------
 .../gpu/instrumentation/gtpin-instrumentation.c      | 12 +++---------
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index 20df3222c0..7de38d8cc9 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -152,24 +152,22 @@ findIntelGPUBins
     const SKernelDebugDataHeaderIGC *kernel_header =
       reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
     ptr += sizeof(SKernelDebugDataHeaderIGC);
-    std::string kernel_name(ptr);
 
     unsigned kernel_name_size_aligned = sizeof(uint32_t) *
       (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
     ptr += kernel_name_size_aligned;
 
     if (kernel_header->SizeVisaDbgInBytes > 0) {
-      std::stringstream ss;
-      ss << file_name << "." << kernel_name;
+      std::string kernel_name = file_name + ".kernel";
 
       size_t kernel_size = kernel_header->SizeVisaDbgInBytes;
       char *kernel_buffer = (char *)malloc(kernel_size);
       memcpy(kernel_buffer, ptr, kernel_size);
 
       auto elf_file = new ElfFile;
-      if (elf_file->open(kernel_buffer, kernel_size, ss.str())) {
-        // TODO(Keren): Dump binaries or not?
-        FILE *fptr = fopen(ss.str().c_str(), "wb");
+      if (elf_file->open(kernel_buffer, kernel_size, kernel_name)) {
+        // XXX(Keren): Since we are using gtpin, each elf correponds to a single kernel
+        FILE *fptr = fopen(kernel_name.c_str(), "wb");
         fwrite(kernel_buffer, sizeof(char), kernel_size, fptr);
         fclose(fptr);
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 263047c2e6..3c2c50c9a4 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -90,6 +90,7 @@
 //******************************************************************************
 
 #define MAX_STR_SIZE 1024
+#define KERNEL_SUFFIX ".kernel"
 
 // TODO(Aaron): Why there are so many correlation ids
 static atomic_ullong correlation_id;
@@ -243,14 +244,8 @@ findOrAddKernelModule
  GTPinKernel kernel
 )
 {
-  char kernel_name[MAX_STR_SIZE];
-  GTPINTOOL_STATUS status;
-
-  status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
-  assert(status == GTPINTOOL_STATUS_SUCCESS);
-
   uint32_t kernel_elf_size = 0;
-  status = GTPin_GetElf(kernel, 0, NULL, &kernel_elf_size);
+  GTPINTOOL_STATUS status = GTPin_GetElf(kernel, 0, NULL, &kernel_elf_size);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
 
   char *kernel_elf = (char *)malloc(sizeof(char) * kernel_elf_size);
@@ -269,8 +264,7 @@ findOrAddKernelModule
 
   free(kernel_elf);
 
-  strncat(file_name, ".", 1);
-  strncat(file_name, kernel_name, strlen(kernel_name));
+  strncat(file_name, KERNEL_SUFFIX, strlen(KERNEL_SUFFIX));
 
   uint32_t module_id = 0;
 

From abc11e0562a654cecb37f19f5cddb96df15d2e15 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Fri, 23 Oct 2020 05:11:32 +0000
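Subject: [PATCH 109/177] Fix bugs

Decrement active_streams_counter (not stream_counter) when a GPU trace
stream is released, and build gtpin binary paths from
GPU_BINARY_DIRECTORY and GPU_BINARY_SUFFIX instead of hard-coded strings.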
---
 src/tool/hpcrun/gpu/gpu-trace.c                             | 2 +-
 src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 8084aa0384..aad8497de2 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -309,7 +309,7 @@ gpu_trace_stream_release
 
   hpcrun_write_profile_data(&td->core_profile_trace_data);
   hpcrun_trace_close(&td->core_profile_trace_data);
-  atomic_fetch_add(&stream_counter, -1);
+  atomic_fetch_add(&active_streams_counter, -1);
 
 }
 
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 3c2c50c9a4..569bfea1ac 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -61,6 +61,7 @@
 // local includes
 //******************************************************************************
 
+#include <include/gpu-binary.h>
 #include <hpcrun/safe-sampling.h>
 #include <hpcrun/cct/cct.h>
 #include <hpcrun/memory/hpcrun-malloc.h>
@@ -229,12 +230,12 @@ computeBinaryHash
   size_t i;
   size_t used = 0;
   used += sprintf(&file_name[used], "%s", hpcrun_files_output_directory());
-  used += sprintf(&file_name[used], "%s", "/intel/");
+  used += sprintf(&file_name[used], "%s", "/" GPU_BINARY_DIRECTORY "/");
   mkdir(file_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
   for (i = 0; i < HASH_LENGTH; ++i) {
     used += sprintf(&file_name[used], "%02x", hash[i]);
   }
-  used += sprintf(&file_name[used], "%s", ".gpubin");
+  used += sprintf(&file_name[used], "%s", GPU_BINARY_SUFFIX);
 }
 
 

From 5399efd74ac11edf64214d0607b8ea2278d1aa63 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Fri, 23 Oct 2020 19:13:30 +0000
Subject: [PATCH 110/177] Do not write GPU ELF files to disk

---
 src/lib/banal/gpu/ReadIntelCFG.cpp                | 15 ++++++++++++---
 src/lib/binutils/intel/IntelGPUBinutils.cpp       | 10 +++-------
 .../gpu/instrumentation/gtpin-instrumentation.c   | 11 +++++++++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
index d5ea08aa1f..f515929e32 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -126,7 +126,7 @@ addCustomFunctionObject
 
 
 static std::string
-getFileNameFromAbsolutePath(const std::string &str) {
+getFunctionNameFromAbsolutePath(const std::string &str) {
   // TODO(Aaron): you can just find the last "/" and grab "/" to the end
   std::vector<std::string> tokens; 
   std::stringstream str_stream(str); 
@@ -136,7 +136,16 @@ getFileNameFromAbsolutePath(const std::string &str) {
   while(std::getline(str_stream, intermediate, '/')) { 
     tokens.push_back(intermediate); 
   } 
-  return tokens[tokens.size() - 1];
+
+  std::string file_name = tokens[tokens.size() - 1];
+  std::string function_name;
+  // xxx.gpubin.function_name
+  auto pos = file_name.rfind(".");
+  if (pos != std::string::npos) {
+    function_name = file_name.substr(pos + 1);
+  }
+
+  return function_name;
 }
 
 
@@ -243,7 +252,7 @@ readIntelCFG
 )
 {
   if (cfg_wanted) {
-    auto function_name = getFileNameFromAbsolutePath(elfFile->getFileName());
+    auto function_name = getFunctionNameFromAbsolutePath(elfFile->getFileName());
     addCustomFunctionObject(function_name, the_symtab); //adds a dummy function object
 
     char *text_section = NULL;
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index 7de38d8cc9..3c06a27902 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -152,25 +152,21 @@ findIntelGPUBins
     const SKernelDebugDataHeaderIGC *kernel_header =
       reinterpret_cast<const SKernelDebugDataHeaderIGC*>(ptr);
     ptr += sizeof(SKernelDebugDataHeaderIGC);
+    std::string kernel_name(ptr);
 
     unsigned kernel_name_size_aligned = sizeof(uint32_t) *
       (1 + (kernel_header->KernelNameSize - 1) / sizeof(uint32_t));
     ptr += kernel_name_size_aligned;
 
     if (kernel_header->SizeVisaDbgInBytes > 0) {
-      std::string kernel_name = file_name + ".kernel";
+      std::string real_kernel_name = file_name + "." + kernel_name;
 
       size_t kernel_size = kernel_header->SizeVisaDbgInBytes;
       char *kernel_buffer = (char *)malloc(kernel_size);
       memcpy(kernel_buffer, ptr, kernel_size);
 
       auto elf_file = new ElfFile;
-      if (elf_file->open(kernel_buffer, kernel_size, kernel_name)) {
-        // XXX(Keren): Since we are using gtpin, each elf correponds to a single kernel
-        FILE *fptr = fopen(kernel_name.c_str(), "wb");
-        fwrite(kernel_buffer, sizeof(char), kernel_size, fptr);
-        fclose(fptr);
-
+      if (elf_file->open(kernel_buffer, kernel_size, real_kernel_name)) {
         elf_file->setIntelGPUFile(true);
         filevector->push_back(elf_file);
       } else {
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 569bfea1ac..258e72f439 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -245,8 +245,14 @@ findOrAddKernelModule
  GTPinKernel kernel
 )
 {
+  char kernel_name[MAX_STR_SIZE];
+  GTPINTOOL_STATUS status;
+
+  status = GTPin_KernelGetName(kernel, MAX_STR_SIZE, kernel_name, NULL);
+  assert(status == GTPINTOOL_STATUS_SUCCESS);
+
   uint32_t kernel_elf_size = 0;
-  GTPINTOOL_STATUS status = GTPin_GetElf(kernel, 0, NULL, &kernel_elf_size);
+  status = GTPin_GetElf(kernel, 0, NULL, &kernel_elf_size);
   assert(status == GTPINTOOL_STATUS_SUCCESS);
 
   char *kernel_elf = (char *)malloc(sizeof(char) * kernel_elf_size);
@@ -265,7 +271,8 @@ findOrAddKernelModule
 
   free(kernel_elf);
 
-  strncat(file_name, KERNEL_SUFFIX, strlen(KERNEL_SUFFIX));
+  strncat(file_name, ".", 1);
+  strncat(file_name, kernel_name, strlen(kernel_name));
 
   uint32_t module_id = 0;
 

From 1a57f5477531cb5ba6deb5de89828bc16dd39640 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Fri, 23 Oct 2020 19:14:21 +0000
Subject: [PATCH 111/177] Use ls -s to get file size; change size unit from
 byte to kb

---
 src/tool/hpcstruct/Args.cpp            | 6 +++---
 src/tool/hpcstruct/gpubin-analysis.txt | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/tool/hpcstruct/Args.cpp b/src/tool/hpcstruct/Args.cpp
index d054617e4d..a7b8450657 100644
--- a/src/tool/hpcstruct/Args.cpp
+++ b/src/tool/hpcstruct/Args.cpp
@@ -89,8 +89,8 @@ using std::string;
 //***************************************************************************
 
 // Size in bytes for parallel analysis of gpu binaries
-#define DEFAULT_GPU_SIZE   100000000
-#define GPU_SIZE_STR      "100000000"
+#define DEFAULT_GPU_SIZE   100000
+#define GPU_SIZE_STR      "100000"
 
 static const char* version_info = HPCTOOLKIT_VERSION_STRING;
 
@@ -137,7 +137,7 @@ Options: General\n\
 \n\
 Options: Parallel usage\n\
   -j <num>, --jobs <num>  Use <num> threads for all phases in hpcstruct. {1}\n\
-  --gpu-size <n>       Size (bytes) of a GPU binary that will cause hpcstruct\n\
+  --gpu-size <n>       Size (KB) of a GPU binary that will cause hpcstruct\n\
                        to use <num> threads to analyze a binary in parallel.\n\
                        GPU binaries with fewer than <n> bytes will be analyzed\n\
                        concurrently, <num> at a time.  {" GPU_SIZE_STR "}\n\
diff --git a/src/tool/hpcstruct/gpubin-analysis.txt b/src/tool/hpcstruct/gpubin-analysis.txt
index 24fc86aed9..c1dd11fb9c 100644
--- a/src/tool/hpcstruct/gpubin-analysis.txt
+++ b/src/tool/hpcstruct/gpubin-analysis.txt
@@ -49,7 +49,7 @@ $(STRUCTS_DIR)/%.hpcstruct: $(GPUBIN_DIR)/%
 	@gpubin_name=`basename -s x $<`
 	struct_name=$@
 	warn_name=$(STRUCTS_DIR)/$$gpubin_name.warnings
-	if test `size $< | tail -1 | awk '{ print $$1 }'` -gt $(PAR_SIZE) ; then
+	if test `ls -s $< | tail -1 | awk '{ print $$1 }'` -gt $(PAR_SIZE) ; then
 		if test $(THREADS) -gt 1 ; then
 			echo msg: begin parallel analysis of $$gpubin_name \\($(THREADS) threads\\)
 		else

From 8864f9b041ac2002d4d5d34634e3bee5b3f5ff3b Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 24 Oct 2020 20:28:40 +0000
Subject: [PATCH 112/177] Revert "Use ls -s to get file size; change size unit
 from byte to kb"

This reverts commit 1a57f5477531cb5ba6deb5de89828bc16dd39640.
---
 src/tool/hpcstruct/Args.cpp            | 6 +++---
 src/tool/hpcstruct/gpubin-analysis.txt | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/tool/hpcstruct/Args.cpp b/src/tool/hpcstruct/Args.cpp
index a7b8450657..d054617e4d 100644
--- a/src/tool/hpcstruct/Args.cpp
+++ b/src/tool/hpcstruct/Args.cpp
@@ -89,8 +89,8 @@ using std::string;
 //***************************************************************************
 
 // Size in bytes for parallel analysis of gpu binaries
-#define DEFAULT_GPU_SIZE   100000
-#define GPU_SIZE_STR      "100000"
+#define DEFAULT_GPU_SIZE   100000000
+#define GPU_SIZE_STR      "100000000"
 
 static const char* version_info = HPCTOOLKIT_VERSION_STRING;
 
@@ -137,7 +137,7 @@ Options: General\n\
 \n\
 Options: Parallel usage\n\
   -j <num>, --jobs <num>  Use <num> threads for all phases in hpcstruct. {1}\n\
-  --gpu-size <n>       Size (KB) of a GPU binary that will cause hpcstruct\n\
+  --gpu-size <n>       Size (bytes) of a GPU binary that will cause hpcstruct\n\
                        to use <num> threads to analyze a binary in parallel.\n\
                        GPU binaries with fewer than <n> bytes will be analyzed\n\
                        concurrently, <num> at a time.  {" GPU_SIZE_STR "}\n\
diff --git a/src/tool/hpcstruct/gpubin-analysis.txt b/src/tool/hpcstruct/gpubin-analysis.txt
index c1dd11fb9c..24fc86aed9 100644
--- a/src/tool/hpcstruct/gpubin-analysis.txt
+++ b/src/tool/hpcstruct/gpubin-analysis.txt
@@ -49,7 +49,7 @@ $(STRUCTS_DIR)/%.hpcstruct: $(GPUBIN_DIR)/%
 	@gpubin_name=`basename -s x $<`
 	struct_name=$@
 	warn_name=$(STRUCTS_DIR)/$$gpubin_name.warnings
-	if test `ls -s $< | tail -1 | awk '{ print $$1 }'` -gt $(PAR_SIZE) ; then
+	if test `size $< | tail -1 | awk '{ print $$1 }'` -gt $(PAR_SIZE) ; then
 		if test $(THREADS) -gt 1 ; then
 			echo msg: begin parallel analysis of $$gpubin_name \\($(THREADS) threads\\)
 		else

From c0eb7fffdb03bfa7dd16012ce8eea0f701260567 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 24 Oct 2020 21:56:18 +0000
Subject: [PATCH 113/177] Enable hpcstruct to analyze Intel gpubins in
 parallel: 1. use du instead of size in gpubin-analysis.txt 2. write an ELF
 file before using Dyninst

---
 src/lib/banal/gpu/ReadIntelCFG.cpp            | 30 +++-------------
 src/lib/binutils/ElfHelper.hpp                |  4 +++
 src/lib/binutils/intel/IntelGPUBinutils.cpp   | 35 ++++++++++++++++--
 src/lib/prof-lean/crypto-hash.h               |  9 +++++
 src/tool/hpclump/Makefile.am                  |  1 +
 src/tool/hpclump/Makefile.in                  |  3 +-
 src/tool/hpcprof/Makefile.am                  |  1 +
 src/tool/hpcprof/Makefile.in                  |  4 ++-
 src/tool/hpcproftt/Makefile.am                |  1 +
 src/tool/hpcproftt/Makefile.in                |  4 ++-
 .../instrumentation/gtpin-instrumentation.c   | 36 +++++++++++++------
 src/tool/hpcstruct/Makefile.am                |  1 +
 src/tool/hpcstruct/Makefile.in                |  4 ++-
 src/tool/hpcstruct/gpubin-analysis.txt        |  2 +-
 14 files changed, 92 insertions(+), 43 deletions(-)

diff --git a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
index f515929e32..f822d7c1a3 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -124,31 +124,6 @@ addCustomFunctionObject
   assert(status == true);
 }
 
-
-static std::string
-getFunctionNameFromAbsolutePath(const std::string &str) {
-  // TODO(Aaron): you can just find the last "/" and grab "/" to the end
-  std::vector<std::string> tokens; 
-  std::stringstream str_stream(str); 
-  std::string intermediate; 
-
-  // Tokenizing w.r.t. '/'
-  while(std::getline(str_stream, intermediate, '/')) { 
-    tokens.push_back(intermediate); 
-  } 
-
-  std::string file_name = tokens[tokens.size() - 1];
-  std::string function_name;
-  // xxx.gpubin.function_name
-  auto pos = file_name.rfind(".");
-  if (pos != std::string::npos) {
-    function_name = file_name.substr(pos + 1);
-  }
-
-  return function_name;
-}
-
-
 static void
 parseIntelCFG
 (
@@ -252,12 +227,15 @@ readIntelCFG
 )
 {
   if (cfg_wanted) {
-    auto function_name = getFunctionNameFromAbsolutePath(elfFile->getFileName());
+    auto function_name = elfFile->getGPUKernelName();
     addCustomFunctionObject(function_name, the_symtab); //adds a dummy function object
 
     char *text_section = NULL;
     auto text_section_size = elfFile->getTextSection(&text_section);
     if (text_section_size == 0) {
+      *code_src = new SymtabCodeSource(the_symtab);
+      *code_obj = new CodeObject(*code_src, NULL, NULL, false, true);
+
       return false;
     }
 
diff --git a/src/lib/binutils/ElfHelper.hpp b/src/lib/binutils/ElfHelper.hpp
index 1d8d554629..5ff8dd2500 100644
--- a/src/lib/binutils/ElfHelper.hpp
+++ b/src/lib/binutils/ElfHelper.hpp
@@ -98,6 +98,9 @@ class ElfFile {
   size_t getTextSection(char **text_section);
   bool isIntelGPUFile() { return intelGPU; }
   void setIntelGPUFile(bool _intelGPU) { intelGPU = _intelGPU; }
+  // Intel GPUs have kernel name suffix
+  void setGPUKernelName(const std::string &_gpuKernel) { gpuKernel = _gpuKernel; }
+  std::string getGPUKernelName() { return gpuKernel; }
 private:
   int arch;
   char *origPtr;
@@ -106,6 +109,7 @@ class ElfFile {
   Elf *elf;
   bool intelGPU;
   std::string fileName;
+  std::string gpuKernel;
 };
 
 class ElfFileVector : public std::vector<ElfFile *> {};
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index 3c06a27902..26d950e224 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -55,6 +55,7 @@
 #include <string>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <linux/limits.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <libelf.h>
@@ -68,6 +69,7 @@
 // local includes
 //******************************************************************************
 
+#include <lib/prof-lean/crypto-hash.h>
 #include <lib/binutils/ElfHelper.hpp>
 #include <lib/support/diagnostics.h>
 #include <lib/support/RealPathMgr.cpp>
@@ -126,6 +128,26 @@ opencl_elf_section_type
   }
 }
 
+static size_t
+computeHash
+(
+ const char *mem_ptr,
+ size_t mem_size,
+ char *name
+)
+{
+  // Compute hash for the binary
+  unsigned char hash[HASH_LENGTH];
+  crypto_hash_compute((const unsigned char *)mem_ptr, mem_size, hash, HASH_LENGTH);
+
+  size_t i;
+  size_t used = 0;
+  for (i = 0; i < HASH_LENGTH; ++i) {
+    used += sprintf(&name[used], "%02x", hash[i]);
+  }
+  return used;
+}
+
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -159,15 +181,24 @@ findIntelGPUBins
     ptr += kernel_name_size_aligned;
 
     if (kernel_header->SizeVisaDbgInBytes > 0) {
-      std::string real_kernel_name = file_name + "." + kernel_name;
-
       size_t kernel_size = kernel_header->SizeVisaDbgInBytes;
       char *kernel_buffer = (char *)malloc(kernel_size);
       memcpy(kernel_buffer, ptr, kernel_size);
 
+      // Compute hash for the kernel name
+      char kernel_name_hash[PATH_MAX];
+      computeHash(kernel_name.c_str(), kernel_name.size(), kernel_name_hash);
+
+      std::string real_kernel_name = file_name + "." + std::string((char *)kernel_name_hash);
+
       auto elf_file = new ElfFile;
       if (elf_file->open(kernel_buffer, kernel_size, real_kernel_name)) {
+        FILE *fptr = fopen(real_kernel_name.c_str(), "wb");
+        fwrite(kernel_buffer, sizeof(char), kernel_size, fptr);
+        fclose(fptr);
+
         elf_file->setIntelGPUFile(true);
+        elf_file->setGPUKernelName(kernel_name);
         filevector->push_back(elf_file);
       } else {
         // kernel_buffer is released with elf_file
diff --git a/src/lib/prof-lean/crypto-hash.h b/src/lib/prof-lean/crypto-hash.h
index 62a0684166..1ac6d5f913 100644
--- a/src/lib/prof-lean/crypto-hash.h
+++ b/src/lib/prof-lean/crypto-hash.h
@@ -67,6 +67,11 @@
 // interface operations
 //*****************************************************************************
 
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
 //-----------------------------------------------------------------------------
 // function: 
 //   crypto_hash_compute
@@ -152,4 +157,8 @@ crypto_hash_self_test
   int verbose
 );
 
+#if defined(__cplusplus)
+}
+#endif
+
 #endif
diff --git a/src/tool/hpclump/Makefile.am b/src/tool/hpclump/Makefile.am
index 80ad51ffa2..5caa66e31a 100644
--- a/src/tool/hpclump/Makefile.am
+++ b/src/tool/hpclump/Makefile.am
@@ -110,6 +110,7 @@ MYLDADD = \
 	$(HPCLIB_SupportLean) \
 	$(MY_ELF_DWARF) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	@LZMA_LDFLAGS_STAT@ \
 	@BINUTILS_LIBS@ \
 	@HOST_HPCSTRUCT_LDFLAGS@
diff --git a/src/tool/hpclump/Makefile.in b/src/tool/hpclump/Makefile.in
index 06e70d1f4a..60a6d3885e 100644
--- a/src/tool/hpclump/Makefile.in
+++ b/src/tool/hpclump/Makefile.in
@@ -144,7 +144,7 @@ am__DEPENDENCIES_1 =
 am__DEPENDENCIES_3 = $(HPCLIB_Binutils) $(HPCLIB_ProfLean) \
 	$(HPCLIB_ISA) $(am__DEPENDENCIES_2) $(HPCLIB_Support) \
 	$(HPCLIB_SupportLean) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
 hpclump_DEPENDENCIES = $(am__DEPENDENCIES_3)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -562,6 +562,7 @@ MYLDADD = \
 	$(HPCLIB_SupportLean) \
 	$(MY_ELF_DWARF) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	@LZMA_LDFLAGS_STAT@ \
 	@BINUTILS_LIBS@ \
 	@HOST_HPCSTRUCT_LDFLAGS@
diff --git a/src/tool/hpcprof/Makefile.am b/src/tool/hpcprof/Makefile.am
index 1f00dfdd92..417d729d76 100644
--- a/src/tool/hpcprof/Makefile.am
+++ b/src/tool/hpcprof/Makefile.am
@@ -116,6 +116,7 @@ MYLDADD = \
 	$(HPCLIB_SupportLean) \
 	$(MY_ELF_DWARF) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	@LZMA_LDFLAGS_STAT@ \
 	@XERCES_LDLIBS@ \
 	@XERCES_LDLIBS@ \
diff --git a/src/tool/hpcprof/Makefile.in b/src/tool/hpcprof/Makefile.in
index 3b64d798af..569e748013 100644
--- a/src/tool/hpcprof/Makefile.in
+++ b/src/tool/hpcprof/Makefile.in
@@ -146,7 +146,8 @@ am__DEPENDENCIES_3 = $(HPCLIB_Analysis) $(HPCLIB_Banal_Simple) \
 	$(HPCLIB_ProfXML) $(HPCLIB_Prof) $(HPCLIB_Binutils) \
 	$(HPCLIB_ProfLean) $(HPCLIB_ISA) $(am__DEPENDENCIES_2) \
 	$(HPCLIB_XML) $(HPCLIB_Support) $(HPCLIB_SupportLean) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 hpcprof_bin_DEPENDENCIES = $(am__DEPENDENCIES_3)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -599,6 +600,7 @@ MYLDADD = \
 	$(HPCLIB_SupportLean) \
 	$(MY_ELF_DWARF) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	@LZMA_LDFLAGS_STAT@ \
 	@XERCES_LDLIBS@ \
 	@XERCES_LDLIBS@ \
diff --git a/src/tool/hpcproftt/Makefile.am b/src/tool/hpcproftt/Makefile.am
index bfc7751b20..91ee143e86 100644
--- a/src/tool/hpcproftt/Makefile.am
+++ b/src/tool/hpcproftt/Makefile.am
@@ -115,6 +115,7 @@ MYLDADD = \
 	$(HPCLIB_SupportLean) \
 	$(MY_ELF_DWARF) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	@LZMA_LDFLAGS_STAT@ \
 	@XERCES_LDLIBS@ \
 	@BINUTILS_LIBS@ \
diff --git a/src/tool/hpcproftt/Makefile.in b/src/tool/hpcproftt/Makefile.in
index f89fac0c12..5278e3cd15 100644
--- a/src/tool/hpcproftt/Makefile.in
+++ b/src/tool/hpcproftt/Makefile.in
@@ -148,7 +148,8 @@ am__DEPENDENCIES_3 = $(HPCLIB_Analysis) $(HPCLIB_Banal_Simple) \
 	$(HPCLIB_ProfXML) $(HPCLIB_Prof) $(HPCLIB_Binutils) \
 	$(HPCLIB_ProfLean) $(HPCLIB_ISA) $(am__DEPENDENCIES_2) \
 	$(HPCLIB_XML) $(HPCLIB_Support) $(HPCLIB_SupportLean) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 hpcproftt_bin_DEPENDENCIES = $(am__DEPENDENCIES_3)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -602,6 +603,7 @@ MYLDADD = \
 	$(HPCLIB_SupportLean) \
 	$(MY_ELF_DWARF) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	@LZMA_LDFLAGS_STAT@ \
 	@XERCES_LDLIBS@ \
 	@BINUTILS_LIBS@ \
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index 258e72f439..0e3fb653f1 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -214,8 +214,27 @@ writeBinary
   }
 }
 
+static size_t
+computeHash
+(
+ const char *mem_ptr,
+ size_t mem_size,
+ char *name
+)
+{
+  // Compute hash for mem_ptr with mem_size
+  unsigned char hash[HASH_LENGTH];
+  crypto_hash_compute((const unsigned char *)mem_ptr, mem_size, hash, HASH_LENGTH);
 
-void
+  size_t i;
+  size_t used = 0;
+  for (i = 0; i < HASH_LENGTH; ++i) {
+    used += sprintf(&name[used], "%02x", hash[i]);
+  }
+  return used;
+}
+
+static void
 computeBinaryHash
 (
  const char *binary,
@@ -223,18 +242,11 @@ computeBinaryHash
  char *file_name
 )
 {
-  // Compute hash for the binary
-  unsigned char hash[HASH_LENGTH];
-  crypto_hash_compute((const unsigned char *)binary, binary_size, hash, HASH_LENGTH);
-
-  size_t i;
   size_t used = 0;
   used += sprintf(&file_name[used], "%s", hpcrun_files_output_directory());
   used += sprintf(&file_name[used], "%s", "/" GPU_BINARY_DIRECTORY "/");
   mkdir(file_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-  for (i = 0; i < HASH_LENGTH; ++i) {
-    used += sprintf(&file_name[used], "%02x", hash[i]);
-  }
+  used += computeHash(binary, binary_size, &file_name[used]);
   used += sprintf(&file_name[used], "%s", GPU_BINARY_SUFFIX);
 }
 
@@ -271,8 +283,12 @@ findOrAddKernelModule
 
   free(kernel_elf);
 
+  // Compute hash for the kernel name
+  char kernel_name_hash[PATH_MAX];
+  computeHash(kernel_name, strlen(kernel_name), kernel_name_hash);
+
   strncat(file_name, ".", 1);
-  strncat(file_name, kernel_name, strlen(kernel_name));
+  strncat(file_name, kernel_name_hash, strlen(kernel_name_hash));
 
   uint32_t module_id = 0;
 
diff --git a/src/tool/hpcstruct/Makefile.am b/src/tool/hpcstruct/Makefile.am
index 4cc692ce9f..49f837c8c2 100644
--- a/src/tool/hpcstruct/Makefile.am
+++ b/src/tool/hpcstruct/Makefile.am
@@ -123,6 +123,7 @@ MYLDADD = \
 	@BINUTILS_LIBS@ \
 	$(LZMA_LDFLAGS_DYN) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	$(TBB_LFLAGS)
 
 DOT_LDADD = \
diff --git a/src/tool/hpcstruct/Makefile.in b/src/tool/hpcstruct/Makefile.in
index 5cef49945b..6abcc97f30 100644
--- a/src/tool/hpcstruct/Makefile.in
+++ b/src/tool/hpcstruct/Makefile.in
@@ -165,7 +165,8 @@ am__DEPENDENCIES_4 = $(HPCLIB_Analysis) $(HPCLIB_Banal) \
 	$(HPCLIB_XML) $(HPCLIB_Support) $(HPCLIB_SupportLean) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 hpcstruct_bin_DEPENDENCIES = $(am__DEPENDENCIES_4)
 hpcstruct_bin_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -593,6 +594,7 @@ MYLDADD = \
 	@BINUTILS_LIBS@ \
 	$(LZMA_LDFLAGS_DYN) \
 	$(IGC_LDFLGS) \
+	$(MBEDTLS_LIBS) \
 	$(TBB_LFLAGS)
 
 DOT_LDADD = \
diff --git a/src/tool/hpcstruct/gpubin-analysis.txt b/src/tool/hpcstruct/gpubin-analysis.txt
index 24fc86aed9..aebc00b477 100644
--- a/src/tool/hpcstruct/gpubin-analysis.txt
+++ b/src/tool/hpcstruct/gpubin-analysis.txt
@@ -49,7 +49,7 @@ $(STRUCTS_DIR)/%.hpcstruct: $(GPUBIN_DIR)/%
 	@gpubin_name=`basename -s x $<`
 	struct_name=$@
 	warn_name=$(STRUCTS_DIR)/$$gpubin_name.warnings
-	if test `size $< | tail -1 | awk '{ print $$1 }'` -gt $(PAR_SIZE) ; then
+	if test `du -b $< | tail -1 | awk '{ print $$1 }'` -gt $(PAR_SIZE) ; then
 		if test $(THREADS) -gt 1 ; then
 			echo msg: begin parallel analysis of $$gpubin_name \\($(THREADS) threads\\)
 		else

From 911550719615f59c47e9a47f16b245123f1b04ff Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 24 Oct 2020 21:57:19 +0000
Subject: [PATCH 114/177] Let dyninst return instruction size. Without a dummy
 buffer, dyninst resets instruction size to zero

---
 src/lib/banal/gpu/GPUBlock.cpp | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/lib/banal/gpu/GPUBlock.cpp b/src/lib/banal/gpu/GPUBlock.cpp
index 95f0ad7514..6c452c4326 100644
--- a/src/lib/banal/gpu/GPUBlock.cpp
+++ b/src/lib/banal/gpu/GPUBlock.cpp
@@ -1,6 +1,7 @@
 #include "GPUBlock.hpp"
 #include <Instruction.h>
 
+#define MAX_INST_SIZE 32
 
 namespace Dyninst {
 namespace ParseAPI {
@@ -16,6 +17,8 @@ Address GPUBlock::last() const {
 
 
 void GPUBlock::getInsns(Insns &insns) const {
+  unsigned char dummy_inst[MAX_INST_SIZE];
+
   for (auto &inst_offset : _inst_offsets) {
     entryID entry_id = intel_gpu_op_general;
     InstructionAPI::Operation op(entry_id, "", Arch_intelGen9);
@@ -23,16 +26,8 @@ void GPUBlock::getInsns(Insns &insns) const {
     auto offset = inst_offset.first;
     auto size = inst_offset.second;
 
-#if 0 
-// No longer support this path
-#ifdef DYNINST_INSTRUCTION_PTR
-    insns.insert(std::pair<long unsigned int, 
-      InstructionAPI::InstructionPtr>(offset, NULL));
-#endif
-#endif
-
-    InstructionAPI::Instruction inst(op, size, NULL, Arch_intelGen9);
-    insns.emplace(offset, std::move(inst));
+    InstructionAPI::Instruction inst(op, size, dummy_inst, Arch_intelGen9);
+    insns.emplace(offset, inst);
   }
 }
 

From d1e7945afa9cde3bba277fb6136afad797325930 Mon Sep 17 00:00:00 2001
From: Jokeren <robinho364@gmail.com>
Date: Sat, 24 Oct 2020 22:10:58 +0000
Subject: [PATCH 115/177] Assign correct instruction arch

---
 src/lib/banal/gpu/GPUBlock.cpp      | 9 +++++----
 src/lib/banal/gpu/GPUBlock.hpp      | 4 +++-
 src/lib/banal/gpu/GPUCFGFactory.cpp | 6 ++++--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/lib/banal/gpu/GPUBlock.cpp b/src/lib/banal/gpu/GPUBlock.cpp
index 6c452c4326..2f37e34aa4 100644
--- a/src/lib/banal/gpu/GPUBlock.cpp
+++ b/src/lib/banal/gpu/GPUBlock.cpp
@@ -7,8 +7,9 @@ namespace Dyninst {
 namespace ParseAPI {
 
 GPUBlock::GPUBlock(CodeObject * o, CodeRegion * r,
-  Address start, std::vector<std::pair<Offset, size_t>> &offsets) :
-  Block(o, r, start), _inst_offsets(offsets) {}
+  Address start, std::vector<std::pair<Offset, size_t>> &offsets,
+  Dyninst::Architecture arch) :
+  Block(o, r, start), _inst_offsets(offsets), _arch(arch) {}
 
 
 Address GPUBlock::last() const {
@@ -21,12 +22,12 @@ void GPUBlock::getInsns(Insns &insns) const {
 
   for (auto &inst_offset : _inst_offsets) {
     entryID entry_id = intel_gpu_op_general;
-    InstructionAPI::Operation op(entry_id, "", Arch_intelGen9);
+    InstructionAPI::Operation op(entry_id, "", _arch);
 
     auto offset = inst_offset.first;
     auto size = inst_offset.second;
 
-    InstructionAPI::Instruction inst(op, size, dummy_inst, Arch_intelGen9);
+    InstructionAPI::Instruction inst(op, size, dummy_inst, _arch);
     insns.emplace(offset, inst);
   }
 }
diff --git a/src/lib/banal/gpu/GPUBlock.hpp b/src/lib/banal/gpu/GPUBlock.hpp
index 658f89f0fa..91229aa14d 100644
--- a/src/lib/banal/gpu/GPUBlock.hpp
+++ b/src/lib/banal/gpu/GPUBlock.hpp
@@ -8,7 +8,8 @@ namespace ParseAPI {
 
 class PARSER_EXPORT GPUBlock : public Block {
  public:
-  GPUBlock(CodeObject * o, CodeRegion * r, Address start, std::vector<std::pair<Offset, size_t>> &offsets);
+  GPUBlock(CodeObject * o, CodeRegion * r, Address start,
+    std::vector<std::pair<Offset, size_t>> &offsets, Architecture arch);
 
   virtual ~GPUBlock() {}
 
@@ -19,6 +20,7 @@ class PARSER_EXPORT GPUBlock : public Block {
  private:
   // <offset, size> pair
   std::vector<std::pair<Offset, size_t>> _inst_offsets;
+  Architecture _arch;
 };
 
 }
diff --git a/src/lib/banal/gpu/GPUCFGFactory.cpp b/src/lib/banal/gpu/GPUCFGFactory.cpp
index 070f2e056f..911ddb2cb5 100644
--- a/src/lib/banal/gpu/GPUCFGFactory.cpp
+++ b/src/lib/banal/gpu/GPUCFGFactory.cpp
@@ -21,6 +21,8 @@ Function *GPUCFGFactory::mkfunc(Address addr, FuncSource src,
           std::hex << addr << std::dec << std::endl;
       }
       for (auto *block : function->blocks) {
+        auto arch = block->insts.front()->arch;
+
         GPUBlock *ret_block = NULL;
         // If a block has not been created by callers, create it
         // Otherwise get the block from _block_filter
@@ -32,7 +34,7 @@ Function *GPUCFGFactory::mkfunc(Address addr, FuncSource src,
           for (auto *inst : block->insts) {
             inst_offsets.emplace_back(std::make_pair(inst->offset, inst->size));
           }
-          ret_block = new GPUBlock(obj, region, block->address, inst_offsets);
+          ret_block = new GPUBlock(obj, region, block->address, inst_offsets, arch);
           _block_filter[block->id] = ret_block;
           blocks_.add(ret_block);
         } else {
@@ -59,7 +61,7 @@ Function *GPUCFGFactory::mkfunc(Address addr, FuncSource src,
             for (auto *inst : target->block->insts) {
               inst_offsets.push_back(std::make_pair(inst->offset, inst->size));
             }
-            ret_target_block = new GPUBlock(obj, region, target->block->address, inst_offsets);
+            ret_target_block = new GPUBlock(obj, region, target->block->address, inst_offsets, arch);
             _block_filter[target->block->id] = ret_target_block;
             blocks_.add(ret_target_block);
           } else {

From da12281fd666240f582451bf8bb8c1108473a56c Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 4 Nov 2020 17:48:45 -0600
Subject: [PATCH 116/177] PAPI: process only GPU components in GPU monitor callbacks

---
 src/tool/hpcrun/gpu/amd/roctracer-api.c |  8 +++-
 src/tool/hpcrun/sample-sources/papi-c.c | 64 ++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 29ea75ab65..7b8b9cab42 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -73,6 +73,10 @@
 // macros
 //******************************************************************************
 
+#define DEBUG 0
+#include <hpcrun/gpu/gpu-print.h>
+
+
 #define FORALL_ROCTRACER_ROUTINES(macro)      \
   macro(roctracer_open_pool_expl)   \
   macro(roctracer_enable_callback)  \
@@ -372,13 +376,13 @@ roctracer_subscriber_callback
     // Generate notification entry
     uint64_t cpu_submit_time = hpcrun_nanotime();
 
-    printf("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p \n", api_node);
+    PRINT("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p \n", api_node);
     int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
     gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=api_node, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_enter);
 
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
-    printf("\nACTIVITY_API_PHASE_EXIT -----------------| \n");
+    PRINT("\nACTIVITY_API_PHASE_EXIT -----------------| \n");
     int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
     gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=NULL, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_exit);
 
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index ad3d48ef21..ff728957dd 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -107,7 +107,7 @@
  * macros
  *****************************************************************************/
 
-#define DEBUG 0
+#define DEBUG 1
 
 #include <hpcrun/gpu/gpu-print.h>
 
@@ -145,11 +145,43 @@ static kind_info_t *papi_kind;
 static __thread gpu_monitor_fn_entry_t gpu_monitor_enter;
 static __thread gpu_monitor_fn_entry_t gpu_monitor_exit;
 
+typedef struct papi_mon_comp_t{
+  struct papi_mon_comp_t *next;
+  int idx;
+}papi_mon_comp_t;
+
+static papi_mon_comp_t *papi_mon_comp_list = NULL;
+
 /******************************************************************************
  * private operations 
  *****************************************************************************/
 static void papi_monitor_enter(void *reg_info, void *args_in);
 static void papi_monitor_exit(void *reg_info, void *args_in);
+
+static bool
+is_gpu_component(int cidx)
+{
+  const char* name = PAPI_get_component_info(cidx)->name;
+  if(strstr(name, "cuda") == name || strstr(name, "rocm")==name) {
+    return true;
+  }
+  return false;
+}
+
+
+static void
+papi_add_mon_comp(int cidx)
+{
+  if (is_gpu_component(cidx)){
+    papi_mon_comp_t *new_comp = hpcrun_malloc(sizeof(papi_mon_comp_t));
+    new_comp->next = papi_mon_comp_list;
+    new_comp->idx = cidx;
+
+    papi_mon_comp_list = new_comp;
+  }
+}
+
+
 static void
 gpu_metrics_attribute_papi(int metric_id, cct_node_t *cct_node, long long value);
 
@@ -671,6 +703,8 @@ METHOD_FN(gen_event_set, int lush_metrics)
     ci->sync_start = sync_start_for_component(i);
     ci->sync_stop = sync_stop_for_component(i);
     memset(ci->prev_values, 0, sizeof(ci->prev_values));
+
+    papi_add_mon_comp(i);
   }
 
   // record the component state in thread state
@@ -1048,22 +1082,24 @@ papi_monitor_enter(void *reg_info, void *args_in)
   sample_source_t *self = &obj_name(); /// just for debug
   int ret;
 
-  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
+//  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
   cct_node = args->cct_node;
 
-  if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
-    args->gpu_sync_ptr();
-
   // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
-  for (int cid = 0; cid < psi->num_components; ++cid) {
+  
+  for ( papi_mon_comp_t *it = papi_mon_comp_list; it != NULL; it = it->next) {
+    int cid = it->idx;
     papi_component_info_t *ci = &(psi->component_info[cid]);
+    
     if (ci->inUse) {
-      PRINT("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
+      if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
+        args->gpu_sync_ptr();
 
+      PRINT("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
       ret = PAPI_read(ci->eventSet, prev_values);
       //      ret = PAPI_start(ci->eventSet);
 
@@ -1093,18 +1129,22 @@ papi_monitor_exit(void *reg_info, void *args_in)
   int my_event_count = MAX_EVENTS;
   int ret;
 
-  PRINT("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
-
-  if (args->gpu_sync_ptr)
-    args->gpu_sync_ptr();
+//  PRINT("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
   // Collect counters for components in use
-  for (int cid = 0; cid < psi->num_components; ++cid) {
+  for ( papi_mon_comp_t *it = papi_mon_comp_list; it != NULL; it = it->next) {
+    int cid = it->idx;
+
     papi_component_info_t *ci = &(psi->component_info[cid]);
     if (ci->inUse){
+
+      if (args->gpu_sync_ptr)
+        args->gpu_sync_ptr();
+
+
       ret = PAPI_read(ci->eventSet, my_event_values);
 
       if (ret != PAPI_OK) {

From 8a2511e7f3d1f824e5dd59209fd4bcfcc9966d3b Mon Sep 17 00:00:00 2001
From: Aaron Thomas Cherian <atc8@gpu.cs.rice.edu>
Date: Mon, 16 Nov 2020 17:16:24 -0600
Subject: [PATCH 117/177] Add guards/comments in Intel IGA code. With this,
 we can run OpenCL examples when the igc, gtpin, and metrics-discovery
 (Intel-instrumentation-specific) paths are not passed to the configure script

---
 configure                                    | 3 +++
 configure.ac                                 | 3 +++
 src/lib/banal/Struct.cpp                     | 6 ++++++
 src/lib/banal/gpu/ReadIntelCFG.cpp           | 7 +++++++
 src/lib/binutils/InputFile.cpp               | 5 ++++-
 src/lib/binutils/intel/IntelGPUBinutils.cpp  | 3 +++
 src/lib/binutils/intel/gen_binary_decoder.h  | 3 +++
 src/lib/binutils/intel/gen_symbols_decoder.h | 5 +++++
 src/lib/binutils/intel/igc_binary_decoder.h  | 2 ++
 src/tool/hpcrun/gpu/opencl/opencl-api.c      | 3 ++-
 10 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 2afaa96ccc..24ed2729d6 100755
--- a/configure
+++ b/configure
@@ -23843,6 +23843,7 @@ $as_echo "$OPENCL" >&6; }
 # Option: --with-igc=PATH
 #-------------------------------------------------
 
+IGC=no
 OPT_HAVE_IGC=no
 OPT_IGC_IFLAGS=
 OPT_IGC_LDFLAGS=
@@ -23918,6 +23919,7 @@ $as_echo "$IGC" >&6; }
 # Option: --with-metrics-discovery=PATH
 #-------------------------------------------------
 
+METRICS_DISCOVERY=no
 OPT_HAVE_METRICS_DISCOVERY=no
 OPT_METRICS_DISCOVERY_IFLAGS=
 OPT_METRICS_DISCOVERY_LDFLAGS=
@@ -23977,6 +23979,7 @@ $as_echo "$METRICS_DISCOVERY" >&6; }
 # Option: --with-gtpin=PATH
 #-------------------------------------------------
 
+GTPIN=no
 OPT_HAVE_GTPIN=no
 OPT_GTPIN_IFLAGS=
 OPT_GTPIN_LDFLAGS=
diff --git a/configure.ac b/configure.ac
index 6da42a92b9..e45484324b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4881,6 +4881,7 @@ AC_SUBST([OPT_OPENCL_IFLAGS])
 # Option: --with-igc=PATH
 #-------------------------------------------------
 
+IGC=no
 OPT_HAVE_IGC=no
 OPT_IGC_IFLAGS=
 OPT_IGC_LDFLAGS=
@@ -4946,6 +4947,7 @@ AC_SUBST([OPT_IGC_LDFLAGS])
 # Option: --with-metrics-discovery=PATH
 #-------------------------------------------------
 
+METRICS_DISCOVERY=no
 OPT_HAVE_METRICS_DISCOVERY=no
 OPT_METRICS_DISCOVERY_IFLAGS=
 OPT_METRICS_DISCOVERY_LDFLAGS=
@@ -4995,6 +4997,7 @@ AC_SUBST([OPT_METRICS_DISCOVERY_LDFLAGS])
 # Option: --with-gtpin=PATH
 #-------------------------------------------------
 
+GTPIN=no
 OPT_HAVE_GTPIN=no
 OPT_GTPIN_IFLAGS=
 OPT_GTPIN_LDFLAGS=
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 5a7bd17436..850b305c49 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -116,7 +116,10 @@
 #include "Struct-Skel.hpp"
 
 #include "gpu/ReadCudaCFG.hpp"
+
+#ifdef OPT_ENABLE_IGC
 #include "gpu/ReadIntelCFG.hpp"
+#endif // OPT_ENABLE_IGC
 
 #ifdef ENABLE_OPENMP
 #include <omp.h>
@@ -658,8 +661,11 @@ makeStructure(string filename,
     } else if (intel_file) { // don't run parseapi on intel binary
       // TODO(Aaron): determine which generation of intel gpu it is
       intel_gpu_arch = 1;
+
+      #ifdef OPT_ENABLE_IGC
       parsable = readIntelCFG(search_path, elfFile, the_symtab,
         structOpts.compute_gpu_cfg, &code_src, &code_obj);
+      #endif // OPT_ENABLE_IGC
     } else {
       code_src = new SymtabCodeSource(symtab);
       code_obj = new CodeObject(code_src);
diff --git a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
index f822d7c1a3..86cf859eb7 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -45,6 +45,8 @@
 
 //***************************************************************************
 
+#ifdef OPT_ENABLE_IGC
+
 //******************************************************************************
 // system includes
 //******************************************************************************
@@ -63,6 +65,8 @@
 
 #include <iga/kv.hpp>
 
+
+
 //******************************************************************************
 // local includes
 //******************************************************************************
@@ -124,6 +128,7 @@ addCustomFunctionObject
   assert(status == true);
 }
 
+
 static void
 parseIntelCFG
 (
@@ -256,3 +261,5 @@ readIntelCFG
 
   return false;
 }
+
+#endif // OPT_ENABLE_IGC
\ No newline at end of file
diff --git a/src/lib/binutils/InputFile.cpp b/src/lib/binutils/InputFile.cpp
index 44632d8387..1f2ece285a 100644
--- a/src/lib/binutils/InputFile.cpp
+++ b/src/lib/binutils/InputFile.cpp
@@ -188,7 +188,9 @@ InputFile::openFile
   if (result) {
     filevector->push_back(elfFile);
     //findCubins(elfFile, filevector);
-  } else if (!findIntelGPUBins(filename, file_buffer, f_size, filevector)) { // Check if the file is a intel debug binary
+  } 
+  #ifdef OPT_ENABLE_IGC
+  else if (!findIntelGPUBins(filename, file_buffer, f_size, filevector)) { // Check if the file is a intel debug binary
     // Release memory
     delete(elfFile);
     DIAG_MsgIf_GENERIC(tag, 1, "Not an ELF binary " << filename);
@@ -196,6 +198,7 @@ InputFile::openFile
     // Not a standard elf file
     return false;
   }
+  #endif // OPT_ENABLE_IGC
 
   return result;
 }
diff --git a/src/lib/binutils/intel/IntelGPUBinutils.cpp b/src/lib/binutils/intel/IntelGPUBinutils.cpp
index 26d950e224..ccbe5afe9c 100644
--- a/src/lib/binutils/intel/IntelGPUBinutils.cpp
+++ b/src/lib/binutils/intel/IntelGPUBinutils.cpp
@@ -152,6 +152,7 @@ computeHash
 // interface operations
 //******************************************************************************
 
+#ifdef OPT_ENABLE_IGC
 bool
 findIntelGPUBins
 (
@@ -215,3 +216,5 @@ findIntelGPUBins
 
   return true;
 }
+
+#endif // OPT_ENABLE_IGC
diff --git a/src/lib/binutils/intel/gen_binary_decoder.h b/src/lib/binutils/intel/gen_binary_decoder.h
index 349c63857b..1dac1ea278 100644
--- a/src/lib/binutils/intel/gen_binary_decoder.h
+++ b/src/lib/binutils/intel/gen_binary_decoder.h
@@ -28,6 +28,8 @@
 #include <vector>
 #include <string>
 
+
+#ifdef OPT_ENABLE_IGC
 #include <iga/kv.hpp>
 
 #define MAX_STR_SIZE 1024
@@ -72,5 +74,6 @@ class GenBinaryDecoder {
  private:
   KernelView kernel_view_;
 };
+#endif // OPT_ENABLE_IGC
 
 #endif // PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_
diff --git a/src/lib/binutils/intel/gen_symbols_decoder.h b/src/lib/binutils/intel/gen_symbols_decoder.h
index 601d3a2ae5..4553efe94e 100644
--- a/src/lib/binutils/intel/gen_symbols_decoder.h
+++ b/src/lib/binutils/intel/gen_symbols_decoder.h
@@ -25,13 +25,17 @@
 
 #include <vector>
 
+#ifdef OPT_ENABLE_IGC
 #include <igc/ocl_igc_shared/executable_format/program_debug_data.h>
+#endif // OPT_ENABLE_IGC
+
 
 #include "elf_parser.h"
 
 #define IS_POWER_OF_TWO(X) (!((X - 1)&X))
 #define IGC_MAX_VALUE 1024
 
+#ifdef OPT_ENABLE_IGC
 class GenSymbolsDecoder {
  public:
   GenSymbolsDecoder(const std::vector<uint8_t>& symbols)
@@ -120,5 +124,6 @@ class GenSymbolsDecoder {
   const uint8_t* data_ = nullptr;
   size_t size_ = 0;
 };
+#endif // OPT_ENABLE_IGC
 
 #endif // PTI_SAMPLES_UTILS_GEN_SYMBOLS_DECODER_H_
diff --git a/src/lib/binutils/intel/igc_binary_decoder.h b/src/lib/binutils/intel/igc_binary_decoder.h
index 77f00cad0d..5acd89a228 100644
--- a/src/lib/binutils/intel/igc_binary_decoder.h
+++ b/src/lib/binutils/intel/igc_binary_decoder.h
@@ -25,6 +25,7 @@
 
 #include <memory.h>
 
+#ifdef OPT_ENABLE_IGC
 #include <igc/ocl_igc_shared/executable_format/patch_list.h>
 
 #include <metrics_discovery_internal_api.h>
@@ -113,5 +114,6 @@ class IgcBinaryDecoder {
 private:
     std::vector<uint8_t> binary_;
 };
+#endif // OPT_ENABLE_IGC
 
 #endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 81b8aaed12..569978511a 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -919,7 +919,7 @@ clCreateProgramWithSource
   return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
 }
 
-
+#ifdef OPT_ENABLE_IGC
 // one downside of this appproach is that we may override the callback provided by user
 cl_int
 clBuildProgram
@@ -946,6 +946,7 @@ clBuildProgram
   free(options_with_debug_flags);
   return ret;
 }
+#endif // OPT_ENABLE_IGC
 
 
 cl_command_queue

From 80f0aff313a532169330da3ac1c9788c3ced56bd Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 23 Nov 2020 13:30:10 -0600
Subject: [PATCH 118/177] checkpoint 1: add per-component PAPI start/read/stop hooks

---
 src/tool/hpcrun/gpu/gpu-activity-process.c    |  4 +-
 src/tool/hpcrun/sample-sources/papi-c-cupti.c | 31 ++++++----
 .../sample-sources/papi-c-extended-info.c     | 60 +++++++++++++++----
 .../sample-sources/papi-c-extended-info.h     | 11 ++--
 src/tool/hpcrun/sample-sources/papi-c-rocm.c  |  4 +-
 src/tool/hpcrun/sample-sources/papi-c.c       | 60 ++++++++++++-------
 src/tool/hpcrun/sample-sources/papi-c.h       |  5 +-
 src/tool/hpcrun/sample-sources/ss-list.h      |  4 +-
 8 files changed, 125 insertions(+), 54 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index ffaa2738bb..6b4b97d355 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -72,7 +72,7 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 1
+#define DEBUG 0
 
 #include "gpu-print.h"
 
@@ -678,7 +678,7 @@ gpu_activity_process
     break;
 
   case GPU_ACTIVITY_SYNCHRONIZATION:
-    gpu_synchronization_process(ga);
+//    gpu_synchronization_process(ga);
     break;
 
   case GPU_ACTIVITY_MEMORY:
diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
index 0cea3bcb0d..a2a9c2e352 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
@@ -287,9 +287,9 @@ papi_c_cupti_setup(void)
   local.event_set = get_component_event_set(psi, cuda_component_idx);
 
   Cupti_call(dcuptiSubscribe, &subscriber,
-             (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback, 
+             (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
              &local);
-             
+
   Cupti_call(dcuptiEnableCallback, 1, subscriber,
              CUPTI_CB_DOMAIN_RUNTIME_API,
              CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
@@ -312,7 +312,7 @@ papi_c_cupti_get_event_set(int* ev_s)
     TMSG(CUDA, "No event set created, so create one");
     int ret = PAPI_create_eventset(ev_s);
     if (ret != PAPI_OK) {
-      hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", 
+      hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
                    ret, PAPI_strerror(ret));
     }
     local.event_set = *ev_s;
@@ -348,6 +348,16 @@ papi_c_cupti_finalize_event_set(void)
   spinlock_unlock(&setup_lock);
 }
 
+void
+papi_c_cupti_read(int EventSet, long long *values)
+{
+  int ret = PAPI_read(EventSet, values);
+  if (ret != PAPI_OK) {
+    EMSG("PAPI_read of event set %d failed with %s (%d)",
+         EventSet, PAPI_strerror(ret), ret);
+  }
+}
+
 
 //
 // sync teardown for cuda/cupti
@@ -368,13 +378,14 @@ papi_c_cupti_teardown(void)
 
 static sync_info_list_t cuda_component = {
   .pred = is_papi_c_cuda,
-  .get_event_set = papi_c_cupti_get_event_set,
-  .add_event = papi_c_cupti_add_event,
-  .finalize_event_set = papi_c_cupti_finalize_event_set,
-  .sync_setup = papi_c_cupti_setup,
-  .sync_teardown = papi_c_cupti_teardown,
-  .sync_start = papi_c_no_action,
-  .sync_stop = papi_c_no_action,
+  .get_event_set = NULL, //papi_c_cupti_get_event_set,
+  .add_event = NULL, //papi_c_cupti_add_event,
+  .finalize_event_set = NULL, //papi_c_cupti_finalize_event_set,
+  .sync_setup = NULL, //papi_c_cupti_setup,
+  .sync_teardown = NULL, //papi_c_cupti_teardown,
+  .start = NULL, //papi_c_no_action,
+  .sync_read = NULL,
+  .stop = NULL, //papi_c_no_action,
   .process_only = true,
   .next = NULL,
 };
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
index 9d49d892bd..81ebf10e24 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
@@ -14,28 +14,53 @@ papi_c_sync_register(sync_info_list_t* info)
   registered_sync_components = info;
 }
 
+
 void
 no_action(void)
 {
 }
 
+
 void
 std_get_event_set(int* ev_s)
 {
   int ret = PAPI_create_eventset(ev_s);
-  TMSG(PAPI,"PAPI_create_eventset = %d, eventSet = %d", ret, *ev_s);
+  TMSG(PAPI,"PAPI_create_eventset = %d, ev_s = %d", ret, *ev_s);
   if (ret != PAPI_OK) {
     hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", 
                  ret, PAPI_strerror(ret));
   }
 }
 
+
 int
 std_add_event(int ev_s, int ev)
 {
   return PAPI_add_event(ev_s, ev);
 }
 
+
+int
+std_start(int ev_s)
+{
+  return PAPI_start(ev_s);
+}
+
+
+int
+std_read_event(int ev_s, long long *values)
+{
+  return PAPI_read(ev_s, values);
+}
+
+
+int
+std_stop(int ev_s, long long *values)
+{
+  return PAPI_stop(ev_s, values);
+}
+
+
 get_event_set_proc_t
 component_get_event_set(int cidx)
 {
@@ -43,11 +68,12 @@ component_get_event_set(int cidx)
   
   TMSG(PAPI, "looking for sync get_event_set for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->get_event_set;
+    if (item->pred(name) && item->get_event_set != NULL) return item->get_event_set;
   }
   return std_get_event_set;
 }
 
+
 add_event_proc_t
 component_add_event_proc(int cidx)
 {
@@ -55,7 +81,7 @@ component_add_event_proc(int cidx)
   
   TMSG(PAPI, "looking for sync add_event for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->add_event;
+    if (item->pred(name) && item->add_event != NULL) return item->add_event;
   }
   return std_add_event;
 }
@@ -67,7 +93,7 @@ component_finalize_event_set(int cidx)
   
   TMSG(PAPI, "looking for sync finalize_event_set for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->finalize_event_set;
+    if (item->pred(name) && item->finalize_event_set != NULL) return item->finalize_event_set;
   }
   return no_action;
 }
@@ -94,7 +120,7 @@ sync_setup_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync setup for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->sync_setup;
+    if (item->pred(name) && item->sync_setup != NULL) return item->sync_setup;
   }
   return no_action;
 }
@@ -106,7 +132,7 @@ sync_teardown_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync teardown for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->sync_teardown;
+    if (item->pred(name) && item->sync_teardown != NULL) return item->sync_teardown;
   }
   return no_action;
 }
@@ -118,11 +144,25 @@ sync_start_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->sync_start;
+    if (item->pred(name) && item->start != NULL) return item->start;
   }
-  return no_action;
+  return std_start;
+}
+
+
+read_proc_t
+sync_read_for_component(int cidx)
+{
+  const char* name = PAPI_get_component_info(cidx)->name;
+
+  TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name);
+  for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
+    if (item->pred(name) && item->sync_read != NULL) return item->sync_read;
+  }
+  return std_read_event;
 }
 
+
 stop_proc_t
 sync_stop_for_component(int cidx)
 {
@@ -130,7 +170,7 @@ sync_stop_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync stop for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->sync_stop;
+    if (item->pred(name) && item->stop != NULL) return item->stop;
   }
-  return no_action;
+  return std_stop;
 }
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
index 1636a3f631..522848b507 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
@@ -6,8 +6,9 @@ typedef int (*add_event_proc_t)(int ev_s, int evcode);
 typedef void (*finalize_event_set_proc_t)(void);
 typedef void (*setup_proc_t)(void);
 typedef void (*teardown_proc_t)(void);
-typedef void (*start_proc_t)(void);
-typedef void (*stop_proc_t)(void);
+typedef int (*start_proc_t)(int ev_s);
+typedef int (*read_proc_t)(int ev_s, long long *values);
+typedef int (*stop_proc_t)(int ev_s, long long *values);
 typedef bool (*pred_proc_t)(const char* name);
 
 typedef struct sync_info_list_t {
@@ -17,8 +18,9 @@ typedef struct sync_info_list_t {
   const finalize_event_set_proc_t finalize_event_set;
   const setup_proc_t sync_setup;
   const teardown_proc_t sync_teardown;
-  const start_proc_t sync_start;
-  const stop_proc_t sync_stop;
+  const start_proc_t start;
+  const read_proc_t sync_read;
+  const stop_proc_t stop;
   const bool process_only;
   struct sync_info_list_t* next;
 } sync_info_list_t;
@@ -30,6 +32,7 @@ extern finalize_event_set_proc_t component_finalize_event_set(int cidx);
 extern setup_proc_t sync_setup_for_component(int cidx);
 extern teardown_proc_t sync_teardown_for_component(int cidx);
 extern start_proc_t sync_start_for_component(int cidx);
+extern read_proc_t sync_read_for_component(int cidx);
 extern stop_proc_t sync_stop_for_component(int cidx);
 extern void papi_c_sync_register(sync_info_list_t* info);
 
diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
index 94f56203ff..8ea95bcb3a 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-rocm.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
@@ -371,8 +371,8 @@ static sync_info_list_t cuda_component = {
   .finalize_event_set = papi_c_cupti_finalize_event_set,
   .sync_setup = papi_c_cupti_setup,
   .sync_teardown = papi_c_cupti_teardown,
-  .sync_start = papi_c_no_action,
-  .sync_stop = papi_c_no_action,
+  .start = papi_c_no_action,
+  .stop = papi_c_no_action,
   .process_only = true,
   .next = NULL,
 };
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index ab678c146d..8038079240 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -150,7 +150,7 @@ typedef struct papi_mon_comp_t{
   int idx;
 }papi_mon_comp_t;
 
-static papi_mon_comp_t *papi_mon_comp_list = NULL;
+static __thread papi_mon_comp_t *papi_mon_comp_list = NULL;
 
 /******************************************************************************
  * private operations 
@@ -265,6 +265,8 @@ strip_papi_prefix(const char *str)
   return str;
 }
 
+static atomic_uint stop_papi_flag = { 0 };
+
 static void
 METHOD_FN(init)
 {
@@ -389,7 +391,7 @@ METHOD_FN(start)
     if (ci->inUse) {
       if (component_uses_sync_samples(cidx)) {
   TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx);
-  ci->sync_start();
+  ci->start(ci->eventSet);
       }
       else {
   TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx);
@@ -473,12 +475,16 @@ METHOD_FN(stop)
   long_long values[nevents+2];
   //  long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
 
-  int ret = PAPI_stop(ci->eventSet, values);
-  if (ret != PAPI_OK){
-    EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
-         ci->eventSet, ret, PAPI_strerror(ret));
-  }
 
+  if(atomic_fetch_add(&stop_papi_flag, 1) == 0) {
+    //TODO: PAPI_stop is called from monitor_fini_thread and monitor_fini_process -> PAPI_error
+
+    int ret = PAPI_stop(ci->eventSet, values);
+    if (ret != PAPI_OK) {
+      EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
+           ci->eventSet, ret, PAPI_strerror(ret));
+    }
+  }
       }
     }
   }
@@ -501,7 +507,7 @@ METHOD_FN(shutdown)
   }while(0);
   // FIXME: add component shutdown code here
 
-  PAPI_shutdown();
+//  PAPI_shutdown();
 
   self->state = UNINIT;
 finish:
@@ -702,8 +708,9 @@ METHOD_FN(gen_event_set, int lush_metrics)
     ci->is_sync = component_uses_sync_samples(i);
     ci->sync_setup = sync_setup_for_component(i);
     ci->sync_teardown = sync_teardown_for_component(i);
-    ci->sync_start = sync_start_for_component(i);
-    ci->sync_stop = sync_stop_for_component(i);
+    ci->start = sync_start_for_component(i);
+    ci->sync_read = sync_read_for_component(i);
+    ci->stop = sync_stop_for_component(i);
     memset(ci->prev_values, 0, sizeof(ci->prev_values));
 
     papi_add_mon_comp(i);
@@ -1074,6 +1081,21 @@ papi_event_handler(int event_set, void *pc, long long ovec,
 static __thread cct_node_t *cct_node;
 static __thread long long prev_values[MAX_EVENTS];
 
+static void
+papi_insert_cct(cct_node_t *api_node){
+
+//  gpu_op_ccts_t gpu_op_ccts;
+//
+//  hpcrun_safe_enter();
+//  gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_placeholder_type_sync);
+//  hpcrun_safe_exit();
+//
+//  cupti_papi_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_sync);
+//
+//  gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
+//                                  cpu_submit_time);
+}
+
 static void
 papi_monitor_enter(void *reg_info, void *args_in)
 {
@@ -1092,24 +1114,23 @@ papi_monitor_enter(void *reg_info, void *args_in)
   cct_node = args->cct_node;
 
   // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
-  
+
   for ( papi_mon_comp_t *it = papi_mon_comp_list; it != NULL; it = it->next) {
     int cid = it->idx;
     papi_component_info_t *ci = &(psi->component_info[cid]);
-    
+
     if (ci->inUse) {
       if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
         args->gpu_sync_ptr();
 
       PRINT("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
-      ret = PAPI_read(ci->eventSet, prev_values);
-      //      ret = PAPI_start(ci->eventSet);
-
+      ret = ci->sync_read(ci->eventSet, prev_values);
       if (ret != PAPI_OK) {
         EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
              ci->eventSet, cid, PAPI_strerror(ret), ret);
       }
 
+      PRINT("ENTER_read Event = %d, value = %lld \n", ci->eventSet, prev_values[0]);
     }
   }
 
@@ -1146,9 +1167,7 @@ papi_monitor_exit(void *reg_info, void *args_in)
       if (args->gpu_sync_ptr)
         args->gpu_sync_ptr();
 
-
-      ret = PAPI_read(ci->eventSet, my_event_values);
-
+      ret = ci->sync_read(ci->eventSet, my_event_values);
       if (ret != PAPI_OK) {
         EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
              ci->eventSet, cid, PAPI_strerror(ret), ret);
@@ -1165,7 +1184,7 @@ papi_monitor_exit(void *reg_info, void *args_in)
         int event_index = get_event_index(self, my_event_codes[eid]);
         int metric_id = hpcrun_event2metric(self, event_index);
 
-        PRINT("%d Event = %x, event_index = %d, metric_id = %d || value = %llu ---> %llu\n",
+        PRINT("%d Event = %x, event_index = %d, metric_id = %d || value = %lld ---> %lld\n",
                eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
 
         blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
@@ -1196,7 +1215,4 @@ gpu_metrics_attribute_papi
   hpcrun_metric_std_inc(metric_id,
                         metrics,
                         (cct_metric_data_t) {.i = value});
-
-
-//  gpu_context_trace(context_id, &entry_trace);
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index df4cfbe101..cccc1c819e 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -71,8 +71,9 @@ typedef struct {
   get_event_set_proc_t get_event_set;
   add_event_proc_t add_event;
   finalize_event_set_proc_t finalize_event_set;
-  start_proc_t sync_start;
-  stop_proc_t sync_stop;
+  start_proc_t start;
+  read_proc_t sync_read;
+  stop_proc_t stop;
   setup_proc_t sync_setup;
   teardown_proc_t sync_teardown;
 } papi_component_info_t;
diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index 7fd9eecb5c..3d6d3a7cc3 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -80,9 +80,9 @@ SAMPLE_SOURCE_DECL_MACRO(directed_blame)
 SAMPLE_SOURCE_DECL_MACRO(retcnt)
 #endif
 
-#ifdef HPCRUN_SS_PAPI_C_CUPTI
+//#ifdef HPCRUN_SS_PAPI_C_CUPTI
 SAMPLE_SOURCE_DECL_MACRO(papi_c_cupti)
-#endif
+//#endif
 
 #ifdef HPCRUN_SS_NVIDIA
 SAMPLE_SOURCE_DECL_MACRO(nvidia_gpu)

From e85d88b48f52188355fd42e720f20f21de7eadcb Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 23 Nov 2020 20:45:15 -0600
Subject: [PATCH 119/177] checkpoint 2

---
 src/tool/hpcrun/gpu-monitors.c                |  30 +-
 src/tool/hpcrun/gpu-monitors.h                |  24 +-
 src/tool/hpcrun/gpu/amd/roctracer-api.c       |   4 +-
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        |   8 +-
 src/tool/hpcrun/sample-sources/papi-c-cupti.c | 286 ++++++++-------
 .../sample-sources/papi-c-extended-info.c     |  80 ++--
 .../sample-sources/papi-c-extended-info.h     |  18 +-
 src/tool/hpcrun/sample-sources/papi-c-rocm.c  |  14 +-
 src/tool/hpcrun/sample-sources/papi-c.c       | 342 ++++++++----------
 src/tool/hpcrun/sample-sources/papi-c.h       |   7 +-
 10 files changed, 401 insertions(+), 412 deletions(-)

diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
index 3ffd5e8d55..df48579e6f 100644
--- a/src/tool/hpcrun/gpu-monitors.c
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -3,24 +3,34 @@
 //
 
 #include "gpu-monitors.h"
+#include "hpcrun-malloc.h"
 
-static gpu_monitor_fn_entry_t *kinds[2] = {0, 0};
+
+static __thread gpu_monitor_node_t *gpu_monitor_list = NULL;
 
 void
-gpu_monitor_register(gpu_monitor_type_t type, gpu_monitor_fn_entry_t *entry)
+gpu_monitor_register(	gpu_monitor_node_t node)
 {
-	gpu_monitor_fn_entry_t* device_fn = kinds[type];
-	entry->next = device_fn;
-	kinds[type] = entry;
+  gpu_monitor_node_t* new_node = hpcrun_malloc(sizeof(gpu_monitor_node_t));
+  new_node->component = node.component;
+  new_node->enter_fn = node.enter_fn;
+  new_node->exit_fn = node.exit_fn;
+  new_node->next = gpu_monitor_list;
+  gpu_monitor_list = new_node;
 }
 
 
 void
-gpu_monitors_apply(void *args_in, gpu_monitor_type_t type)
+gpu_monitors_apply(gpu_monitor_apply_t *args_in, gpu_monitor_type_t type)
 {
-	gpu_monitor_fn_entry_t* fn = kinds[type];
-	while (fn != 0) {
-		fn->fn(fn->reg_info, args_in);
-		fn = fn->next;
+  gpu_monitor_node_t *node = gpu_monitor_list;
+
+	while (node != NULL) {
+    if (type == gpu_monitor_type_enter)
+      node->enter_fn(node->component, args_in);
+    else if (type == gpu_monitor_type_exit)
+      node->exit_fn(node->component, args_in);
+
+		node = node->next;
 	}
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
index 15a63542d1..2ed8ce3974 100644
--- a/src/tool/hpcrun/gpu-monitors.h
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -7,7 +7,6 @@
 
 #include "cct.h"
 
-typedef void (*gpu_monitor_fn_t)(void* reg_info, void* args_in);
 
 typedef enum {
 	gpu_monitor_type_enter,
@@ -15,21 +14,24 @@ typedef enum {
 } gpu_monitor_type_t;
 
 
-typedef struct gpu_monitors_apply_t {
+typedef struct gpu_monitor_apply_t {
 	cct_node_t *cct_node;
-  int (*gpu_sync_ptr)(void);
-} gpu_monitors_apply_t;
+  const char *name;
+} gpu_monitor_apply_t;
 
 
-typedef struct gpu_monitor_fn_entry_t {
-	struct gpu_monitor_fn_entry_t* next;
-	gpu_monitor_fn_t fn;
-	void* reg_info;
-} gpu_monitor_fn_entry_t;
+typedef void (*gpu_monitor_fn_t)(void* component, gpu_monitor_apply_t* args_in);
 
+typedef struct gpu_monitor_node_t {
+	struct gpu_monitor_node_t * next;
+	void *component;
+	gpu_monitor_fn_t enter_fn;
+  gpu_monitor_fn_t exit_fn;
+} gpu_monitor_node_t;
 
-extern void gpu_monitor_register(gpu_monitor_type_t type, gpu_monitor_fn_entry_t* entry);
-extern void gpu_monitors_apply(void *args, gpu_monitor_type_t type);
+
+extern void gpu_monitor_register(gpu_monitor_node_t node);
+extern void gpu_monitors_apply(gpu_monitor_apply_t *args, gpu_monitor_type_t type);
 
 
 #endif //HPCTOOLKIT_GPU_MONITORS_H
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index ad9f708615..f273ccfc9c 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -451,13 +451,13 @@ roctracer_subscriber_callback
 
     PRINT("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p \n", api_node);
     int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
-    gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=api_node, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_enter);
+    gpu_monitors_apply(&(gpu_monitor_apply_t) {.cct_node=api_node, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_enter);
 
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
     PRINT("\nACTIVITY_API_PHASE_EXIT -----------------| \n");
     int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
-    gpu_monitors_apply(&(gpu_monitors_apply_t) {.cct_node=NULL, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_exit);
+    gpu_monitors_apply(&(gpu_monitor_apply_t) {.cct_node=NULL, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_exit);
 
   }else{
     ;
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index eb037e4204..a54e83ad75 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -1019,7 +1019,7 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=api_node,.gpu_sync_ptr=NULL}, gpu_monitor_type_enter);
+				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=api_node, .name="cuda"}, gpu_monitor_type_enter);
 
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
@@ -1031,7 +1031,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=NULL,.gpu_sync_ptr=NULL}, gpu_monitor_type_exit);
+				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=NULL, .name="cuda"}, gpu_monitor_type_exit);
 
 
 			}
@@ -1182,7 +1182,7 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_sync_ptr=NULL}, gpu_monitor_type_enter);
+				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=cupti_kernel_ph, .name="cuda"}, gpu_monitor_type_enter);
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
@@ -1196,7 +1196,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Runtime pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				gpu_monitors_apply( &(gpu_monitors_apply_t){.cct_node=cupti_kernel_ph, .gpu_sync_ptr=NULL}, gpu_monitor_type_exit);
+				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=cupti_kernel_ph, .name="cuda"}, gpu_monitor_type_exit);
 
         cupti_kernel_ph = NULL;
         cupti_trace_ph = NULL;
diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
index a2a9c2e352..bc2018880a 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
@@ -156,94 +156,94 @@ is_papi_c_cuda(const char* name)
   return strstr(name, "cuda") == name;
 }
 
-static void CUPTIAPI
-hpcrun_cuda_kernel_callback(void* userdata,
-			    CUpti_CallbackDomain domain,
-			    CUpti_CallbackId cbid, 
-			    const CUpti_CallbackData* cbInfo)
-{
-  TMSG(CUDA, "Got Kernel Callback");
-
-  papi_cuda_data_t* cuda_data = userdata;
-  int nevents = cuda_data->nevents;
-  int cudaEventSet = cuda_data->event_set;
-  sample_source_t* self = cuda_data->self;
-
-
-  TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet);
-
-  // This callback is enabled only for kernel launch; anything else is an error.
-  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
-    hpcrun_abort("CUDA CUPTI callback seen for unexpected "
-		 "interface operation: callback id  %d\n", cbid); 
-  }
-
-  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-    TMSG(CUDA, "Cupti API -ENTER- portion");
-    // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize
-      // exclusive access to launcher
-    spinlock_lock(&cupti_lock);
-    TMSG(CUPTI, "-ACQ-lock");
-    dcudaThreadSynchronize();
-
-    TMSG(CUPTI,"-- PRE launch callback");
-    TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet);
-    int ret = PAPI_start(cudaEventSet);
-    if (ret != PAPI_OK){
-      EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)", 
-	   PAPI_strerror(ret), ret);
-    }
-  }
-  TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion");
-
-
-  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
-    TMSG(CUDA, "Cupti API -EXIT- portion");
-    // MC recommends Use cudaDeviceSynchronize
-    dcudaThreadSynchronize();
-    TMSG(CUPTI, "-- POST launch callback");
-    long_long eventValues[nevents+2];
-    
-    TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
-    int ret = PAPI_stop(cudaEventSet, eventValues);
-    if (ret != PAPI_OK){
-      EMSG("CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)", 
-	   PAPI_strerror(ret), ret);
-    }  
-    TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
-
-    ucontext_t uc;
-    TMSG(CUDA,"getting context in CUDA event handler");
-    getcontext(&uc);
-    TMSG(CUDA,"got context in CUDA event handler");
-    bool safe = hpcrun_safe_enter();
-    TMSG(CUDA,"blocked async event in CUDA event handler");
-    {
-      int i;
-      for (i = 0; i < nevents; i++) 
-	{
-	  int metric_id = hpcrun_event2metric(self, i);
-
-	  TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
-	  hpcrun_sample_callpath(&uc, metric_id, (hpcrun_metricVal_t){.i=eventValues[i]}/*metricIncr*/,
-				 CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/, 
-				 0/*isSync*/, NULL);
-
-
-	  TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
-	}
-    }
-    TMSG(CUDA,"unblocking async event in CUDA event handler");
-    if (safe) hpcrun_safe_exit();
-    TMSG(CUDA,"unblocked async event in CUDA event handler");
-
-    spinlock_unlock(&cupti_lock);
-    TMSG(CUPTI,"-REL-lock\n");
-  }
-  TMSG(CUDA, "At end (past -EXIT-)");
-}
+//static void CUPTIAPI
+//hpcrun_cuda_kernel_callback(void* userdata,
+//			    CUpti_CallbackDomain domain,
+//			    CUpti_CallbackId cbid,
+//			    const CUpti_CallbackData* cbInfo)
+//{
+//  TMSG(CUDA, "Got Kernel Callback");
+//
+//  papi_cuda_data_t* cuda_data = userdata;
+//  int nevents = cuda_data->nevents;
+//  int cudaEventSet = cuda_data->event_set;
+//  sample_source_t* self = cuda_data->self;
+//
+//
+//  TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet);
+//
+//  // This callback is enabled only for kernel launch; anything else is an error.
+//  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
+//    hpcrun_abort("CUDA CUPTI callback seen for unexpected "
+//		 "interface operation: callback id  %d\n", cbid);
+//  }
+//
+//  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
+//    TMSG(CUDA, "Cupti API -ENTER- portion");
+//    // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize
+//      // exclusive access to launcher
+//    spinlock_lock(&cupti_lock);
+//    TMSG(CUPTI, "-ACQ-lock");
+//    dcudaThreadSynchronize();
+//
+//    TMSG(CUPTI,"-- PRE launch callback");
+//    TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet);
+//    int ret = PAPI_start(cudaEventSet);
+//    if (ret != PAPI_OK){
+//      EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)",
+//	   PAPI_strerror(ret), ret);
+//    }
+//  }
+//  TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion");
+//
+//
+//  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
+//    TMSG(CUDA, "Cupti API -EXIT- portion");
+//    // MC recommends Use cudaDeviceSynchronize
+//    dcudaThreadSynchronize();
+//    TMSG(CUPTI, "-- POST launch callback");
+//    long_long eventValues[nevents+2];
+//
+//    TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
+//    int ret = PAPI_stop(cudaEventSet, eventValues);
+//    if (ret != PAPI_OK){
+//      EMSG("CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)",
+//	   PAPI_strerror(ret), ret);
+//    }
+//    TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
+//
+//    ucontext_t uc;
+//    TMSG(CUDA,"getting context in CUDA event handler");
+//    getcontext(&uc);
+//    TMSG(CUDA,"got context in CUDA event handler");
+//    bool safe = hpcrun_safe_enter();
+//    TMSG(CUDA,"blocked async event in CUDA event handler");
+//    {
+//      int i;
+//      for (i = 0; i < nevents; i++)
+//	{
+//	  int metric_id = hpcrun_event2metric(self, i);
+//
+//	  TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
+//	  hpcrun_sample_callpath(&uc, metric_id, (hpcrun_metricVal_t){.i=eventValues[i]}/*metricIncr*/,
+//				 CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/,
+//				 0/*isSync*/, NULL);
+//
+//
+//	  TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
+//	}
+//    }
+//    TMSG(CUDA,"unblocking async event in CUDA event handler");
+//    if (safe) hpcrun_safe_exit();
+//    TMSG(CUDA,"unblocked async event in CUDA event handler");
+//
+//    spinlock_unlock(&cupti_lock);
+//    TMSG(CUPTI,"-REL-lock\n");
+//  }
+//  TMSG(CUDA, "At end (past -EXIT-)");
+//}
 
-static CUpti_SubscriberHandle subscriber;
+//static CUpti_SubscriberHandle subscriber;
 
 //
 // sync setup for cuda/cupti
@@ -286,36 +286,29 @@ papi_c_cupti_setup(void)
   papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
   local.event_set = get_component_event_set(psi, cuda_component_idx);
 
-  Cupti_call(dcuptiSubscribe, &subscriber,
-             (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
-             &local);
-
-  Cupti_call(dcuptiEnableCallback, 1, subscriber,
-             CUPTI_CB_DOMAIN_RUNTIME_API,
-             CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
-
   one_time = true;
   spinlock_unlock(&setup_lock);
   TMSG(CUDA, "CUPTI setup release lock");
+
 }
 
 //
 // Get or create a cupti event set --- but only ONCE per process
 //
-void
-papi_c_cupti_get_event_set(int* ev_s)
+ void
+papi_c_cupti_get_event_set(int* event_set)
 {
   TMSG(CUDA, "Get event set");
   spinlock_lock(&setup_lock);
   TMSG(CUDA, "Cupti lock acquired");
   if (! event_set_created) {
     TMSG(CUDA, "No event set created, so create one");
-    int ret = PAPI_create_eventset(ev_s);
+    int ret = PAPI_create_eventset(event_set);
     if (ret != PAPI_OK) {
       hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
                    ret, PAPI_strerror(ret));
     }
-    local.event_set = *ev_s;
+    local.event_set = *event_set;
     event_set_created = true;
     TMSG(CUDA, "Event set %d created", local.event_set);
   }
@@ -323,21 +316,25 @@ papi_c_cupti_get_event_set(int* ev_s)
   TMSG(CUDA, "Cupti lock released");
 }
 
-int
-papi_c_cupti_add_event(int ev_s, int ev)
+void
+papi_c_cupti_add_event(int event_set, int event)
 {
   int rv = PAPI_OK;
   TMSG(CUDA, "Adding event to cupti event set");
   spinlock_lock(&setup_lock);
   TMSG(CUDA, "Cupti lock acquired");
   if (! event_set_finalized) {
-    TMSG(CUDA, "Really add event %x to cupti event set", ev);
-    rv = PAPI_add_event(local.event_set, ev);
-    TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", ev_s, local.event_set);
+    TMSG(CUDA, "Really add event %x to cupti event set", event);
+    rv = PAPI_add_event(local.event_set, event);
+    if (rv != PAPI_OK) {
+      EMSG("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
+           PAPI_strerror(rv), rv);
+    }
+
+    TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", event_set, local.event_set);
   }
   spinlock_unlock(&setup_lock);
   TMSG(CUDA, "Cupti lock released");
-  return rv;
 }
 
 void
@@ -349,13 +346,47 @@ papi_c_cupti_finalize_event_set(void)
 }
 
 void
-papi_c_cupti_read(int EventSet, long long *values)
+papi_c_cupti_start()
 {
-  int ret = PAPI_read(EventSet, values);
+  spinlock_lock(&setup_lock);
+  TMSG(CUDA, "Cupti lock acquired");
+  int ret = PAPI_start(local.event_set);
+  if (ret != PAPI_OK) {
+    EMSG("PAPI_start of event set %d failed with %s (%d)",
+         local.event_set, PAPI_strerror(ret), ret);
+  }
+  spinlock_unlock(&setup_lock);
+  TMSG(CUDA, "Cupti lock released");
+}
+
+
+void
+papi_c_cupti_read(long long *values)
+{
+  spinlock_lock(&setup_lock);
+  TMSG(CUDA, "Cupti lock acquired");
+  int ret = PAPI_read(local.event_set, values);
   if (ret != PAPI_OK) {
     EMSG("PAPI_read of event set %d failed with %s (%d)",
-         EventSet, PAPI_strerror(ret), ret);
+         local.event_set, PAPI_strerror(ret), ret);
+  }
+  spinlock_unlock(&setup_lock);
+  TMSG(CUDA, "Cupti lock released");
+}
+
+
+void
+papi_c_cupti_stop(long long *values)
+{
+  spinlock_lock(&setup_lock);
+  TMSG(CUDA, "Cupti lock acquired");
+  int ret = PAPI_stop(local.event_set, values);
+  if (ret != PAPI_OK) {
+    EMSG("PAPI_stop of event set %d failed with %s (%d)",
+         local.event_set, PAPI_strerror(ret), ret);
   }
+  spinlock_unlock(&setup_lock);
+  TMSG(CUDA, "Cupti lock released");
 }
 
 
@@ -365,27 +396,28 @@ papi_c_cupti_read(int EventSet, long long *values)
 static void
 papi_c_cupti_teardown(void)
 {
-  static bool one_time = false;
-  spinlock_lock(&setup_lock);
-  if (one_time) return;
-
-  TMSG(CUDA,"sync teardown called (=unsubscribe)");
-  
-  Cupti_call(dcuptiUnsubscribe, subscriber);
-  one_time = true;
-  spinlock_unlock(&setup_lock);
+//  static bool one_time = false;
+//  spinlock_lock(&setup_lock);
+//  if (one_time) return;
+//
+//  TMSG(CUDA,"sync teardown called (=unsubscribe)");
+//
+//  Cupti_call(cuptiUnsubscribe, subscriber);
+//  one_time = true;
+//  spinlock_unlock(&setup_lock);
 }
 
 static sync_info_list_t cuda_component = {
   .pred = is_papi_c_cuda,
-  .get_event_set = NULL, //papi_c_cupti_get_event_set,
-  .add_event = NULL, //papi_c_cupti_add_event,
-  .finalize_event_set = NULL, //papi_c_cupti_finalize_event_set,
-  .sync_setup = NULL, //papi_c_cupti_setup,
-  .sync_teardown = NULL, //papi_c_cupti_teardown,
-  .start = NULL, //papi_c_no_action,
-  .sync_read = NULL,
-  .stop = NULL, //papi_c_no_action,
+  .get_event_set = papi_c_cupti_get_event_set,
+  .add_event = papi_c_cupti_add_event,
+  .finalize_event_set = papi_c_cupti_finalize_event_set,
+  .is_sync = true,
+  .setup = papi_c_cupti_setup,
+  .teardown = papi_c_cupti_teardown,
+  .start = papi_c_cupti_start,
+  .read = papi_c_cupti_read,
+  .stop = papi_c_cupti_stop,
   .process_only = true,
   .next = NULL,
 };
@@ -395,6 +427,6 @@ void
 SS_OBJ_CONSTRUCTOR(papi_c_cupti)(void)
 {
   // fetch actual cuda/cupti functions
-  dlgpu();
+//  dlgpu();
   papi_c_sync_register(&cuda_component);
-}
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
index 81ebf10e24..82a5c756d4 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
@@ -20,47 +20,12 @@ no_action(void)
 {
 }
 
-
-void
-std_get_event_set(int* ev_s)
-{
-  int ret = PAPI_create_eventset(ev_s);
-  TMSG(PAPI,"PAPI_create_eventset = %d, ev_s = %d", ret, *ev_s);
-  if (ret != PAPI_OK) {
-    hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", 
-                 ret, PAPI_strerror(ret));
-  }
-}
-
-
-int
-std_add_event(int ev_s, int ev)
-{
-  return PAPI_add_event(ev_s, ev);
-}
-
-
-int
-std_start(int ev_s)
-{
-  return PAPI_start(ev_s);
-}
-
-
-int
-std_read_event(int ev_s, long long *values)
-{
-  return PAPI_read(ev_s, values);
-}
-
-
-int
-std_stop(int ev_s, long long *values)
+const char *
+component_get_name(int cidx)
 {
-  return PAPI_stop(ev_s, values);
+  return PAPI_get_component_info(cidx)->name;
 }
 
-
 get_event_set_proc_t
 component_get_event_set(int cidx)
 {
@@ -68,9 +33,10 @@ component_get_event_set(int cidx)
   
   TMSG(PAPI, "looking for sync get_event_set for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->get_event_set != NULL) return item->get_event_set;
+    if (item->pred(name)) return item->get_event_set;
   }
-  return std_get_event_set;
+//  hpcrun_abort("Failure: PAPI_create_eventset to not registered component");
+  return NULL;
 }
 
 
@@ -81,9 +47,10 @@ component_add_event_proc(int cidx)
   
   TMSG(PAPI, "looking for sync add_event for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->add_event != NULL) return item->add_event;
+    if (item->pred(name)) return item->add_event;
   }
-  return std_add_event;
+//  hpcrun_abort("Failure: PAPI_add_event to not registered component");
+  return NULL;
 }
 
 finalize_event_set_proc_t
@@ -93,7 +60,7 @@ component_finalize_event_set(int cidx)
   
   TMSG(PAPI, "looking for sync finalize_event_set for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->finalize_event_set != NULL) return item->finalize_event_set;
+    if (item->pred(name)) return item->finalize_event_set;
   }
   return no_action;
 }
@@ -102,13 +69,10 @@ bool
 component_uses_sync_samples(int cidx)
 {
   const char* name = PAPI_get_component_info(cidx)->name;
-  
+
   TMSG(PAPI, "checking component idx %d (name %s) to see if it is synchronous", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) {
-      TMSG(PAPI, "Component %s IS a synchronous component", name);
-      return true;
-    }
+    if (item->pred(name)) return item->is_sync;
   }
   return false;
 }
@@ -120,9 +84,9 @@ sync_setup_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync setup for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->sync_setup != NULL) return item->sync_setup;
+    if (item->pred(name)) return item->setup;
   }
-  return no_action;
+  return NULL;
 }
 
 teardown_proc_t
@@ -132,9 +96,9 @@ sync_teardown_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync teardown for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->sync_teardown != NULL) return item->sync_teardown;
+    if (item->pred(name)) return item->teardown;
   }
-  return no_action;
+  return NULL;
 }
 
 start_proc_t
@@ -144,9 +108,9 @@ sync_start_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->start != NULL) return item->start;
+    if (item->pred(name)) return item->start;
   }
-  return std_start;
+  return NULL;
 }
 
 
@@ -157,9 +121,9 @@ sync_read_for_component(int cidx)
 
   TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->sync_read != NULL) return item->sync_read;
+    if (item->pred(name)) return item->read;
   }
-  return std_read_event;
+  return NULL;
 }
 
 
@@ -170,7 +134,7 @@ sync_stop_for_component(int cidx)
   
   TMSG(PAPI, "looking for sync stop for component idx=%d(%s)", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name) && item->stop != NULL) return item->stop;
+    if (item->pred(name)) return item->stop;
   }
-  return std_stop;
+  return NULL;
 }
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
index 522848b507..3786ff830d 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
@@ -1,14 +1,14 @@
 #ifndef PAPI_C_EXTENDED_INFO_H
 #define PAPI_C_EXTENDED_INFO_H
 
-typedef void (*get_event_set_proc_t)(int* ev_s);
-typedef int (*add_event_proc_t)(int ev_s, int evcode);
+typedef void (*get_event_set_proc_t)(int* event_set);
+typedef void (*add_event_proc_t)(int event_set, int evcode);
 typedef void (*finalize_event_set_proc_t)(void);
 typedef void (*setup_proc_t)(void);
 typedef void (*teardown_proc_t)(void);
-typedef int (*start_proc_t)(int ev_s);
-typedef int (*read_proc_t)(int ev_s, long long *values);
-typedef int (*stop_proc_t)(int ev_s, long long *values);
+typedef void (*start_proc_t)(void);
+typedef void (*read_proc_t)(long long *values);
+typedef void (*stop_proc_t)(long long *values);
 typedef bool (*pred_proc_t)(const char* name);
 
 typedef struct sync_info_list_t {
@@ -16,15 +16,17 @@ typedef struct sync_info_list_t {
   const get_event_set_proc_t get_event_set;
   const add_event_proc_t add_event;
   const finalize_event_set_proc_t finalize_event_set;
-  const setup_proc_t sync_setup;
-  const teardown_proc_t sync_teardown;
+  const bool is_sync;
+  const setup_proc_t setup;
+  const teardown_proc_t teardown;
   const start_proc_t start;
-  const read_proc_t sync_read;
+  const read_proc_t read;
   const stop_proc_t stop;
   const bool process_only;
   struct sync_info_list_t* next;
 } sync_info_list_t;
 
+extern const char* component_get_name(int cidx);
 extern bool component_uses_sync_samples(int cidx);
 extern get_event_set_proc_t component_get_event_set(int cidx);
 extern add_event_proc_t component_add_event_proc(int cidx);
diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
index 8ea95bcb3a..bbd7bde26a 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-rocm.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
@@ -301,19 +301,19 @@ papi_c_cupti_setup(void)
 // Get or create a cupti event set --- but only ONCE per process
 //
 void
-papi_c_cupti_get_event_set(int* ev_s)
+papi_c_cupti_get_event_set(int* event_set)
 {
   TMSG(CUDA, "Get event set");
   spinlock_lock(&setup_lock);
   TMSG(CUDA, "Cupti lock acquired");
   if (! event_set_created) {
     TMSG(CUDA, "No event set created, so create one");
-    int ret = PAPI_create_eventset(ev_s);
+    int ret = PAPI_create_eventset(event_set);
     if (ret != PAPI_OK) {
       hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", 
                    ret, PAPI_strerror(ret));
     }
-    local.event_set = *ev_s;
+    local.event_set = *event_set;
     event_set_created = true;
     TMSG(CUDA, "Event set %d created", local.event_set);
   }
@@ -322,7 +322,7 @@ papi_c_cupti_get_event_set(int* ev_s)
 }
 
 int
-papi_c_cupti_add_event(int ev_s, int ev)
+papi_c_cupti_add_event(int event_set, int ev)
 {
   int rv = PAPI_OK;
   TMSG(CUDA, "Adding event to cupti event set");
@@ -331,7 +331,7 @@ papi_c_cupti_add_event(int ev_s, int ev)
   if (! event_set_finalized) {
     TMSG(CUDA, "Really add event %x to cupti event set", ev);
     rv = PAPI_add_event(local.event_set, ev);
-    TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", ev_s, local.event_set);
+    TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", event_set, local.event_set);
   }
   spinlock_unlock(&setup_lock);
   TMSG(CUDA, "Cupti lock released");
@@ -369,8 +369,8 @@ static sync_info_list_t cuda_component = {
   .get_event_set = papi_c_cupti_get_event_set,
   .add_event = papi_c_cupti_add_event,
   .finalize_event_set = papi_c_cupti_finalize_event_set,
-  .sync_setup = papi_c_cupti_setup,
-  .sync_teardown = papi_c_cupti_teardown,
+  .setup = papi_c_cupti_setup,
+  .teardown = papi_c_cupti_teardown,
   .start = papi_c_no_action,
   .stop = papi_c_no_action,
   .process_only = true,
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 8038079240..7ba21d6f68 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -110,6 +110,7 @@
 #define DEBUG 1
 
 #include <hpcrun/gpu/gpu-print.h>
+#include <gpu-monitors.h>
 
 #define OVERFLOW_MODE 0
 #define WEIGHT_METRIC 0
@@ -120,6 +121,9 @@
  * forward declarations 
  *****************************************************************************/
 static void papi_event_handler(int event_set, void *pc, long long ovec, void *context);
+static void papi_monitor_enter(void *reg_info, gpu_monitor_apply_t *args_in);
+static void papi_monitor_exit(void *reg_info, gpu_monitor_apply_t *args_in);
+
 static int  event_is_derived(int ev_code);
 static void event_fatal_error(int ev_code, int papi_ret);
 
@@ -127,6 +131,10 @@ static void event_fatal_error(int ev_code, int papi_ret);
  * local variables
  *****************************************************************************/
 
+// Support for derived events (proxy sampling).
+static int derived[MAX_EVENTS];
+static int some_overflow;
+
 
 // Special case to make PAPI_library_init() a soft failure.
 // Make sure that we call no other PAPI functions.
@@ -141,22 +149,10 @@ static bool disable_papi_cuda = false;
 
 static kind_info_t *papi_kind;
 
-// gpu monitor
-static __thread gpu_monitor_fn_entry_t gpu_monitor_enter;
-static __thread gpu_monitor_fn_entry_t gpu_monitor_exit;
-
-typedef struct papi_mon_comp_t{
-  struct papi_mon_comp_t *next;
-  int idx;
-}papi_mon_comp_t;
-
-static __thread papi_mon_comp_t *papi_mon_comp_list = NULL;
 
 /******************************************************************************
  * private operations 
  *****************************************************************************/
-static void papi_monitor_enter(void *reg_info, void *args_in);
-static void papi_monitor_exit(void *reg_info, void *args_in);
 
 static bool
 is_gpu_component(int cidx)
@@ -169,22 +165,6 @@ is_gpu_component(int cidx)
 }
 
 
-static void
-papi_add_mon_comp(int cidx)
-{
-  if (is_gpu_component(cidx)){
-    papi_mon_comp_t *new_comp = hpcrun_malloc(sizeof(papi_mon_comp_t));
-    new_comp->next = papi_mon_comp_list;
-    new_comp->idx = cidx;
-
-    papi_mon_comp_list = new_comp;
-  }
-}
-
-
-static void
-gpu_metrics_attribute_papi(int metric_id, cct_node_t *cct_node, long long value);
-
 static int
 get_event_index(sample_source_t *self, int event_code)
 {
@@ -203,9 +183,9 @@ get_event_index(sample_source_t *self, int event_code)
 int
 get_component_event_set(papi_source_info_t* psi, int cidx)
 {
-   if (cidx < 0 || cidx >= psi->num_components) {
+  if (cidx < 0 || cidx >= psi->num_components) {
     hpcrun_abort("PAPI component index out of range [0,%d]: %d", psi->num_components, cidx);
-   }
+  }
 
    papi_component_info_t* ci = &(psi->component_info[cidx]);
 
@@ -219,14 +199,107 @@ get_component_event_set(papi_source_info_t* psi, int cidx)
 //
 // add an event to a component's event set
 //
-int
-component_add_event(papi_source_info_t* psi, int cidx, int evcode)
+void
+component_add_event(papi_source_info_t* psi, int evcode)
 {
+  int cidx = PAPI_get_event_component(evcode);
   int event_set = get_component_event_set(psi, cidx);
+
   papi_component_info_t* ci = &(psi->component_info[cidx]);
-  return ci->add_event(event_set, evcode);
+  ci->add_event(event_set, evcode);
+  ci->some_derived |= event_is_derived(evcode);
+
+  TMSG(PAPI, "Added event code %x to component %d", evcode, cidx);
+  {
+    char buffer[PAPI_MAX_STR_LEN];
+    PAPI_event_code_to_name(evcode, buffer);
+    TMSG(PAPI,
+         "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d",
+    /* eventSet, */ evcode, buffer, cidx);
+  }
+}
+
+
+static void
+papi_register_events(papi_source_info_t *psi, evlist_t *evl)
+{
+  int i;
+  int nevents = evl->nevents;
+
+  // add events to new event_sets
+  for (i = 0; i < nevents; i++) {
+    int evcode = evl->events[i].event;
+    component_add_event(psi, evcode);
+
+  }
+
+  // finalize component event sets
+  for (i = 0; i < psi->num_components; i++) {
+    papi_component_info_t *ci = &(psi->component_info[i]);
+    ci->finalize_event_set();
+  }
 }
 
+
+static void
+papi_register_sync_callback(papi_component_info_t *ci)
+{
+  gpu_monitor_node_t node;
+  node.component = ci;
+  node.enter_fn = papi_monitor_enter;
+  node.exit_fn = papi_monitor_exit;
+  gpu_monitor_register(node);
+}
+
+
+static void
+papi_register_overflow_callback(int eventSet, int evcode, long thresh)
+{
+  int ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE,
+                          papi_event_handler);
+  TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d",
+       eventSet, evcode, thresh, ret);
+  if (ret != PAPI_OK) {
+    EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
+         PAPI_strerror(ret), ret);
+    event_fatal_error(evcode, ret);
+  }
+}
+
+
+static void
+papi_register_callbacks(papi_source_info_t *psi, evlist_t *evl)
+{
+  int i;
+  // set up overflow handling for asynchronous event sets for active components
+  // set up synchronous handling for synchronous event sets for active compoents
+  for (i = 0; i < evl->nevents; i++) {
+    int evcode = evl->events[i].event;
+    long thresh = evl->events[i].thresh;
+    int cidx = PAPI_get_event_component(evcode);
+
+    int eventSet = get_component_event_set(psi, cidx);
+    papi_component_info_t *ci = &(psi->component_info[cidx]);
+
+    // **** No overflow for synchronous events ****
+    if (ci->is_sync) {
+      TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx);
+      TMSG(PAPI, "Set up papi_monitor_apply instead");
+      TMSG(PAPI, "synchronous sample component index = %d", cidx);
+
+      papi_register_sync_callback(ci);
+    }
+    else{
+      // ***** Only set overflow if NOT derived event *****
+      if (! derived[i]) {
+        papi_register_overflow_callback(eventSet, evcode, thresh);
+      }
+    }
+  }
+
+}
+
+
 static bool
 thread_count_scaling_for_component(int cidx)
 {
@@ -236,14 +309,6 @@ thread_count_scaling_for_component(int cidx)
 }
 
 
-/******************************************************************************
- * sample source registration
- *****************************************************************************/
-
-// Support for derived events (proxy sampling).
-static int derived[MAX_EVENTS];
-static int some_overflow;
-
 /******************************************************************************
  * method functions
  *****************************************************************************/
@@ -391,7 +456,7 @@ METHOD_FN(start)
     if (ci->inUse) {
       if (component_uses_sync_samples(cidx)) {
   TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx);
-  ci->start(ci->eventSet);
+  ci->start();
       }
       else {
   TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx);
@@ -678,7 +743,6 @@ METHOD_FN(gen_event_set, int lush_metrics)
   tool_enter();
   thread_data_t *td = hpcrun_get_thread_data();
   int i;
-  int ret;
 
   TMSG(PAPI, "generating all event sets for all components");
   if (papi_unavail) { goto finish; }
@@ -697,6 +761,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
   psi->num_components = num_components;
   for (i = 0; i < num_components; i++) {
     papi_component_info_t *ci = &(psi->component_info[i]);
+    ci->name = component_get_name(i);
     ci->inUse = false;
     ci->eventSet = PAPI_NULL;
     ci->state = INIT;
@@ -706,88 +771,20 @@ METHOD_FN(gen_event_set, int lush_metrics)
     ci->finalize_event_set = component_finalize_event_set(i);
     ci->scale_by_thread_count = thread_count_scaling_for_component(i);
     ci->is_sync = component_uses_sync_samples(i);
-    ci->sync_setup = sync_setup_for_component(i);
-    ci->sync_teardown = sync_teardown_for_component(i);
+    ci->setup = sync_setup_for_component(i);
+    ci->teardown = sync_teardown_for_component(i);
     ci->start = sync_start_for_component(i);
-    ci->sync_read = sync_read_for_component(i);
+    ci->read = sync_read_for_component(i);
     ci->stop = sync_stop_for_component(i);
     memset(ci->prev_values, 0, sizeof(ci->prev_values));
-
-    papi_add_mon_comp(i);
   }
 
   // record the component state in thread state
   td->ss_info[self->sel_idx].ptr = psi;
 
-  int nevents = (self->evl).nevents;
-  for (i = 0; i < nevents; i++) {
-    int evcode = self->evl.events[i].event;
-    int cidx = PAPI_get_event_component(evcode);
+  papi_register_events(psi, &self->evl);
 
-    ret = component_add_event(psi, cidx, evcode);
-    psi->component_info[cidx].some_derived |= event_is_derived(evcode);
-    TMSG(PAPI, "Added event code %x to component %d", evcode, cidx);
-    {
-      char buffer[PAPI_MAX_STR_LEN];
-      PAPI_event_code_to_name(evcode, buffer);
-      TMSG(PAPI,
-     "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d",
-     /* eventSet, */ evcode, buffer, cidx);
-    }
-    if (ret != PAPI_OK) {
-      EMSG("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
-     PAPI_strerror(ret), ret);
-      event_fatal_error(evcode, ret);
-    }
-  }
-
-  // finalize component event sets
-    for (i = 0; i < num_components; i++) {
-      papi_component_info_t *ci = &(psi->component_info[i]);
-      ci->finalize_event_set();
-    }
-
-  // set up overflow handling for asynchronous event sets for active components
-  // set up synchronous handling for synchronous event sets for active compoents
-  for (i = 0; i < nevents; i++) {
-    int evcode = self->evl.events[i].event;
-    long thresh = self->evl.events[i].thresh;
-    int cidx = PAPI_get_event_component(evcode);
-    int eventSet = get_component_event_set(psi, cidx);
-
-    // **** No overflow for synchronous events ****
-    // **** Use component-specific setup for synchronous events ****
-    if (component_uses_sync_samples(cidx)) {
-      TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx);
-      TMSG(PAPI, "Set up sync handler instead");
-      TMSG(PAPI, "synchronous sample component index = %d", cidx);
-      sync_setup_for_component(cidx)();
-      continue;
-    }
-    // ***** Only set overflow if NOT derived event *****
-    if (! derived[i]) {
-      ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE,
-        papi_event_handler);
-      TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d",
-     eventSet, evcode, thresh, ret);
-      if (ret != PAPI_OK) {
-  EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
-       PAPI_strerror(ret), ret);
-  event_fatal_error(evcode, ret);
-      }
-    }
-  }
-
-  /// Register papi handler callbacks
-  gpu_monitor_enter.reg_info = psi;
-  gpu_monitor_enter.fn = papi_monitor_enter;
-  gpu_monitor_register(gpu_monitor_type_enter,
-                            &gpu_monitor_enter);
-
-  gpu_monitor_exit.reg_info = psi;
-  gpu_monitor_exit.fn = papi_monitor_exit;
-  gpu_monitor_register(gpu_monitor_type_exit,
-                            &gpu_monitor_exit);
+  papi_register_callbacks(psi, &self->evl);
 
 finish:
   tool_exit();
@@ -1097,14 +1094,29 @@ papi_insert_cct(cct_node_t *api_node){
 }
 
 static void
-papi_monitor_enter(void *reg_info, void *args_in)
+gpu_metrics_attribute_papi
+(
+int metric_id,
+cct_node_t *cct_node,
+long long value
+)
+{
+  metric_data_list_t* metrics = hpcrun_reify_metric_set(cct_node, metric_id);
+
+  hpcrun_metric_std_inc(metric_id,
+                        metrics,
+                        (cct_metric_data_t) {.i = value});
+}
+
+
+static void
+papi_monitor_enter(void *component, gpu_monitor_apply_t *args_in)
 {
   tool_enter();
-  papi_source_info_t *psi = (papi_source_info_t *) reg_info;
-  gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
+  papi_component_info_t *ci = (papi_component_info_t *) component;
+  gpu_monitor_apply_t *args = (gpu_monitor_apply_t *) args_in;
 
   sample_source_t *self = &obj_name(); /// just for debug
-  int ret;
 
 //  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
 
@@ -1115,23 +1127,13 @@ papi_monitor_enter(void *reg_info, void *args_in)
 
   // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
 
-  for ( papi_mon_comp_t *it = papi_mon_comp_list; it != NULL; it = it->next) {
-    int cid = it->idx;
-    papi_component_info_t *ci = &(psi->component_info[cid]);
+  if (ci->inUse) {
+//    if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
+//      args->gpu_sync_ptr();
 
-    if (ci->inUse) {
-      if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
-        args->gpu_sync_ptr();
-
-      PRINT("Self = %p | Component %d \t | cct = %p \n\n", self, cid, args->cct_node );
-      ret = ci->sync_read(ci->eventSet, prev_values);
-      if (ret != PAPI_OK) {
-        EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
-             ci->eventSet, cid, PAPI_strerror(ret), ret);
-      }
-
-      PRINT("ENTER_read Event = %d, value = %lld \n", ci->eventSet, prev_values[0]);
-    }
+    PRINT("Self = %p | Component %s \t | cct = %p \n\n", self, ci->name, args->cct_node );
+    ci->read(prev_values);
+    PRINT("ENTER_read Event = %d, value = %lld \n", ci->eventSet, prev_values[0]);
   }
 
 finish:
@@ -1140,11 +1142,11 @@ papi_monitor_enter(void *reg_info, void *args_in)
 
 
 static void
-papi_monitor_exit(void *reg_info, void *args_in)
+papi_monitor_exit(void *component, gpu_monitor_apply_t *args_in)
 {
   tool_enter();
-  papi_source_info_t *psi = (papi_source_info_t *) reg_info;
-  gpu_monitors_apply_t *args = (gpu_monitors_apply_t *) args_in;
+  papi_component_info_t *ci = (papi_component_info_t *) component;
+  gpu_monitor_apply_t *args = (gpu_monitor_apply_t *) args_in;
 
   sample_source_t *self = &obj_name(); /// just for debug
   int my_event_codes[MAX_EVENTS];
@@ -1157,62 +1159,38 @@ papi_monitor_exit(void *reg_info, void *args_in)
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-  // Collect counters for components in use
-  for ( papi_mon_comp_t *it = papi_mon_comp_list; it != NULL; it = it->next) {
-    int cid = it->idx;
-
-    papi_component_info_t *ci = &(psi->component_info[cid]);
-    if (ci->inUse){
+  if (ci->inUse){
 
-      if (args->gpu_sync_ptr)
-        args->gpu_sync_ptr();
+//    if (args->gpu_sync_ptr)
+//      args->gpu_sync_ptr();
 
-      ret = ci->sync_read(ci->eventSet, my_event_values);
-      if (ret != PAPI_OK) {
-        EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
-             ci->eventSet, cid, PAPI_strerror(ret), ret);
-      }
+    ci->read(my_event_values);
 
-      // Attribute collected metric to cct nodes
-      ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
-      if (ret != PAPI_OK) {
-        hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
-                     "Return code = %d ==> %s", ret, PAPI_strerror(ret));
-      }
 
-      for (int eid = 0; eid < my_event_count; ++eid) {
-        int event_index = get_event_index(self, my_event_codes[eid]);
-        int metric_id = hpcrun_event2metric(self, event_index);
+    // Attribute collected metric to cct nodes
+    ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
+    if (ret != PAPI_OK) {
+      hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
+                   "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+    }
 
-        PRINT("%d Event = %x, event_index = %d, metric_id = %d || value = %lld ---> %lld\n",
-               eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
+    for (int eid = 0; eid < my_event_count; ++eid) {
+      int event_index = get_event_index(self, my_event_codes[eid]);
+      int metric_id = hpcrun_event2metric(self, event_index);
 
-        blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
+      PRINT("%d Event = %x, event_index = %d, metric_id = %d || value = %lld ---> %lld\n",
+             eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
 
+      blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
 
-        gpu_metrics_attribute_papi(metric_id, cct_node, my_event_values[eid]);
-      }
 
+      gpu_metrics_attribute_papi(metric_id, cct_node, my_event_values[eid]);
     }
+
   }
+
   cct_node = NULL;
 
 finish:
   tool_exit();
 }
-
-
-static void
-gpu_metrics_attribute_papi
-(
- int metric_id,
- cct_node_t *cct_node,
- long long value
-)
-{
-  metric_data_list_t* metrics = hpcrun_reify_metric_set(cct_node, metric_id);
-  
-  hpcrun_metric_std_inc(metric_id,
-                        metrics,
-                        (cct_metric_data_t) {.i = value});
-}
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index cccc1c819e..ae289c26bb 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -60,6 +60,7 @@
  *****************************************************************************/
 
 typedef struct {
+  const char *name;
   bool inUse;
   int eventSet;
   source_state_t state;
@@ -72,10 +73,10 @@ typedef struct {
   add_event_proc_t add_event;
   finalize_event_set_proc_t finalize_event_set;
   start_proc_t start;
-  read_proc_t sync_read;
+  read_proc_t read;
   stop_proc_t stop;
-  setup_proc_t sync_setup;
-  teardown_proc_t sync_teardown;
+  setup_proc_t setup;
+  teardown_proc_t teardown;
 } papi_component_info_t;
 
 

From c7d7ef40ad8a803fc41baf76c2d113df1eabeade Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 24 Nov 2020 10:31:44 -0600
Subject: [PATCH 120/177] checkpoint 3

---
 src/tool/hpcrun/gpu-monitors.h                |  2 +-
 src/tool/hpcrun/main.c                        |  8 ++-
 src/tool/hpcrun/sample-sources/papi-c-cupti.c |  8 +--
 src/tool/hpcrun/sample-sources/papi-c-rocm.c  |  2 +-
 src/tool/hpcrun/sample-sources/papi-c.c       | 57 +++++++++++--------
 src/tool/hpcrun/sample-sources/papi-c.h       |  2 +-
 6 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
index 2ed8ce3974..e16ff8e08a 100644
--- a/src/tool/hpcrun/gpu-monitors.h
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -20,7 +20,7 @@ typedef struct gpu_monitor_apply_t {
 } gpu_monitor_apply_t;
 
 
-typedef void (*gpu_monitor_fn_t)(void* component, gpu_monitor_apply_t* args_in);
+typedef void (*gpu_monitor_fn_t)(const void* component, gpu_monitor_apply_t* args_in);
 
 typedef struct gpu_monitor_node_t {
 	struct gpu_monitor_node_t * next;
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 060e696761..c835eb539d 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -756,12 +756,14 @@ hpcrun_thread_init(int id, local_thread_data_t* local_thread_data, bool has_trac
 
   epoch_t* epoch = TD_GET(core_profile_trace_data.epoch);
 
-  if (! hpcrun_thread_suppress_sample) {
-    // handle event sets for sample sources
-    SAMPLE_SOURCES(gen_event_set,lush_metrics);
+  if (! hpcrun_thread_suppress_sample ) { //TODO Dejan: has_trace instead?
+//TODO Dejan: thread_init_action must go before gen-event-set
     // sample sources take thread specific action prior to start (often is a 'registration' action);
     SAMPLE_SOURCES(thread_init_action);
 
+    // handle event sets for sample sources
+    SAMPLE_SOURCES(gen_event_set,lush_metrics);
+
     // start the sample sources
     SAMPLE_SOURCES(start);
 
diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
index bc2018880a..5ff3874cfe 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
@@ -83,10 +83,10 @@ typedef struct {
   sample_source_t* self;
 } papi_cuda_data_t;
 
-static bool event_set_created = false;
-static bool event_set_finalized = false;
+static __thread bool event_set_created = false;
+static __thread bool event_set_finalized = false;
 
-static papi_cuda_data_t local = {};
+static __thread papi_cuda_data_t local = {};
 
 static spinlock_t cupti_lock = SPINLOCK_UNLOCKED;
 static spinlock_t setup_lock = SPINLOCK_UNLOCKED;
@@ -284,7 +284,7 @@ papi_c_cupti_setup(void)
   }
 
   papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
-  local.event_set = get_component_event_set(psi, cuda_component_idx);
+  local.event_set = get_component_event_set( &(psi->component_info[cuda_component_idx]) );
 
   one_time = true;
   spinlock_unlock(&setup_lock);
diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
index bbd7bde26a..d2621ed7f6 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-rocm.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
@@ -282,7 +282,7 @@ papi_c_cupti_setup(void)
   }
 
   papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
-  local.event_set = get_component_event_set(psi, cuda_component_idx);
+  local.event_set = get_component_event_set( &(psi->component_info[cuda_component_idx]) );
 
   Cupti_call(dcuptiSubscribe, &subscriber,
              (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback, 
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 7ba21d6f68..c15094014a 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -121,8 +121,8 @@
  * forward declarations 
  *****************************************************************************/
 static void papi_event_handler(int event_set, void *pc, long long ovec, void *context);
-static void papi_monitor_enter(void *reg_info, gpu_monitor_apply_t *args_in);
-static void papi_monitor_exit(void *reg_info, gpu_monitor_apply_t *args_in);
+static void papi_monitor_enter(const void *reg_info, gpu_monitor_apply_t *args_in);
+static void papi_monitor_exit(const void *reg_info, gpu_monitor_apply_t *args_in);
 
 static int  event_is_derived(int ev_code);
 static void event_fatal_error(int ev_code, int papi_ret);
@@ -177,18 +177,24 @@ get_event_index(sample_source_t *self, int event_code)
   assert(0);
 }
 
-//
-// fetch a given component's event set. Create one if need be
-//
-int
-get_component_event_set(papi_source_info_t* psi, int cidx)
+
+static int
+evcode_to_component_id(papi_source_info_t* psi, int evcode)
 {
+  int cidx = PAPI_get_event_component(evcode);
   if (cidx < 0 || cidx >= psi->num_components) {
     hpcrun_abort("PAPI component index out of range [0,%d]: %d", psi->num_components, cidx);
   }
+  return cidx;
+}
 
-   papi_component_info_t* ci = &(psi->component_info[cidx]);
 
+//
+// fetch a given component's event set. Create one if need be
+//
+int
+get_component_event_set(papi_component_info_t* ci)
+{
    if (!ci->inUse) {
      ci->get_event_set(&(ci->eventSet));
      ci->inUse = true;
@@ -196,16 +202,17 @@ get_component_event_set(papi_source_info_t* psi, int cidx)
   return ci->eventSet;
 }
 
+
 //
 // add an event to a component's event set
 //
 void
 component_add_event(papi_source_info_t* psi, int evcode)
 {
-  int cidx = PAPI_get_event_component(evcode);
-  int event_set = get_component_event_set(psi, cidx);
-
+  int cidx = evcode_to_component_id(psi, evcode);
   papi_component_info_t* ci = &(psi->component_info[cidx]);
+  int event_set = get_component_event_set(ci);
+
   ci->add_event(event_set, evcode);
   ci->some_derived |= event_is_derived(evcode);
 
@@ -221,14 +228,14 @@ component_add_event(papi_source_info_t* psi, int evcode)
 
 
 static void
-papi_register_events(papi_source_info_t *psi, evlist_t *evl)
+papi_register_events(papi_source_info_t *psi, evlist_t evl)
 {
   int i;
-  int nevents = evl->nevents;
+  int nevents = evl.nevents;
 
   // add events to new event_sets
   for (i = 0; i < nevents; i++) {
-    int evcode = evl->events[i].event;
+    int evcode = evl.events[i].event;
     component_add_event(psi, evcode);
 
   }
@@ -268,19 +275,19 @@ papi_register_overflow_callback(int eventSet, int evcode, long thresh)
 
 
 static void
-papi_register_callbacks(papi_source_info_t *psi, evlist_t *evl)
+papi_register_callbacks(papi_source_info_t *psi, evlist_t evl)
 {
   int i;
   // set up overflow handling for asynchronous event sets for active components
   // set up synchronous handling for synchronous event sets for active compoents
-  for (i = 0; i < evl->nevents; i++) {
-    int evcode = evl->events[i].event;
-    long thresh = evl->events[i].thresh;
-    int cidx = PAPI_get_event_component(evcode);
-
-    int eventSet = get_component_event_set(psi, cidx);
+  for (i = 0; i < evl.nevents; i++) {
+    int evcode = evl.events[i].event;
+    long thresh = evl.events[i].thresh;
+    int cidx = evcode_to_component_id(psi, evcode);
     papi_component_info_t *ci = &(psi->component_info[cidx]);
 
+    int eventSet = get_component_event_set(ci);
+
     // **** No overflow for synchronous events ****
     if (ci->is_sync) {
       TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx);
@@ -782,9 +789,9 @@ METHOD_FN(gen_event_set, int lush_metrics)
   // record the component state in thread state
   td->ss_info[self->sel_idx].ptr = psi;
 
-  papi_register_events(psi, &self->evl);
+  papi_register_events(psi, self->evl);
 
-  papi_register_callbacks(psi, &self->evl);
+  papi_register_callbacks(psi, self->evl);
 
 finish:
   tool_exit();
@@ -1110,7 +1117,7 @@ long long value
 
 
 static void
-papi_monitor_enter(void *component, gpu_monitor_apply_t *args_in)
+papi_monitor_enter(const void *component, gpu_monitor_apply_t *args_in)
 {
   tool_enter();
   papi_component_info_t *ci = (papi_component_info_t *) component;
@@ -1142,7 +1149,7 @@ papi_monitor_enter(void *component, gpu_monitor_apply_t *args_in)
 
 
 static void
-papi_monitor_exit(void *component, gpu_monitor_apply_t *args_in)
+papi_monitor_exit(const void *component, gpu_monitor_apply_t *args_in)
 {
   tool_enter();
   papi_component_info_t *ci = (papi_component_info_t *) component;
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index ae289c26bb..e740e18b5a 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -90,6 +90,6 @@ typedef struct {
  * external declarations 
  *****************************************************************************/
 
-extern int get_component_event_set(papi_source_info_t *psi, int cidx);
+extern int get_component_event_set(papi_component_info_t* ci);
 
 #endif // PAPI_C_H

From 33eb3516ffa176583c879ef2143eb37cb2035049 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 24 Nov 2020 14:09:51 -0600
Subject: [PATCH 121/177] Workable version

---
 src/tool/hpcrun/gpu-monitors.c                |   8 +-
 src/tool/hpcrun/gpu-monitors.h                |  16 +-
 src/tool/hpcrun/gpu/amd/roctracer-api.c       |   4 +-
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        |   8 +-
 src/tool/hpcrun/sample-sources/papi-c-cupti.c |   8 +
 src/tool/hpcrun/sample-sources/papi-c.c       | 170 ++++++++----------
 src/tool/hpcrun/sample-sources/papi-c.h       |   2 +-
 7 files changed, 97 insertions(+), 119 deletions(-)

diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
index df48579e6f..b340bcd970 100644
--- a/src/tool/hpcrun/gpu-monitors.c
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -12,7 +12,7 @@ void
 gpu_monitor_register(	gpu_monitor_node_t node)
 {
   gpu_monitor_node_t* new_node = hpcrun_malloc(sizeof(gpu_monitor_node_t));
-  new_node->component = node.component;
+  new_node->ci = node.ci;
   new_node->enter_fn = node.enter_fn;
   new_node->exit_fn = node.exit_fn;
   new_node->next = gpu_monitor_list;
@@ -21,15 +21,15 @@ gpu_monitor_register(	gpu_monitor_node_t node)
 
 
 void
-gpu_monitors_apply(gpu_monitor_apply_t *args_in, gpu_monitor_type_t type)
+gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type)
 {
   gpu_monitor_node_t *node = gpu_monitor_list;
 
 	while (node != NULL) {
     if (type == gpu_monitor_type_enter)
-      node->enter_fn(node->component, args_in);
+      node->enter_fn(node->ci, cct_node);
     else if (type == gpu_monitor_type_exit)
-      node->exit_fn(node->component, args_in);
+      node->exit_fn(node->ci, cct_node);
 
 		node = node->next;
 	}
diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
index e16ff8e08a..6753a29c0f 100644
--- a/src/tool/hpcrun/gpu-monitors.h
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -5,7 +5,9 @@
 #ifndef HPCTOOLKIT_GPU_MONITORS_H
 #define HPCTOOLKIT_GPU_MONITORS_H
 
-#include "cct.h"
+#include <cct.h>
+#include <sample-sources/papi-c.h>
+
 
 
 typedef enum {
@@ -14,24 +16,18 @@ typedef enum {
 } gpu_monitor_type_t;
 
 
-typedef struct gpu_monitor_apply_t {
-	cct_node_t *cct_node;
-  const char *name;
-} gpu_monitor_apply_t;
-
-
-typedef void (*gpu_monitor_fn_t)(const void* component, gpu_monitor_apply_t* args_in);
+typedef void (*gpu_monitor_fn_t)(papi_component_info_t *ci, const cct_node_t *cct_node);
 
 typedef struct gpu_monitor_node_t {
 	struct gpu_monitor_node_t * next;
-	void *component;
+  papi_component_info_t *ci;
 	gpu_monitor_fn_t enter_fn;
   gpu_monitor_fn_t exit_fn;
 } gpu_monitor_node_t;
 
 
 extern void gpu_monitor_register(gpu_monitor_node_t node);
-extern void gpu_monitors_apply(gpu_monitor_apply_t *args, gpu_monitor_type_t type);
+extern void gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type);
 
 
 #endif //HPCTOOLKIT_GPU_MONITORS_H
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index f273ccfc9c..5ca8be7ceb 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -451,13 +451,13 @@ roctracer_subscriber_callback
 
     PRINT("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p \n", api_node);
     int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
-    gpu_monitors_apply(&(gpu_monitor_apply_t) {.cct_node=api_node, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_enter);
+    gpu_monitors_apply(api_node, gpu_monitor_type_enter);
 
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
     PRINT("\nACTIVITY_API_PHASE_EXIT -----------------| \n");
     int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
-    gpu_monitors_apply(&(gpu_monitor_apply_t) {.cct_node=NULL, .gpu_sync_ptr=hip_gpu_sync_ptr}, gpu_monitor_type_exit);
+    gpu_monitors_apply(NULL, gpu_monitor_type_exit);
 
   }else{
     ;
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index a54e83ad75..294a683ee0 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -1019,7 +1019,7 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=api_node, .name="cuda"}, gpu_monitor_type_enter);
+				gpu_monitors_apply( api_node, gpu_monitor_type_enter);
 
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
@@ -1031,7 +1031,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=NULL, .name="cuda"}, gpu_monitor_type_exit);
+				gpu_monitors_apply( NULL, gpu_monitor_type_exit);
 
 
 			}
@@ -1182,7 +1182,7 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=cupti_kernel_ph, .name="cuda"}, gpu_monitor_type_enter);
+				gpu_monitors_apply( cupti_kernel_ph, gpu_monitor_type_enter);
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
@@ -1196,7 +1196,7 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Runtime pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				gpu_monitors_apply( &(gpu_monitor_apply_t){.cct_node=cupti_kernel_ph, .name="cuda"}, gpu_monitor_type_exit);
+				gpu_monitors_apply( cupti_kernel_ph, gpu_monitor_type_exit);
 
         cupti_kernel_ph = NULL;
         cupti_trace_ph = NULL;
diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
index 5ff3874cfe..6a24049d67 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
@@ -285,6 +285,14 @@ papi_c_cupti_setup(void)
 
   papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
   local.event_set = get_component_event_set( &(psi->component_info[cuda_component_idx]) );
+//TODO Dejan: Can I delete these hpcrun_cuda_kernel_callback callback?
+//  Cupti_call(dcuptiSubscribe, &subscriber,
+//             (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
+//             &local);
+//
+//  Cupti_call(dcuptiEnableCallback, 1, subscriber,
+//             CUPTI_CB_DOMAIN_RUNTIME_API,
+//             CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
 
   one_time = true;
   spinlock_unlock(&setup_lock);
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index c15094014a..b338078747 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -121,8 +121,8 @@
  * forward declarations 
  *****************************************************************************/
 static void papi_event_handler(int event_set, void *pc, long long ovec, void *context);
-static void papi_monitor_enter(const void *reg_info, gpu_monitor_apply_t *args_in);
-static void papi_monitor_exit(const void *reg_info, gpu_monitor_apply_t *args_in);
+static void papi_monitor_enter(papi_component_info_t *ci,  const cct_node_t *cct_node);
+static void papi_monitor_exit(papi_component_info_t *ci,  const cct_node_t *cct_node);
 
 static int  event_is_derived(int ev_code);
 static void event_fatal_error(int ev_code, int papi_ret);
@@ -154,16 +154,6 @@ static kind_info_t *papi_kind;
  * private operations 
  *****************************************************************************/
 
-static bool
-is_gpu_component(int cidx)
-{
-  const char* name = PAPI_get_component_info(cidx)->name;
-  if(strstr(name, "cuda") == name || strstr(name, "rocm")==name) {
-    return true;
-  }
-  return false;
-}
-
 
 static int
 get_event_index(sample_source_t *self, int event_code)
@@ -252,7 +242,7 @@ static void
 papi_register_sync_callback(papi_component_info_t *ci)
 {
   gpu_monitor_node_t node;
-  node.component = ci;
+  node.ci = ci;
   node.enter_fn = papi_monitor_enter;
   node.exit_fn = papi_monitor_exit;
   gpu_monitor_register(node);
@@ -262,10 +252,10 @@ papi_register_sync_callback(papi_component_info_t *ci)
 static void
 papi_register_overflow_callback(int eventSet, int evcode, long thresh)
 {
-  int ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE,
-                          papi_event_handler);
-  TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d",
-       eventSet, evcode, thresh, ret);
+  TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) register",
+       eventSet, evcode, thresh);
+
+  int ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE, papi_event_handler);
   if (ret != PAPI_OK) {
     EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
          PAPI_strerror(ret), ret);
@@ -281,11 +271,11 @@ papi_register_callbacks(papi_source_info_t *psi, evlist_t evl)
   // set up overflow handling for asynchronous event sets for active components
   // set up synchronous handling for synchronous event sets for active compoents
   for (i = 0; i < evl.nevents; i++) {
+
     int evcode = evl.events[i].event;
     long thresh = evl.events[i].thresh;
     int cidx = evcode_to_component_id(psi, evcode);
     papi_component_info_t *ci = &(psi->component_info[cidx]);
-
     int eventSet = get_component_event_set(ci);
 
     // **** No overflow for synchronous events ****
@@ -297,8 +287,7 @@ papi_register_callbacks(papi_source_info_t *psi, evlist_t evl)
       papi_register_sync_callback(ci);
     }
     else{
-      // ***** Only set overflow if NOT derived event *****
-      if (! derived[i]) {
+      if (! derived[i]) { // ***** Only set overflow if NOT derived event *****
         papi_register_overflow_callback(eventSet, evcode, thresh);
       }
     }
@@ -337,7 +326,6 @@ strip_papi_prefix(const char *str)
   return str;
 }
 
-static atomic_uint stop_papi_flag = { 0 };
 
 static void
 METHOD_FN(init)
@@ -357,10 +345,10 @@ METHOD_FN(init)
     if (cidx) {
       int res = PAPI_disable_component(cidx);
       if (res == PAPI_OK) {
-  TMSG(PAPI, "PAPI cuda component disabled");
+        TMSG(PAPI, "PAPI cuda component disabled");
       }
       else {
-  EMSG("*** PAPI cuda component could not be disabled!!!");
+        EMSG("*** PAPI cuda component could not be disabled!!!");
       }
     }
   }
@@ -547,16 +535,12 @@ METHOD_FN(stop)
   long_long values[nevents+2];
   //  long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
 
-
-  if(atomic_fetch_add(&stop_papi_flag, 1) == 0) {
-    //TODO: PAPI_stop is called from monitor_fini_thread and monitor_fini_process -> PAPI_error
-
     int ret = PAPI_stop(ci->eventSet, values);
     if (ret != PAPI_OK) {
       EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
            ci->eventSet, ret, PAPI_strerror(ret));
     }
-  }
+
       }
     }
   }
@@ -579,7 +563,7 @@ METHOD_FN(shutdown)
   }while(0);
   // FIXME: add component shutdown code here
 
-//  PAPI_shutdown();
+  PAPI_shutdown();
 
   self->state = UNINIT;
 finish:
@@ -964,12 +948,12 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   sample_source_t *self = &obj_name();
   long long values[MAX_EVENTS];
   int my_events[MAX_EVENTS];
-  int my_event_count = MAX_EVENTS;
+  int my_events_number = MAX_EVENTS;
   int nevents  = self->evl.nevents;
   int i, ret;
 
-  int my_event_codes[MAX_EVENTS];
-  int my_event_codes_count = MAX_EVENTS;
+  int my_events_code[MAX_EVENTS];
+  int my_events_code_count = MAX_EVENTS;
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
@@ -1000,41 +984,41 @@ papi_event_handler(int event_set, void *pc, long long ovec,
   }
 
   ret = PAPI_get_overflow_event_index(event_set, ovec, my_events,
-              &my_event_count);
+              &my_events_number);
   if (ret != PAPI_OK) {
     TMSG(PAPI_SAMPLE, "papi_event_handler: event set %d ovec %ld "
    "get_overflow_event_index return code = %d ==> %s",
    event_set, ovec, ret, PAPI_strerror(ret));
 #ifdef DEBUG_PAPI_OVERFLOW
-    ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
+    ret = PAPI_list_events(event_set, my_events_code, &my_events_code_count);
     if (ret != PAPI_OK) {
       TMSG(PAPI_SAMPLE, "PAPI_list_events failed inside papi_event_handler."
      "Return code = %d ==> %s", ret, PAPI_strerror(ret));
     } else {
-      for (i = 0; i < my_event_codes_count; i++) {
+      for (i = 0; i < my_events_code_count; i++) {
         TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n",
-       event_set, i, my_event_codes[i]);
+       event_set, i, my_events_code[i]);
       }
     }
     TMSG(PAPI_SAMPLE, "get_overflow_event_index failure in papi_event_handler");
 #endif
   }
 
-  ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
+  ret = PAPI_list_events(event_set, my_events_code, &my_events_code_count);
   if (ret != PAPI_OK) {
     hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
      "Return code = %d ==> %s", ret, PAPI_strerror(ret));
   }
 
-  for (i = 0; i < my_event_count; i++) {
+  for (i = 0; i < my_events_number; i++) {
     // FIXME: SUBTLE ERROR: metric_id may not be same from hpcrun_new_metric()!
     // This means lush's 'time' metric should be *last*
 
     TMSG(PAPI_SAMPLE,"handling papi overflow event: "
   "event set %d event index = %d event code = 0x%x",
-  event_set, my_events[i], my_event_codes[my_events[i]]);
+  event_set, my_events[i], my_events_code[my_events[i]]);
 
-    int event_index = get_event_index(self, my_event_codes[my_events[i]]);
+    int event_index = get_event_index(self, my_events_code[my_events[i]]);
 
     int metric_id = hpcrun_event2metric(self, event_index);
 
@@ -1082,8 +1066,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
 }
 
 
-static __thread cct_node_t *cct_node;
-static __thread long long prev_values[MAX_EVENTS];
+static __thread cct_node_t *cct_node_loc; // TODO Dejan: Should I use this get cct_node from exit?
 
 static void
 papi_insert_cct(cct_node_t *api_node){
@@ -1101,11 +1084,11 @@ papi_insert_cct(cct_node_t *api_node){
 }
 
 static void
-gpu_metrics_attribute_papi
+attribute_metric_to_cct
 (
-int metric_id,
-cct_node_t *cct_node,
-long long value
+ int metric_id,
+ cct_node_t *cct_node,
+ long long value
 )
 {
   metric_data_list_t* metrics = hpcrun_reify_metric_set(cct_node, metric_id);
@@ -1117,30 +1100,55 @@ long long value
 
 
 static void
-papi_monitor_enter(const void *component, gpu_monitor_apply_t *args_in)
+attribute_counters(papi_component_info_t *ci, long long *collected_values, cct_node_t *cct_node)
 {
-  tool_enter();
-  papi_component_info_t *ci = (papi_component_info_t *) component;
-  gpu_monitor_apply_t *args = (gpu_monitor_apply_t *) args_in;
+  sample_source_t *self = &obj_name();
+  int events_codes[MAX_EVENTS];
+  int my_events_number = MAX_EVENTS;
+  int ret;
+
+  // Attribute collected metric to cct nodes
+  ret = PAPI_list_events(ci->eventSet, events_codes, &my_events_number);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
+                 "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+  }
+
+  for (int eid = 0; eid < my_events_number; ++eid) {
+    int event_index = get_event_index(self, events_codes[eid]);
+    int metric_id = hpcrun_event2metric(self, event_index);
+    long long int final_counts = collected_values[eid] - ci->prev_values[eid];
 
-  sample_source_t *self = &obj_name(); /// just for debug
 
-//  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", args->cct_node);
+    blame_shift_apply(metric_id, cct_node, final_counts/*metricIncr*/);
+    attribute_metric_to_cct(metric_id, cct_node, final_counts);
+
+    PRINT("PAPI_EXIT:: %d Event = %x, event_index = %d, metric_id = %d || value = %lld - %lld == %lld\n",
+          eid, events_codes[eid], event_index, metric_id,
+          collected_values[eid], ci->prev_values[eid],
+          final_counts);
+  }
+}
+
+
+static void
+papi_monitor_enter(papi_component_info_t *ci, const cct_node_t *cct_node)
+{
+  tool_enter();
+//  sample_source_t *self = &obj_name();
+
+//  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", cct_node);
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-  cct_node = args->cct_node;
+  cct_node_loc = cct_node;
 
   // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
 
   if (ci->inUse) {
-//    if (args->gpu_sync_ptr)  // for amd it seems that there is no default sync like in nvidia case
-//      args->gpu_sync_ptr();
-
-    PRINT("Self = %p | Component %s \t | cct = %p \n\n", self, ci->name, args->cct_node );
-    ci->read(prev_values);
-    PRINT("ENTER_read Event = %d, value = %lld \n", ci->eventSet, prev_values[0]);
+    ci->read(ci->prev_values);
+    PRINT("PAPI_ENTER:: Component %s Event = %d, value = %lld   |  %p\n", ci->name, ci->eventSet, ci->prev_values[0], cct_node);
   }
 
 finish:
@@ -1149,54 +1157,20 @@ papi_monitor_enter(const void *component, gpu_monitor_apply_t *args_in)
 
 
 static void
-papi_monitor_exit(const void *component, gpu_monitor_apply_t *args_in)
+papi_monitor_exit(papi_component_info_t *ci,  const cct_node_t *cct_node)
 {
   tool_enter();
-  papi_component_info_t *ci = (papi_component_info_t *) component;
-  gpu_monitor_apply_t *args = (gpu_monitor_apply_t *) args_in;
-
-  sample_source_t *self = &obj_name(); /// just for debug
-  int my_event_codes[MAX_EVENTS];
-  long long my_event_values[MAX_EVENTS];
-  int my_event_count = MAX_EVENTS;
-  int ret;
-
-//  PRINT("|------->PAPI_MONITOR_EXIT| running? %d\n", METHOD_CALL(self, started));
+  long long collected_values[MAX_EVENTS];
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
   if (ci->inUse){
-
-//    if (args->gpu_sync_ptr)
-//      args->gpu_sync_ptr();
-
-    ci->read(my_event_values);
-
-
-    // Attribute collected metric to cct nodes
-    ret = PAPI_list_events(ci->eventSet, my_event_codes, &my_event_count);
-    if (ret != PAPI_OK) {
-      hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
-                   "Return code = %d ==> %s", ret, PAPI_strerror(ret));
-    }
-
-    for (int eid = 0; eid < my_event_count; ++eid) {
-      int event_index = get_event_index(self, my_event_codes[eid]);
-      int metric_id = hpcrun_event2metric(self, event_index);
-
-      PRINT("%d Event = %x, event_index = %d, metric_id = %d || value = %lld ---> %lld\n",
-             eid, my_event_codes[eid], event_index, metric_id, prev_values[eid], my_event_values[eid]);
-
-      blame_shift_apply(metric_id, cct_node, my_event_values[eid] /*metricIncr*/);
-
-
-      gpu_metrics_attribute_papi(metric_id, cct_node, my_event_values[eid]);
-    }
-
+    ci->read(collected_values);
+    attribute_counters(ci, collected_values, cct_node_loc);
   }
 
-  cct_node = NULL;
+  cct_node_loc = NULL;
 
 finish:
   tool_exit();
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index e740e18b5a..13521a2d01 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -53,7 +53,7 @@
 
 #include "papi-c-extended-info.h"
 
-
+#include "sample_source_obj.h"
 
 /******************************************************************************
  * type declarations 

From 338717c8579c0e691fa2070c75e243357525b29a Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 24 Nov 2020 20:49:44 -0600
Subject: [PATCH 122/177] papi-c-cupti works

---
 src/tool/hpcrun/gpu-monitors.c                |  18 +-
 src/tool/hpcrun/gpu-monitors.h                |   8 +-
 src/tool/hpcrun/gpu/gpu-activity-process.c    |   7 +-
 src/tool/hpcrun/gpu/gpu-correlation-id-map.c  |   4 +-
 src/tool/hpcrun/gpu/gpu-correlation-id.h      |   3 +-
 src/tool/hpcrun/gpu/gpu-correlation.h         |   1 -
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        |  36 +-
 src/tool/hpcrun/main.c                        |   3 +-
 src/tool/hpcrun/sample-sources/papi-c-cupti.c | 428 ++++--------------
 .../sample-sources/papi-c-extended-info.c     |   2 +-
 .../sample-sources/papi-c-extended-info.h     |   2 +-
 src/tool/hpcrun/sample-sources/papi-c.c       |  37 +-
 src/tool/hpcrun/sample-sources/papi-c.h       |   5 +-
 src/tool/hpcrun/tool_state.c                  |   5 +
 14 files changed, 163 insertions(+), 396 deletions(-)

diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
index b340bcd970..9d05ce6443 100644
--- a/src/tool/hpcrun/gpu-monitors.c
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -25,12 +25,16 @@ gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type)
 {
   gpu_monitor_node_t *node = gpu_monitor_list;
 
-	while (node != NULL) {
-    if (type == gpu_monitor_type_enter)
+  if (type == gpu_monitor_type_enter){
+    while (node != NULL) {
       node->enter_fn(node->ci, cct_node);
-    else if (type == gpu_monitor_type_exit)
-      node->exit_fn(node->ci, cct_node);
-
-		node = node->next;
-	}
+      node = node->next;
+    }
+  }
+  else if (type == gpu_monitor_type_exit){
+    while (node != NULL) {
+      node->exit_fn(node->ci);
+      node = node->next;
+    }
+  }
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
index 6753a29c0f..7fd2c0d334 100644
--- a/src/tool/hpcrun/gpu-monitors.h
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -16,13 +16,15 @@ typedef enum {
 } gpu_monitor_type_t;
 
 
-typedef void (*gpu_monitor_fn_t)(papi_component_info_t *ci, const cct_node_t *cct_node);
+typedef void (*gpu_monitor_enter_fn_t)(papi_component_info_t *ci, cct_node_t *cct_node);
+typedef void (*gpu_monitor_exit_fn_t)(papi_component_info_t *ci);
+
 
 typedef struct gpu_monitor_node_t {
 	struct gpu_monitor_node_t * next;
   papi_component_info_t *ci;
-	gpu_monitor_fn_t enter_fn;
-  gpu_monitor_fn_t exit_fn;
+  gpu_monitor_enter_fn_t enter_fn;
+  gpu_monitor_exit_fn_t exit_fn;
 } gpu_monitor_node_t;
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 6b4b97d355..f0692d109c 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -57,6 +57,7 @@
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-trace-item.h>
+#include <hpcrun/gpu/gpu-correlation-id.h>
 #include <hpcrun/gpu/gpu-correlation-id-map.h>
 #include <hpcrun/gpu/gpu-context-id-map.h>
 #include <hpcrun/gpu/gpu-event-id-map.h>
@@ -463,9 +464,11 @@ gpu_synchronization_process
   if (cid_map_entry != NULL) {
     uint64_t external_id =
       gpu_correlation_id_map_entry_external_id_get(cid_map_entry);
+
     gpu_host_correlation_map_entry_t *host_op_entry =
       gpu_host_correlation_map_lookup(external_id);
-    if (host_op_entry != NULL) {
+    if (host_op_entry != NULL && external_id != IGNORE_CORR_ID) {
+
       cct_node_t *host_op_node =
         gpu_host_correlation_map_entry_op_cct_get(host_op_entry,
           gpu_placeholder_type_sync);
@@ -678,7 +681,7 @@ gpu_activity_process
     break;
 
   case GPU_ACTIVITY_SYNCHRONIZATION:
-//    gpu_synchronization_process(ga);
+    gpu_synchronization_process(ga);
     break;
 
   case GPU_ACTIVITY_MEMORY:
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
index 1fe45d67b1..cb434d3750 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
@@ -119,9 +119,9 @@ typedef struct typed_splay_node(correlation_id) {
 // local data
 //******************************************************************************
 
-static gpu_correlation_id_map_entry_t *map_root = NULL;
+static __thread gpu_correlation_id_map_entry_t *map_root = NULL;
 
-static gpu_correlation_id_map_entry_t *free_list = NULL;
+static __thread gpu_correlation_id_map_entry_t *free_list = NULL;
 
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id.h b/src/tool/hpcrun/gpu/gpu-correlation-id.h
index e6c325adcf..3bf8cbb899 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id.h
@@ -52,7 +52,8 @@
 
 #include <stdint.h>
 
-
+//we use this for our activity that should be ignored
+#define IGNORE_CORR_ID (~0ULL)
 
 //******************************************************************************
 // interface operations
diff --git a/src/tool/hpcrun/gpu/gpu-correlation.h b/src/tool/hpcrun/gpu/gpu-correlation.h
index e39ab4fe9e..f862314bdc 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation.h
@@ -60,7 +60,6 @@
 
 #define UNIT_TEST_CORRELATION_HEADER 0
 
-#define PAPI_CORR_ID -1
 
 //******************************************************************************
 // forward type declarations
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 294a683ee0..fe1c62b2d7 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -778,6 +778,23 @@ ensure_kernel_ip_present
 }
 
 
+static void
+cupti_gpu_monitors_apply_enter(cct_node_t *cct_node)
+{
+  cupti_correlation_id_push(IGNORE_CORR_ID);
+  gpu_monitors_apply( cct_node, gpu_monitor_type_enter);
+  cupti_correlation_id_pop();
+}
+
+
+static void
+cupti_gpu_monitors_apply_exit()
+{
+  cupti_correlation_id_push(IGNORE_CORR_ID);
+  gpu_monitors_apply( NULL, gpu_monitor_type_exit);
+  cupti_correlation_id_pop();
+}
+
 static void
 cupti_subscriber_callback
 (
@@ -789,11 +806,6 @@ cupti_subscriber_callback
 {
 
 	if (is_tool_active()) {
-//		const CUpti_CallbackData *cd = (const CUpti_CallbackData *) cb_info;
-//		PRINT("\nTOOL callback: -----------------%s\n", cd->functionName );
-
-//		TMSG(CUPTI, "PAPI correlation callback");
-//		gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
 		return;
 	}
 
@@ -1019,21 +1031,19 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( api_node, gpu_monitor_type_enter);
 
+        cupti_gpu_monitors_apply_enter(api_node);
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
 
         TMSG(CUPTI_TRACE, "Driver push externalId %lu (cb_id = %u)", correlation_id, cb_id);
       } else if (cd->callbackSite == CUPTI_API_EXIT) {
+        cupti_gpu_monitors_apply_exit();
+
         uint64_t correlation_id __attribute__((unused)); // not used if PRINT omitted
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
-
-				gpu_monitors_apply( NULL, gpu_monitor_type_exit);
-
-
 			}
     } else if (is_kernel_op && cupti_runtime_api_flag && cd->callbackSite ==
       CUPTI_API_ENTER) {
@@ -1182,13 +1192,15 @@ cupti_subscriber_callback
         // Generate notification entry
         uint64_t cpu_submit_time = hpcrun_nanotime();
 
-				gpu_monitors_apply( cupti_kernel_ph, gpu_monitor_type_enter);
+        cupti_gpu_monitors_apply_enter(cupti_kernel_ph);
 
 				gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
           cpu_submit_time);
 
         TMSG(CUPTI_TRACE, "Runtime push externalId %lu (cb_id = %u)", correlation_id, cb_id);
       } else if (cd->callbackSite == CUPTI_API_EXIT) {
+
+        cupti_gpu_monitors_apply_exit();
         // Exit an CUDA runtime api
         cupti_runtime_api_flag_unset();
 
@@ -1196,8 +1208,6 @@ cupti_subscriber_callback
         correlation_id = cupti_correlation_id_pop();
         TMSG(CUPTI_TRACE, "Runtime pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
 
-				gpu_monitors_apply( cupti_kernel_ph, gpu_monitor_type_exit);
-
         cupti_kernel_ph = NULL;
         cupti_trace_ph = NULL;
       }
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index c835eb539d..e3d984855d 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -756,8 +756,7 @@ hpcrun_thread_init(int id, local_thread_data_t* local_thread_data, bool has_trac
 
   epoch_t* epoch = TD_GET(core_profile_trace_data.epoch);
 
-  if (! hpcrun_thread_suppress_sample ) { //TODO Dejan: has_trace instead?
-//TODO Dejan: thread_init_action must go before gen-event-set
+  if (! hpcrun_thread_suppress_sample ) {
     // sample sources take thread specific action prior to start (often is a 'registration' action);
     SAMPLE_SOURCES(thread_init_action);
 
diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
index 6a24049d67..9c9050e619 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
@@ -1,428 +1,190 @@
-// ******************* System Includes ********************
-#include <ucontext.h> 
-#include <dlfcn.h>
+// -*-Mode: C++;-*- // technically C99
 
-#include <stdbool.h>
-#include <string.h>
-#include <stdint.h>
-// *********************************************************
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//***************************************************************************
+//
+// File:
+//   cupti-api.c
+//
+// Purpose:
+//   implementation of wrapper around NVIDIA's CUPTI performance tools API
+//
+//***************************************************************************
 
+//***************************************************************************
+// system includes
+//***************************************************************************
 
-// ******************** PAPI *******************************
 #include <papi.h>
-// *********************************************************
-
-// ******************** MONITOR *******************************
 #include <monitor.h>
-// *********************************************************
 
-// ******************** GPU includes ***********************
-#include <cuda_runtime_api.h>
-#include <cupti.h>
-// *********************************************************
 
-// ******* HPCToolkit Includes *********************************
-#include <lib/prof-lean/spinlock.h>
 
-#include <hpcrun/thread_data.h>
+//***************************************************************************
+// local includes
+//***************************************************************************
+
 #include <messages/messages.h>
-#include <hpcrun/sample_event.h>
-#include <hpcrun/safe-sampling.h>
-#include <hpcrun/sample_sources_all.h>
-#include <sample-sources/common.h>
 #include <sample-sources/ss-obj-name.h>
-// *********************************************************
-
-// ******** local includes ***********
 #include "papi-c.h"
 #include "papi-c-extended-info.h"
-// ***********************************
-
-// ****************** Convenience macros *******************
-
-#define CUPTI_LAUNCH_CALLBACK_DEPTH 7
-
-#define Cupti_call(fn, ...)                                    \
-{                                                              \
-  int ret = fn(__VA_ARGS__);                                   \
-  if (ret != CUPTI_SUCCESS) {                                  \
-    const char* errstr;                                        \
-    dcuptiGetResultString(ret, &errstr);                        \
-    hpcrun_abort("error: CUDA/CUPTI API "                      \
-                 #fn " failed w error code %d ==> '%s'\n",     \
-                 ret, errstr);                                 \
-  }                                                            \
-}
 
-#define Cupti_call_silent(fn, ...)                             \
-{                                                              \
-  (void) fn(__VA_ARGS__);                                      \
-}
 
-#define Chk_dlopen(v, lib, flags)                     \
-  void* v = monitor_real_dlopen(lib, flags);          \
-  if (! v) {                                          \
-    fprintf(stderr, "gpu dlopen %s failed\n", lib);   \
-    return;                                           \
-  }                                                   \
-
-#define Chk_dlsym(h, fn) {                                \
-  dlerror();                                              \
-  d ## fn = dlsym(h, #fn);                                \
-  char* e = dlerror();                                    \
-  if (e) {                                                \
-    fprintf(stderr, "dlsym(%s) fails w '%s'\n", #fn, e);  \
-    return;                                               \
-  }                                                       \
-}
-// ***********************************************************
 
-typedef struct {
-  int nevents;
-  int event_set;
-  sample_source_t* self;
-} papi_cuda_data_t;
+//******************************************************************************
+// static data
+//******************************************************************************
 
 static __thread bool event_set_created = false;
 static __thread bool event_set_finalized = false;
+static __thread int my_event_set = PAPI_NULL;
 
-static __thread papi_cuda_data_t local = {};
-
-static spinlock_t cupti_lock = SPINLOCK_UNLOCKED;
-static spinlock_t setup_lock = SPINLOCK_UNLOCKED;
-
-// ******************** cuda/cupti functions ***********************
-// Some cuda/cupti functions must not be wrapped! So, we fetch them via dlopen.
-// NOTE: naming convention is to prepend the letter "d" to the actual function
-// The indirect functions are below.
-//
-cudaError_t (*dcudaThreadSynchronize)(void);
-
-CUptiResult (*dcuptiGetResultString)(CUptiResult result, const char** str); 
-
-CUptiResult (*dcuptiSubscribe)(CUpti_SubscriberHandle* subscriber,
-                               CUpti_CallbackFunc callback, 
-                               void* userdata);
-
-CUptiResult (*dcuptiEnableCallback)(uint32_t enable,
-                                    CUpti_SubscriberHandle subscriber, 
-                                    CUpti_CallbackDomain domain,
-                                    CUpti_CallbackId cbid);
 
-CUptiResult (*dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber); 
 
+//******************************************************************************
+// private operations
+//******************************************************************************
 
-// *****************************************************************
-typedef struct cuda_callback_t {
-  sample_source_t* ss;
-  int event_set;
-} cuda_callback_t;
-
-//
-// populate the cuda/cupti functions via dlopen
-//
-
-static void
-dlgpu(void)
-{
-  // only use dlfunctions in NON static case
-#ifndef HPCRUN_STATIC_LINK
-  Chk_dlopen(cudart, "libcudart.so", RTLD_NOW | RTLD_GLOBAL);
-  Chk_dlsym(cudart, cudaThreadSynchronize);
-
-  Chk_dlopen(cupti, "libcupti.so", RTLD_NOW | RTLD_GLOBAL);
-  Chk_dlsym(cupti, cuptiGetResultString);
-  Chk_dlsym(cupti, cuptiSubscribe);
-  Chk_dlsym(cupti, cuptiEnableCallback);
-  Chk_dlsym(cupti, cuptiUnsubscribe);
-#endif // ! HPCRUN_STATIC_LINK
-}
-
-//
-// noop routine
-//
 static void
 papi_c_no_action(void)
 {
   ;
 }
 
-//
-// Predicate to determine if this component is being referenced
-//
+
 static bool
 is_papi_c_cuda(const char* name)
 {
   return strstr(name, "cuda") == name;
 }
 
-//static void CUPTIAPI
-//hpcrun_cuda_kernel_callback(void* userdata,
-//			    CUpti_CallbackDomain domain,
-//			    CUpti_CallbackId cbid,
-//			    const CUpti_CallbackData* cbInfo)
-//{
-//  TMSG(CUDA, "Got Kernel Callback");
-//
-//  papi_cuda_data_t* cuda_data = userdata;
-//  int nevents = cuda_data->nevents;
-//  int cudaEventSet = cuda_data->event_set;
-//  sample_source_t* self = cuda_data->self;
-//
-//
-//  TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet);
-//
-//  // This callback is enabled only for kernel launch; anything else is an error.
-//  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
-//    hpcrun_abort("CUDA CUPTI callback seen for unexpected "
-//		 "interface operation: callback id  %d\n", cbid);
-//  }
-//
-//  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-//    TMSG(CUDA, "Cupti API -ENTER- portion");
-//    // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize
-//      // exclusive access to launcher
-//    spinlock_lock(&cupti_lock);
-//    TMSG(CUPTI, "-ACQ-lock");
-//    dcudaThreadSynchronize();
-//
-//    TMSG(CUPTI,"-- PRE launch callback");
-//    TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet);
-//    int ret = PAPI_start(cudaEventSet);
-//    if (ret != PAPI_OK){
-//      EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)",
-//	   PAPI_strerror(ret), ret);
-//    }
-//  }
-//  TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion");
-//
-//
-//  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
-//    TMSG(CUDA, "Cupti API -EXIT- portion");
-//    // MC recommends Use cudaDeviceSynchronize
-//    dcudaThreadSynchronize();
-//    TMSG(CUPTI, "-- POST launch callback");
-//    long_long eventValues[nevents+2];
-//
-//    TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
-//    int ret = PAPI_stop(cudaEventSet, eventValues);
-//    if (ret != PAPI_OK){
-//      EMSG("CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)",
-//	   PAPI_strerror(ret), ret);
-//    }
-//    TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
-//
-//    ucontext_t uc;
-//    TMSG(CUDA,"getting context in CUDA event handler");
-//    getcontext(&uc);
-//    TMSG(CUDA,"got context in CUDA event handler");
-//    bool safe = hpcrun_safe_enter();
-//    TMSG(CUDA,"blocked async event in CUDA event handler");
-//    {
-//      int i;
-//      for (i = 0; i < nevents; i++)
-//	{
-//	  int metric_id = hpcrun_event2metric(self, i);
-//
-//	  TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
-//	  hpcrun_sample_callpath(&uc, metric_id, (hpcrun_metricVal_t){.i=eventValues[i]}/*metricIncr*/,
-//				 CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/,
-//				 0/*isSync*/, NULL);
-//
-//
-//	  TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
-//	}
-//    }
-//    TMSG(CUDA,"unblocking async event in CUDA event handler");
-//    if (safe) hpcrun_safe_exit();
-//    TMSG(CUDA,"unblocked async event in CUDA event handler");
-//
-//    spinlock_unlock(&cupti_lock);
-//    TMSG(CUPTI,"-REL-lock\n");
-//  }
-//  TMSG(CUDA, "At end (past -EXIT-)");
-//}
-
-//static CUpti_SubscriberHandle subscriber;
 
-//
-// sync setup for cuda/cupti
-//
+// Get or create a cupti event set
 static void
-papi_c_cupti_setup(void)
-{
-  // FIXME: Remove local definition
-  // CUpti_SubscriberHandle subscriber;
-
-  static bool one_time = false;
-
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "CUPTI setup acquire lock");
-  if (one_time) {
-    spinlock_unlock(&setup_lock);
-    TMSG(CUDA, "CUPTI setup release lock (setup already called)");
-    return;
-  }
-
-  TMSG(CUDA,"sync setup called");
-
-  thread_data_t* td = hpcrun_get_thread_data();
-  local.self = hpcrun_fetch_source_by_name("papi");
-
-  local.nevents  = local.self->evl.nevents;
-
-  // get cuda event set
-
-  int cuda_component_idx;
-  int n_components = PAPI_num_components();
-
-  for (int i = 0; i < n_components; i++) {
-    if (is_papi_c_cuda(PAPI_get_component_info(i)->name)) {
-      cuda_component_idx = i;
-      break;
-    }
-  }
-
-  papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
-  local.event_set = get_component_event_set( &(psi->component_info[cuda_component_idx]) );
-//TODO Dejan: Can I delete these hpcrun_cuda_kernel_callback callback?
-//  Cupti_call(dcuptiSubscribe, &subscriber,
-//             (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
-//             &local);
-//
-//  Cupti_call(dcuptiEnableCallback, 1, subscriber,
-//             CUPTI_CB_DOMAIN_RUNTIME_API,
-//             CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
-
-  one_time = true;
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "CUPTI setup release lock");
-
-}
-
-//
-// Get or create a cupti event set --- but only ONCE per process
-//
- void
 papi_c_cupti_get_event_set(int* event_set)
 {
   TMSG(CUDA, "Get event set");
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
   if (! event_set_created) {
     TMSG(CUDA, "No event set created, so create one");
-    int ret = PAPI_create_eventset(event_set);
+    int ret = PAPI_create_eventset(&my_event_set);
     if (ret != PAPI_OK) {
       hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
                    ret, PAPI_strerror(ret));
     }
-    local.event_set = *event_set;
+    *event_set = my_event_set;
     event_set_created = true;
-    TMSG(CUDA, "Event set %d created", local.event_set);
+    TMSG(CUDA, "Event set %d created", my_event_set);
   }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
 }
 
+
+// Add event to my_event_set
 void
-papi_c_cupti_add_event(int event_set, int event)
+papi_c_cupti_add_event(int event_set, int evcode)
 {
+  assert(event_set == my_event_set);
+
   int rv = PAPI_OK;
-  TMSG(CUDA, "Adding event to cupti event set");
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
   if (! event_set_finalized) {
-    TMSG(CUDA, "Really add event %x to cupti event set", event);
-    rv = PAPI_add_event(local.event_set, event);
+    TMSG(CUDA, "Adding event %x to cupti event set", evcode);
+    rv = PAPI_add_event(my_event_set, evcode);
     if (rv != PAPI_OK) {
-      EMSG("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
-           PAPI_strerror(rv), rv);
+      hpcrun_abort("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
+                   PAPI_strerror(rv), rv);
     }
-
-    TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", event_set, local.event_set);
+    TMSG(CUDA, "Added event %d, to cuda event set %d", evcode, my_event_set);
   }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
 }
 
+// No adding new events after this point
 void
 papi_c_cupti_finalize_event_set(void)
 {
-  spinlock_lock(&setup_lock);
   event_set_finalized = true;
-  spinlock_unlock(&setup_lock);
 }
 
+
 void
 papi_c_cupti_start()
 {
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
-  int ret = PAPI_start(local.event_set);
+  int ret = PAPI_start(my_event_set);
   if (ret != PAPI_OK) {
-    EMSG("PAPI_start of event set %d failed with %s (%d)",
-         local.event_set, PAPI_strerror(ret), ret);
+    hpcrun_abort("PAPI_start of event set %d failed with %s (%d)",
+         my_event_set, PAPI_strerror(ret), ret);
   }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
 }
 
 
 void
 papi_c_cupti_read(long long *values)
 {
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
-  int ret = PAPI_read(local.event_set, values);
+  int ret = PAPI_read(my_event_set, values);
   if (ret != PAPI_OK) {
-    EMSG("PAPI_read of event set %d failed with %s (%d)",
-         local.event_set, PAPI_strerror(ret), ret);
+    hpcrun_abort("PAPI_read of event set %d failed with %s (%d)",
+         my_event_set, PAPI_strerror(ret), ret);
   }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
 }
 
 
 void
 papi_c_cupti_stop(long long *values)
 {
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
-  int ret = PAPI_stop(local.event_set, values);
+  int ret = PAPI_stop(my_event_set, values);
   if (ret != PAPI_OK) {
-    EMSG("PAPI_stop of event set %d failed with %s (%d)",
-         local.event_set, PAPI_strerror(ret), ret);
+    hpcrun_abort("PAPI_stop of event set %d failed with %s (%d)",
+         my_event_set, PAPI_strerror(ret), ret);
   }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
 }
 
 
-//
-// sync teardown for cuda/cupti
-//
-static void
-papi_c_cupti_teardown(void)
-{
-//  static bool one_time = false;
-//  spinlock_lock(&setup_lock);
-//  if (one_time) return;
-//
-//  TMSG(CUDA,"sync teardown called (=unsubscribe)");
-//
-//  Cupti_call(cuptiUnsubscribe, subscriber);
-//  one_time = true;
-//  spinlock_unlock(&setup_lock);
-}
-
 static sync_info_list_t cuda_component = {
   .pred = is_papi_c_cuda,
   .get_event_set = papi_c_cupti_get_event_set,
   .add_event = papi_c_cupti_add_event,
   .finalize_event_set = papi_c_cupti_finalize_event_set,
-  .is_sync = true,
-  .setup = papi_c_cupti_setup,
-  .teardown = papi_c_cupti_teardown,
+  .is_gpu_sync = true,
+  .setup = papi_c_no_action,
+  .teardown = papi_c_no_action,
   .start = papi_c_cupti_start,
   .read = papi_c_cupti_read,
   .stop = papi_c_cupti_stop,
@@ -434,7 +196,5 @@ static sync_info_list_t cuda_component = {
 void
 SS_OBJ_CONSTRUCTOR(papi_c_cupti)(void)
 {
-  // fetch actual cuda/cupti functions
-//  dlgpu();
   papi_c_sync_register(&cuda_component);
 }
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
index 82a5c756d4..f113705199 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
@@ -72,7 +72,7 @@ component_uses_sync_samples(int cidx)
 
   TMSG(PAPI, "checking component idx %d (name %s) to see if it is synchronous", cidx, name);
   for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
-    if (item->pred(name)) return item->is_sync;
+    if (item->pred(name)) return item->is_gpu_sync;
   }
   return false;
 }
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
index 3786ff830d..eb83b101dc 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
@@ -16,7 +16,7 @@ typedef struct sync_info_list_t {
   const get_event_set_proc_t get_event_set;
   const add_event_proc_t add_event;
   const finalize_event_set_proc_t finalize_event_set;
-  const bool is_sync;
+  const bool is_gpu_sync;
   const setup_proc_t setup;
   const teardown_proc_t teardown;
   const start_proc_t start;
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index b338078747..95a3566c4c 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -121,8 +121,8 @@
  * forward declarations 
  *****************************************************************************/
 static void papi_event_handler(int event_set, void *pc, long long ovec, void *context);
-static void papi_monitor_enter(papi_component_info_t *ci,  const cct_node_t *cct_node);
-static void papi_monitor_exit(papi_component_info_t *ci,  const cct_node_t *cct_node);
+static void papi_monitor_enter(papi_component_info_t *ci, cct_node_t *cct_node);
+static void papi_monitor_exit(papi_component_info_t *ci);
 
 static int  event_is_derived(int ev_code);
 static void event_fatal_error(int ev_code, int papi_ret);
@@ -279,7 +279,7 @@ papi_register_callbacks(papi_source_info_t *psi, evlist_t evl)
     int eventSet = get_component_event_set(ci);
 
     // **** No overflow for synchronous events ****
-    if (ci->is_sync) {
+    if (ci->is_gpu_sync) {
       TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx);
       TMSG(PAPI, "Set up papi_monitor_apply instead");
       TMSG(PAPI, "synchronous sample component index = %d", cidx);
@@ -761,7 +761,7 @@ METHOD_FN(gen_event_set, int lush_metrics)
     ci->add_event = component_add_event_proc(i);
     ci->finalize_event_set = component_finalize_event_set(i);
     ci->scale_by_thread_count = thread_count_scaling_for_component(i);
-    ci->is_sync = component_uses_sync_samples(i);
+    ci->is_gpu_sync = component_uses_sync_samples(i);
     ci->setup = sync_setup_for_component(i);
     ci->teardown = sync_teardown_for_component(i);
     ci->start = sync_start_for_component(i);
@@ -1066,23 +1066,6 @@ papi_event_handler(int event_set, void *pc, long long ovec,
 }
 
 
-static __thread cct_node_t *cct_node_loc; // TODO Dejan: Should I use this get cct_node from exit?
-
-static void
-papi_insert_cct(cct_node_t *api_node){
-
-//  gpu_op_ccts_t gpu_op_ccts;
-//
-//  hpcrun_safe_enter();
-//  gpu_op_ccts_insert(api_node, &gpu_op_ccts, gpu_placeholder_type_sync);
-//  hpcrun_safe_exit();
-//
-//  cupti_papi_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_sync);
-//
-//  gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
-//                                  cpu_submit_time);
-}
-
 static void
 attribute_metric_to_cct
 (
@@ -1132,22 +1115,21 @@ attribute_counters(papi_component_info_t *ci, long long *collected_values, cct_n
 
 
 static void
-papi_monitor_enter(papi_component_info_t *ci, const cct_node_t *cct_node)
+papi_monitor_enter(papi_component_info_t *ci, cct_node_t *cct_node)
 {
   tool_enter();
-//  sample_source_t *self = &obj_name();
-
 //  PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", cct_node);
 
   // if sampling disabled explicitly for this thread, skip all processing
   if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
 
-  cct_node_loc = cct_node;
+  ci->cct_node = cct_node;
 
   // Save counts on the end so we could substract that from next call (we don't want to measure ourselves)
 
   if (ci->inUse) {
     ci->read(ci->prev_values);
+
     PRINT("PAPI_ENTER:: Component %s Event = %d, value = %lld   |  %p\n", ci->name, ci->eventSet, ci->prev_values[0], cct_node);
   }
 
@@ -1157,7 +1139,7 @@ papi_monitor_enter(papi_component_info_t *ci, const cct_node_t *cct_node)
 
 
 static void
-papi_monitor_exit(papi_component_info_t *ci,  const cct_node_t *cct_node)
+papi_monitor_exit(papi_component_info_t *ci)
 {
   tool_enter();
   long long collected_values[MAX_EVENTS];
@@ -1167,10 +1149,9 @@ papi_monitor_exit(papi_component_info_t *ci,  const cct_node_t *cct_node)
 
   if (ci->inUse){
     ci->read(collected_values);
-    attribute_counters(ci, collected_values, cct_node_loc);
+    attribute_counters(ci, collected_values, ci->cct_node);
   }
 
-  cct_node_loc = NULL;
 
 finish:
   tool_exit();
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index 13521a2d01..689fcb3f09 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -54,6 +54,8 @@
 #include "papi-c-extended-info.h"
 
 #include "sample_source_obj.h"
+#include "cct.h"
+
 
 /******************************************************************************
  * type declarations 
@@ -67,7 +69,8 @@ typedef struct {
   int some_derived;
   bool scale_by_thread_count;
   long long prev_values[MAX_EVENTS];
-  bool is_sync;
+  cct_node_t *cct_node;
+  bool is_gpu_sync;
   bool setup_process_only;
   get_event_set_proc_t get_event_set;
   add_event_proc_t add_event;
diff --git a/src/tool/hpcrun/tool_state.c b/src/tool/hpcrun/tool_state.c
index 03673c6a02..1a978c6e90 100644
--- a/src/tool/hpcrun/tool_state.c
+++ b/src/tool/hpcrun/tool_state.c
@@ -6,13 +6,18 @@
 
 static __thread int tool_active = false;
 
+
+
 void tool_enter(){
 	tool_active++;
 }
+
+
 void tool_exit(){
 	tool_active--;
 }
 
+
 bool is_tool_active(){
 	return tool_active;
 }
\ No newline at end of file

From e139a47bf5efc275e0df04d490aa412f3b15e0dc Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Sun, 13 Dec 2020 09:20:07 -0600
Subject: [PATCH 123/177] Delete redundant gpu_trace_time() wrapper in gpu-trace.c

---
 src/tool/hpcrun/gpu/gpu-trace.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index 2d1bea000f..575a58129a 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -196,19 +196,6 @@ gpu_trace_cct_insert_context
 }
 
 
-static uint64_t
-gpu_trace_time
-(
- uint64_t gpu_time
-)
-{
-  // return time in ns
-  uint64_t time = gpu_time;
-
-  return time;
-}
-
-
 static void
 gpu_trace_stream_append
 (
@@ -425,8 +412,8 @@ consume_one_trace_item
 
   cct_node_t *no_activity = gpu_trace_cct_no_activity(td);
 
-  uint64_t start = gpu_trace_time(start_time);
-  uint64_t end   = gpu_trace_time(end_time);
+  uint64_t start = start_time;
+  uint64_t end   = end_time;
 
   stream_start_set(start_time);
 

From 5b37714e4b8ff94f6c35f05866484da684d28a6b Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Sat, 19 Dec 2020 15:38:35 -0600
Subject: [PATCH 124/177] Set DEBUG macro to 0 in papi-c.c

---
 src/tool/hpcrun/sample-sources/papi-c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 95a3566c4c..20a77d7893 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -107,7 +107,7 @@
  * macros
  *****************************************************************************/
 
-#define DEBUG 1
+#define DEBUG 0
 
 #include <hpcrun/gpu/gpu-print.h>
 #include <gpu-monitors.h>

From a38e4f1a0c36abd0ea2e22bbcc803b1046c53a26 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Wed, 13 Jan 2021 16:28:54 -0600
Subject: [PATCH 125/177] Add support for papi-rocm; known problem: loading
 AMD shared libraries (.so)

---
 configure                                    |  80 ++++
 configure.ac                                 |  42 ++
 src/tool/hpcrun/Makefile.am                  |   9 +-
 src/tool/hpcrun/Makefile.in                  | 452 ++++++++++---------
 src/tool/hpcrun/gpu/amd/roctracer-api.c      |   4 +-
 src/tool/hpcrun/messages/debug-flag.c        |   2 +-
 src/tool/hpcrun/sample-sources/amd.c         |  14 +-
 src/tool/hpcrun/sample-sources/papi-c-rocm.c | 452 ++++++-------------
 src/tool/hpcrun/sample-sources/papi-c.c      |   2 +-
 9 files changed, 509 insertions(+), 548 deletions(-)

diff --git a/configure b/configure
index 2316b68409..fccfb6ac47 100755
--- a/configure
+++ b/configure
@@ -748,6 +748,8 @@ TBB_LIB_DIR
 TBB_PROXY_LIB
 TBB_LFLAGS
 TBB_IFLAGS
+OPT_PAPI_ROCM_FALSE
+OPT_PAPI_ROCM_TRUE
 OPT_PAPI_CUPTI_FALSE
 OPT_PAPI_CUPTI_TRUE
 OPT_PAPI_COMPONENT_FALSE
@@ -1051,6 +1053,7 @@ with_papi
 enable_force_papi
 enable_papi_c
 enable_papi_c_cupti
+enable_papi_c_rocm
 with_perfmon
 enable_perf_events
 enable_kernel_blocking
@@ -1758,6 +1761,8 @@ Optional Features:
   --enable-papi-c         use component papi, if available (default yes)
   --enable-papi-c-cupti   use papi CUPTI support, if available (default no),
                           requires papi cuda component
+  --enable-papi-c-rocm    use papi ROCM support, if available (default no),
+                          requires papi rocm component
   --enable-perf-events    force enable or disable perf events in hpcrun
                           (normally 2.6.32 or later), only needed if fails to
                           auto-detect correctly
@@ -21908,6 +21913,67 @@ $as_echo "$use_papi_c_cupti" >&6; }
 fi
 
 
+#-------------------------------------------------
+# Option: --enable-papi-c-rocm
+#-------------------------------------------------
+
+use_papi_c_rocm=no
+
+# Check whether --enable-papi-c-rocm was given.
+if test "${enable_papi_c_rocm+set}" = set; then :
+  enableval=$enable_papi_c_rocm; use_papi_c_rocm="$enableval"
+fi
+
+
+if test "$use_papi_c" = no || test "$use_papi_c_rocm" != yes ; then
+  use_papi_c_rocm=no
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for papi rocm component" >&5
+$as_echo_n "checking for papi rocm component... " >&6; }
+
+  ORIG_CFLAGS="$CFLAGS"
+  ORIG_LIBS="$LIBS"
+  CFLAGS="$CFLAGS $OPT_PAPI_IFLAGS"
+  LIBS="$OPT_PAPI_LDFLAGS $papi_extra_libs"
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+#include "papi.h"
+int main()
+{
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  use_papi_c_rocm=yes
+else
+  use_papi_c_rocm=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+  CFLAGS="$ORIG_CFLAGS"
+  LIBS="$ORIG_LIBS"
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $use_papi_c_rocm" >&5
+$as_echo "$use_papi_c_rocm" >&6; }
+fi
+
+
 #-------------------------------------------------
 # Option: --with-perfmon=PATH
 #-------------------------------------------------
@@ -22274,6 +22340,14 @@ else
   OPT_PAPI_CUPTI_FALSE=
 fi
 
+ if test "$use_papi_c_rocm" = yes; then
+  OPT_PAPI_ROCM_TRUE=
+  OPT_PAPI_ROCM_FALSE='#'
+else
+  OPT_PAPI_ROCM_TRUE='#'
+  OPT_PAPI_ROCM_FALSE=
+fi
+
 
 
 #-------------------------------------------------
@@ -25264,6 +25338,10 @@ if test -z "${OPT_PAPI_CUPTI_TRUE}" && test -z "${OPT_PAPI_CUPTI_FALSE}"; then
   as_fn_error $? "conditional \"OPT_PAPI_CUPTI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${OPT_PAPI_ROCM_TRUE}" && test -z "${OPT_PAPI_ROCM_FALSE}"; then
+  as_fn_error $? "conditional \"OPT_PAPI_ROCM\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${OPT_USE_ZLIB_TRUE}" && test -z "${OPT_USE_ZLIB_FALSE}"; then
   as_fn_error $? "conditional \"OPT_USE_ZLIB\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -28041,6 +28119,8 @@ $as_echo "$as_me:   gtpin:        ${GTPIN}" >&6;}
 $as_echo "$as_me:   metrics-discovery: ${METRICS_DISCOVERY}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   papi-c-cupti: ${use_papi_c_cupti}" >&5
 $as_echo "$as_me:   papi-c-cupti: ${use_papi_c_cupti}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}:   papi-c-rocm: ${use_papi_c_rocm}" >&5
+$as_echo "$as_me:   papi-c-rocm: ${use_papi_c_rocm}" >&6;}
 { $as_echo "$as_me:${as_lineno-$LINENO}:   rocm:         ${rocm_mesg}" >&5
 $as_echo "$as_me:   rocm:         ${rocm_mesg}" >&6;}
 if test "$OPT_HAVE_ROCM" = yes ; then
diff --git a/configure.ac b/configure.ac
index 37882505ec..867a9e40c3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3248,6 +3248,46 @@ int main()
 fi
 
 
+#-------------------------------------------------
+# Option: --enable-papi-c-rocm
+#-------------------------------------------------
+
+use_papi_c_rocm=no
+
+AC_ARG_ENABLE([papi-c-rocm],
+  [AS_HELP_STRING([--enable-papi-c-rocm],
+      [use papi ROCM support, if available (default no), requires
+       papi rocm component])],
+  [use_papi_c_rocm="$enableval"],
+  [])
+
+if test "$use_papi_c" = no || test "$use_papi_c_rocm" != yes ; then
+  use_papi_c_rocm=no
+else
+  AC_MSG_CHECKING([for papi rocm component])
+
+  ORIG_CFLAGS="$CFLAGS"
+  ORIG_LIBS="$LIBS"
+  CFLAGS="$CFLAGS $OPT_PAPI_IFLAGS"
+  LIBS="$OPT_PAPI_LDFLAGS $papi_extra_libs"
+  AC_LANG_PUSH([C])
+
+  AC_LINK_IFELSE([
+  AC_LANG_SOURCE([[
+#include "papi.h"
+int main()
+{
+}
+]])], [use_papi_c_rocm=yes], [use_papi_c_rocm=no])
+
+  AC_LANG_POP
+  CFLAGS="$ORIG_CFLAGS"
+  LIBS="$ORIG_LIBS"
+
+  AC_MSG_RESULT([$use_papi_c_rocm])
+fi
+
+
 #-------------------------------------------------
 # Option: --with-perfmon=PATH
 #-------------------------------------------------
@@ -3530,6 +3570,7 @@ AM_CONDITIONAL(OPT_PAPI_DYNAMIC, [test "$OPT_PAPI_DYNAMIC" = yes])
 AM_CONDITIONAL(OPT_PAPI_STATIC,  [test "$OPT_PAPI_STATIC" = yes])
 AM_CONDITIONAL(OPT_PAPI_COMPONENT, [test "$use_papi_c" = yes])
 AM_CONDITIONAL(OPT_PAPI_CUPTI, [test "$use_papi_c_cupti" = yes])
+AM_CONDITIONAL(OPT_PAPI_ROCM, [test "$use_papi_c_rocm" = yes])
 
 
 #-------------------------------------------------
@@ -5838,6 +5879,7 @@ AC_MSG_NOTICE([  igc:          ${IGC}])
 AC_MSG_NOTICE([  gtpin:        ${GTPIN}])
 AC_MSG_NOTICE([  metrics-discovery: ${METRICS_DISCOVERY}])
 AC_MSG_NOTICE([  papi-c-cupti: ${use_papi_c_cupti}])
+AC_MSG_NOTICE([  papi-c-rocm: ${use_papi_c_rocm}])
 AC_MSG_NOTICE([  rocm:         ${rocm_mesg}])
 if test "$OPT_HAVE_ROCM" = yes ; then
   AC_MSG_NOTICE([  rocm hip:     $ROCM_HIP_MESG])
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index bbf1ae39ad..6b7438d0ec 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -504,6 +504,11 @@ if OPT_PAPI_CUPTI
     MY_PAPI_FILES += sample-sources/papi-c-cupti.c
 endif
 
+if OPT_PAPI_ROCM
+    MY_PAPI_FILES += sample-sources/papi-c-rocm.c
+endif
+
+
 if OPT_ENABLE_CUPTI
 MY_CUPTI_FILES = sample-sources/nvidia.c	\
 	gpu/nvidia/cubin-hash-map.c		\
@@ -546,8 +551,8 @@ MY_ROCM_FILES =\
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c 	\
 	gpu/amd/rocm-debug-api.c \
-	gpu/amd/rocm-binary-processing.c \
-	gpu/amd/hip-api.c
+	gpu/amd/rocm-binary-processing.c 
+	
 endif
 
 if OPT_ENABLE_LEVEL0
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 53ffc0cefd..e9ff59de93 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -168,25 +168,25 @@ host_triplet = @host@
 @OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_14 = sample-sources/perf/kernel_blocking.c
 @OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_15 = sample-sources/perf/kernel_blocking_stub.c
 @OPT_PAPI_CUPTI_TRUE@am__append_16 = sample-sources/papi-c-cupti.c
-@OPT_ENABLE_OPENCL_TRUE@am__append_17 = libhpcrun_opencl.la
-@OPT_ENABLE_LEVEL0_TRUE@am__append_18 = libhpcrun_level0.la
+@OPT_PAPI_ROCM_TRUE@am__append_17 = sample-sources/papi-c-rocm.c
+@OPT_ENABLE_OPENCL_TRUE@am__append_18 = libhpcrun_opencl.la
+@OPT_ENABLE_LEVEL0_TRUE@am__append_19 = libhpcrun_level0.la
 
 #
 # BG/Q backend requires special treatment to avoid deadlocks
 #
-@OPT_BGQ_BACKEND_TRUE@am__append_19 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
-@OPT_BGQ_BACKEND_TRUE@am__append_20 = -I$(srcdir)/utilities/bgq-cnk
+@OPT_BGQ_BACKEND_TRUE@am__append_20 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
 @OPT_BGQ_BACKEND_TRUE@am__append_21 = -I$(srcdir)/utilities/bgq-cnk
-@OPT_ENABLE_MPI_WRAP_TRUE@am__append_22 = mpi-overrides.c
+@OPT_BGQ_BACKEND_TRUE@am__append_22 = -I$(srcdir)/utilities/bgq-cnk
 @OPT_ENABLE_MPI_WRAP_TRUE@am__append_23 = mpi-overrides.c
+@OPT_ENABLE_MPI_WRAP_TRUE@am__append_24 = mpi-overrides.c
 
 #-----------------------------------------------------------
 # whirled peas
 #-----------------------------------------------------------
-@HOST_OS_LINUX_TRUE@am__append_24 = $(MY_LINUX_DYNAMIC_FILES)
-@HOST_CPU_MIPS_TRUE@am__append_25 = $(MY_MIPS_FILES)
+@HOST_OS_LINUX_TRUE@am__append_25 = $(MY_LINUX_DYNAMIC_FILES)
 @HOST_CPU_MIPS_TRUE@am__append_26 = $(MY_MIPS_FILES)
-@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_INCLUDE_DIRS)
+@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_FILES)
 @HOST_CPU_MIPS_TRUE@am__append_28 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_29 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_30 = $(MY_MIPS_INCLUDE_DIRS)
@@ -197,15 +197,15 @@ host_triplet = @host@
 @HOST_CPU_MIPS_TRUE@am__append_35 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_36 = $(MY_MIPS_INCLUDE_DIRS)
 @HOST_CPU_MIPS_TRUE@am__append_37 = $(MY_MIPS_INCLUDE_DIRS)
+@HOST_CPU_MIPS_TRUE@am__append_38 = $(MY_MIPS_INCLUDE_DIRS)
 
 # Note: setting CCASFLAGS here is a no-op hack with the side effect of
 # prefixing the tramp.s file names so they will be compiled separately
 # for .o and .so targets.  CFLAGS does this for the .c files, but
 # CFLAGS doesn't apply to .s files.  See the automake docs section
 # 8.3.9.2, Objects created with both libtool and without.
-@HOST_CPU_PPC_TRUE@am__append_38 = $(MY_PPC_FILES)
 @HOST_CPU_PPC_TRUE@am__append_39 = $(MY_PPC_FILES)
-@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_INCLUDE_DIRS)
+@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_FILES)
 @HOST_CPU_PPC_TRUE@am__append_41 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_42 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_43 = $(MY_PPC_INCLUDE_DIRS)
@@ -218,13 +218,13 @@ host_triplet = @host@
 @HOST_CPU_PPC_TRUE@am__append_50 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_51 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_PPC_TRUE@am__append_52 = $(MY_PPC_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_53 = $(MY_X86_FILES)
+@HOST_CPU_PPC_TRUE@am__append_53 = $(MY_PPC_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_54 = $(MY_X86_FILES)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_FILES)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_56 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(XED2_HPCRUN_LIBS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCLINK_LIBS) 
-@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCRUN_LIBS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(XED2_HPCLINK_LIBS) 
 @HOST_CPU_X86_FAMILY_TRUE@am__append_60 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_61 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_62 = $(MY_X86_INCLUDE_DIRS)
@@ -236,9 +236,9 @@ host_triplet = @host@
 @HOST_CPU_X86_FAMILY_TRUE@am__append_68 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_69 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_X86_FAMILY_TRUE@am__append_70 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_IA64_TRUE@am__append_71 = $(MY_IA64_FILES)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_71 = $(MY_X86_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_72 = $(MY_IA64_FILES)
-@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_INCLUDE_DIRS)
+@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_FILES)
 @HOST_CPU_IA64_TRUE@am__append_74 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_75 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_76 = $(MY_IA64_INCLUDE_DIRS)
@@ -249,9 +249,9 @@ host_triplet = @host@
 @HOST_CPU_IA64_TRUE@am__append_81 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_82 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_IA64_TRUE@am__append_83 = $(MY_IA64_INCLUDE_DIRS)
-@HOST_CPU_AARCH64_TRUE@am__append_84 = $(MY_AARCH64_FILES)
+@HOST_CPU_IA64_TRUE@am__append_84 = $(MY_IA64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_85 = $(MY_AARCH64_FILES)
-@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_INCLUDE_DIRS)
+@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_FILES)
 @HOST_CPU_AARCH64_TRUE@am__append_87 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_88 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_89 = $(MY_AARCH64_INCLUDE_DIRS)
@@ -264,49 +264,50 @@ host_triplet = @host@
 @HOST_CPU_AARCH64_TRUE@am__append_96 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_97 = $(MY_AARCH64_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@am__append_98 = $(MY_AARCH64_INCLUDE_DIRS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_99 = $(MY_PAPI_FILES)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(PAPI_INC_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_LD_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = -DHPCRUN_SS_PAPI
-@OPT_ENABLE_CUPTI_TRUE@am__append_103 = $(MY_CUPTI_FILES)
+@HOST_CPU_AARCH64_TRUE@am__append_99 = $(MY_AARCH64_INCLUDE_DIRS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(MY_PAPI_FILES)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_INC_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = $(PAPI_LD_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_103 = -DHPCRUN_SS_PAPI
 @OPT_ENABLE_CUPTI_TRUE@am__append_104 = $(MY_CUPTI_FILES)
-@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(CUPTI_INC_FLGS)
-@OPT_ENABLE_CUPTI_TRUE@am__append_106 = -DHPCRUN_SS_NVIDIA
-@OPT_PAPI_CUPTI_TRUE@am__append_107 = $(CUPTI_INC_FLGS)
-@OPT_PAPI_CUPTI_TRUE@am__append_108 = -DHPCRUN_SS_PAPI_C_CUPTI
-@OPT_PAPI_STATIC_TRUE@am__append_109 = $(MY_PAPI_FILES)
-@OPT_PAPI_STATIC_TRUE@am__append_110 = $(PAPI_INC_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_111 = $(OPT_PAPI_LIBS_STAT)
-@OPT_PAPI_STATIC_TRUE@am__append_112 = -DHPCRUN_SS_PAPI
-@OPT_ENABLE_UPC_TRUE@am__append_113 = $(MY_UPC_FILES)
+@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(MY_CUPTI_FILES)
+@OPT_ENABLE_CUPTI_TRUE@am__append_106 = $(CUPTI_INC_FLGS)
+@OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA
+@OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS)
+@OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI
+@OPT_PAPI_STATIC_TRUE@am__append_110 = $(MY_PAPI_FILES)
+@OPT_PAPI_STATIC_TRUE@am__append_111 = $(PAPI_INC_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_112 = $(OPT_PAPI_LIBS_STAT)
+@OPT_PAPI_STATIC_TRUE@am__append_113 = -DHPCRUN_SS_PAPI
 @OPT_ENABLE_UPC_TRUE@am__append_114 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_115 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_115 = $(MY_UPC_FILES)
 @OPT_ENABLE_UPC_TRUE@am__append_116 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_LDFLAGS)
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_118 = -DLUSH_PTHREADS
+@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_LDFLAGS)
 @OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_119 = -DLUSH_PTHREADS
-@OPT_ENABLE_CUDA_TRUE@am__append_120 = $(MY_CUDA_FILES)
-@OPT_ENABLE_CUDA_TRUE@am__append_121 = -DENABLE_CUDA
-@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(OPT_CUDA_IFLAGS)
-@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(MY_CUDA_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_124 = $(MY_ROCM_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_125 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(OPT_ROCM_IFLAGS)
-@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DHPCRUN_SS_AMD
-@OPT_ENABLE_LEVEL0_TRUE@am__append_128 = $(MY_LEVEL0_FILES)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = -DENABLE_LEVEL0
-@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(OPT_LEVEL0_IFLAGS)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DHPCRUN_SS_LEVEL0
-@OPT_ENABLE_OPENCL_TRUE@am__append_132 = $(MY_OPENCL_FILES)
-@OPT_ENABLE_OPENCL_TRUE@am__append_133 = -DENABLE_OPENCL
-@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(OPT_OPENCL_IFLAGS)
-@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_GTPIN_TRUE@am__append_136 = $(MY_GTPIN_FILES)
-@OPT_ENABLE_GTPIN_TRUE@am__append_137 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
-@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(OPT_GTPIN_IFLAGS)
-@OPT_ENABLE_GTPIN_TRUE@am__append_139 = -DHPCRUN_SS_GTPIN
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_140 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_141 = libagent-pthread.la \
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_120 = -DLUSH_PTHREADS
+@OPT_ENABLE_CUDA_TRUE@am__append_121 = $(MY_CUDA_FILES)
+@OPT_ENABLE_CUDA_TRUE@am__append_122 = -DENABLE_CUDA
+@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUDA_TRUE@am__append_124 = $(MY_CUDA_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_125 = $(MY_ROCM_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_126 = -DENABLE_ROCM
+@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(OPT_ROCM_IFLAGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DHPCRUN_SS_AMD
+@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = $(MY_LEVEL0_FILES)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = -DENABLE_LEVEL0
+@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(OPT_LEVEL0_IFLAGS)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DHPCRUN_SS_LEVEL0
+@OPT_ENABLE_OPENCL_TRUE@am__append_133 = $(MY_OPENCL_FILES)
+@OPT_ENABLE_OPENCL_TRUE@am__append_134 = -DENABLE_OPENCL
+@OPT_ENABLE_OPENCL_TRUE@am__append_135 = $(OPT_OPENCL_IFLAGS)
+@OPT_ENABLE_OPENCL_TRUE@am__append_136 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_GTPIN_TRUE@am__append_137 = $(MY_GTPIN_FILES)
+@OPT_ENABLE_GTPIN_TRUE@am__append_138 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
+@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(OPT_GTPIN_IFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DHPCRUN_SS_GTPIN
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_141 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_142 = libagent-pthread.la \
 @OPT_ENABLE_LUSH_TRUE@	libagent-tbb.la
 subdir = src/tool/hpcrun
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -531,17 +532,18 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	trampoline/aarch64/aarch64-tramp.c \
 	utilities/arch/libunwind/libunwind-context-pc.c \
 	sample-sources/papi.c sample-sources/papi-c-cupti.c \
-	sample-sources/papi-c.c sample-sources/papi-c-extended-info.c \
-	sample-sources/nvidia.c gpu/nvidia/cubin-hash-map.c \
-	gpu/nvidia/cubin-id-map.c gpu/nvidia/cubin-symbols.c \
-	gpu/nvidia/cuda-api.c gpu/nvidia/cuda-device-map.c \
+	sample-sources/papi-c-rocm.c sample-sources/papi-c.c \
+	sample-sources/papi-c-extended-info.c sample-sources/nvidia.c \
+	gpu/nvidia/cubin-hash-map.c gpu/nvidia/cubin-id-map.c \
+	gpu/nvidia/cubin-symbols.c gpu/nvidia/cuda-api.c \
+	gpu/nvidia/cuda-device-map.c \
 	gpu/nvidia/cupti-activity-translate.c \
 	gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
 	sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c gpu/amd/rocm-debug-api.c \
-	gpu/amd/rocm-binary-processing.c gpu/amd/hip-api.c \
-	sample-sources/level0.c gpu/level0/level0-api.c \
+	gpu/amd/rocm-binary-processing.c sample-sources/level0.c \
+	gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-context-map.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -723,14 +725,15 @@ am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
 	utilities/arch/libunwind/libhpcrun_la-libunwind-context-pc.lo
 @HOST_CPU_AARCH64_TRUE@am__objects_27 = $(am__objects_26)
 @OPT_PAPI_CUPTI_TRUE@am__objects_28 = sample-sources/libhpcrun_la-papi-c-cupti.lo
-@OPT_PAPI_COMPONENT_FALSE@am__objects_29 =  \
+@OPT_PAPI_ROCM_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c-rocm.lo
+@OPT_PAPI_COMPONENT_FALSE@am__objects_30 =  \
 @OPT_PAPI_COMPONENT_FALSE@	sample-sources/libhpcrun_la-papi.lo \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_28)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c.lo \
+@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_28) $(am__objects_29)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_30 = sample-sources/libhpcrun_la-papi-c.lo \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/libhpcrun_la-papi-c-extended-info.lo \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_28)
-@OPT_PAPI_DYNAMIC_TRUE@am__objects_30 = $(am__objects_29)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_31 =  \
+@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_28) $(am__objects_29)
+@OPT_PAPI_DYNAMIC_TRUE@am__objects_31 = $(am__objects_30)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_32 =  \
 @OPT_ENABLE_CUPTI_TRUE@	sample-sources/libhpcrun_la-nvidia.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cubin-hash-map.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cubin-id-map.lo \
@@ -741,19 +744,18 @@ am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-analysis.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-api.lo \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_la-cupti-gpu-api.lo
-@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = $(am__objects_31)
-am__objects_33 = sample-sources/libhpcrun_la-upc.lo
-@OPT_ENABLE_UPC_TRUE@am__objects_34 = $(am__objects_33)
-am__objects_35 =
-@OPT_ENABLE_ROCM_TRUE@am__objects_36 =  \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_33 = $(am__objects_32)
+am__objects_34 = sample-sources/libhpcrun_la-upc.lo
+@OPT_ENABLE_UPC_TRUE@am__objects_35 = $(am__objects_34)
+am__objects_36 =
+@OPT_ENABLE_ROCM_TRUE@am__objects_37 =  \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-debug-api.lo \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-binary-processing.lo \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-hip-api.lo
-@OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_38 =  \
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-binary-processing.lo
+@OPT_ENABLE_ROCM_TRUE@am__objects_38 = $(am__objects_37)
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 =  \
 @OPT_ENABLE_LEVEL0_TRUE@	sample-sources/libhpcrun_la-level0.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-api.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-command-list-context-map.lo \
@@ -762,8 +764,8 @@ am__objects_35 =
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-data-node.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-event-map.lo \
 @OPT_ENABLE_LEVEL0_TRUE@	gpu/level0/libhpcrun_la-level0-handle-map.lo
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = $(am__objects_38)
-@OPT_ENABLE_OPENCL_TRUE@am__objects_40 =  \
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_40 = $(am__objects_39)
+@OPT_ENABLE_OPENCL_TRUE@am__objects_41 =  \
 @OPT_ENABLE_OPENCL_TRUE@	sample-sources/libhpcrun_la-opencl.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-api.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
@@ -771,28 +773,28 @@ am__objects_35 =
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-h2d-map.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-queue-map.lo \
 @OPT_ENABLE_OPENCL_TRUE@	gpu/opencl/libhpcrun_la-opencl-context-map.lo
-@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
-@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
+@OPT_ENABLE_OPENCL_TRUE@am__objects_42 = $(am__objects_41)
+@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
 @OPT_ENABLE_GTPIN_TRUE@	gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo
-@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
-am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
+@OPT_ENABLE_GTPIN_TRUE@am__objects_44 = $(am__objects_43)
+am__objects_45 = unwind/common/libhpcrun_la-backtrace.lo \
 	unwind/common/libhpcrun_la-unw-throw.lo
-am__objects_45 = $(am__objects_44) \
+am__objects_46 = $(am__objects_45) \
 	unwind/common/libhpcrun_la-binarytree_uwi.lo \
 	unwind/common/libhpcrun_la-interval_t.lo \
 	unwind/common/libhpcrun_la-libunw_intervals.lo \
 	unwind/common/libhpcrun_la-stack_troll.lo \
 	unwind/common/libhpcrun_la-uw_hash.lo \
 	unwind/common/libhpcrun_la-uw_recipe_map.lo
-am__objects_46 = $(am__objects_45) \
+am__objects_47 = $(am__objects_46) \
 	unwind/generic-libunwind/libhpcrun_la-libunw-unwind.lo \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_47 = $(am__objects_45) \
+am__objects_48 = $(am__objects_46) \
 	unwind/ppc64/libhpcrun_la-ppc64-unwind.lo \
 	unwind/ppc64/libhpcrun_la-ppc64-unwind-interval.lo \
 	unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_48 = $(am__objects_45) \
+am__objects_49 = $(am__objects_46) \
 	unwind/x86-family/libhpcrun_la-x86-all.lo \
 	unwind/x86-family/libhpcrun_la-amd-xop.lo \
 	unwind/x86-family/libhpcrun_la-x86-cold-path.lo \
@@ -812,15 +814,15 @@ am__objects_48 = $(am__objects_45) \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-32bit-icc-variant.lo \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-fail-intervals.lo \
 	unwind/x86-family/manual-intervals/libhpcrun_la-x86-pgi-mp_pexit.lo
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_49 = $(am__objects_48)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_49 = $(am__objects_47)
-@UNW_LIBUNW_TRUE@am__objects_49 = $(am__objects_46)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_50 = $(am__objects_49)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_50 = $(am__objects_48)
+@UNW_LIBUNW_TRUE@am__objects_50 = $(am__objects_47)
 am_libhpcrun_la_OBJECTS = $(am__objects_14) $(am__objects_15) \
 	$(am__objects_17) $(am__objects_19) $(am__objects_21) \
 	$(am__objects_23) $(am__objects_25) $(am__objects_27) \
-	$(am__objects_30) $(am__objects_32) $(am__objects_34) \
-	$(am__objects_35) $(am__objects_37) $(am__objects_39) \
-	$(am__objects_41) $(am__objects_43) $(am__objects_49) \
+	$(am__objects_31) $(am__objects_33) $(am__objects_35) \
+	$(am__objects_36) $(am__objects_38) $(am__objects_40) \
+	$(am__objects_42) $(am__objects_44) $(am__objects_50) \
 	utilities/libhpcrun_la-last_func.lo
 libhpcrun_la_OBJECTS = $(am_libhpcrun_la_OBJECTS)
 libhpcrun_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
@@ -1027,12 +1029,13 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/nvidia/cupti-activity-translate.c \
 	gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/papi.c \
-	sample-sources/papi-c-cupti.c sample-sources/papi-c.c \
-	sample-sources/papi-c-extended-info.c sample-sources/upc.c \
-	unwind/common/backtrace.c unwind/common/unw-throw.c \
-	unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
-	unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
-	unwind/common/uw_hash.c unwind/common/uw_recipe_map.c \
+	sample-sources/papi-c-cupti.c sample-sources/papi-c-rocm.c \
+	sample-sources/papi-c.c sample-sources/papi-c-extended-info.c \
+	sample-sources/upc.c unwind/common/backtrace.c \
+	unwind/common/unw-throw.c unwind/common/binarytree_uwi.c \
+	unwind/common/interval_t.c unwind/common/libunw_intervals.c \
+	unwind/common/stack_troll.c unwind/common/uw_hash.c \
+	unwind/common/uw_recipe_map.c \
 	unwind/generic-libunwind/libunw-unwind.c \
 	unwind/ppc64/ppc64-unwind.c \
 	unwind/ppc64/ppc64-unwind-interval.c \
@@ -1055,19 +1058,19 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	unwind/x86-family/manual-intervals/x86-fail-intervals.c \
 	unwind/x86-family/manual-intervals/x86-pgi-mp_pexit.c \
 	utilities/last_func.c
-@HOST_CPU_PPC_TRUE@am__objects_50 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
-@HOST_CPU_PPC_FALSE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
+@HOST_CPU_PPC_TRUE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
+@HOST_CPU_PPC_FALSE@am__objects_52 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-linux_perf.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_event_open.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf-util.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_mmap.$(OBJEXT) \
 @OPT_ENABLE_PERF_EVENT_TRUE@	sample-sources/perf/libhpcrun_o-perf_skid.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_55 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
-am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_55 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_57 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
+am__objects_58 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	libhpcrun_o-main.$(OBJEXT) libhpcrun_o-disabled.$(OBJEXT) \
 	libhpcrun_o-closure-registry.$(OBJEXT) \
 	libhpcrun_o-cct_insert_backtrace.$(OBJEXT) \
@@ -1187,28 +1190,28 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	utilities/libhpcrun_o-line_wrapping.$(OBJEXT) \
 	utilities/libhpcrun_o-timer.$(OBJEXT) \
 	utilities/libhpcrun_o-tokenize.$(OBJEXT) \
-	utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_50) \
-	$(am__objects_51) $(am__objects_52) $(am__objects_53) \
-	$(am__objects_54) $(am__objects_55) $(am__objects_56)
-am__objects_58 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
+	utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_51) \
+	$(am__objects_52) $(am__objects_53) $(am__objects_54) \
+	$(am__objects_55) $(am__objects_56) $(am__objects_57)
+am__objects_59 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
 	libhpcrun_o-custom-init-static.$(OBJEXT)
-am__objects_59 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-@HOST_CPU_MIPS_TRUE@am__objects_60 = $(am__objects_59)
-am__objects_61 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
+am__objects_60 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
+@HOST_CPU_MIPS_TRUE@am__objects_61 = $(am__objects_60)
+am__objects_62 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
 	utilities/arch/ppc64/libhpcrun_o-ppc64-context-pc.$(OBJEXT)
-@HOST_CPU_PPC_TRUE@am__objects_62 = $(am__objects_61)
-am__objects_63 =  \
+@HOST_CPU_PPC_TRUE@am__objects_63 = $(am__objects_62)
+am__objects_64 =  \
 	trampoline/x86-family/libhpcrun_o-x86-tramp.$(OBJEXT) \
 	utilities/arch/x86-family/libhpcrun_o-x86-context-pc.$(OBJEXT)
-@HOST_CPU_X86_FAMILY_TRUE@am__objects_64 = $(am__objects_63)
-am__objects_65 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
+@HOST_CPU_X86_FAMILY_TRUE@am__objects_65 = $(am__objects_64)
+am__objects_66 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
 	utilities/arch/ia64/libhpcrun_o-ia64-context-pc.$(OBJEXT)
-@HOST_CPU_IA64_TRUE@am__objects_66 = $(am__objects_65)
-am__objects_67 =  \
+@HOST_CPU_IA64_TRUE@am__objects_67 = $(am__objects_66)
+am__objects_68 =  \
 	trampoline/aarch64/libhpcrun_o-aarch64-tramp.$(OBJEXT) \
 	utilities/arch/libunwind/libhpcrun_o-libunwind-context-pc.$(OBJEXT)
-@HOST_CPU_AARCH64_TRUE@am__objects_68 = $(am__objects_67)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
+@HOST_CPU_AARCH64_TRUE@am__objects_69 = $(am__objects_68)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-hash-map.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-id-map.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cubin-symbols.$(OBJEXT) \
@@ -1218,33 +1221,34 @@ am__objects_67 =  \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-analysis.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-api.$(OBJEXT) \
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/libhpcrun_o-cupti-gpu-api.$(OBJEXT)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = $(am__objects_69)
-@OPT_PAPI_CUPTI_TRUE@am__objects_71 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
-@OPT_PAPI_COMPONENT_FALSE@am__objects_72 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_71)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_71 = $(am__objects_70)
+@OPT_PAPI_CUPTI_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
+@OPT_PAPI_ROCM_TRUE@am__objects_73 = sample-sources/libhpcrun_o-papi-c-rocm.$(OBJEXT)
+@OPT_PAPI_COMPONENT_FALSE@am__objects_74 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_FALSE@	$(am__objects_72) $(am__objects_73)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_74 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/libhpcrun_o-papi-c-extended-info.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_71)
-@OPT_PAPI_STATIC_TRUE@am__objects_73 = $(am__objects_72)
-am__objects_74 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
-@OPT_ENABLE_UPC_TRUE@am__objects_75 = $(am__objects_74)
-am__objects_76 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_TRUE@	$(am__objects_72) $(am__objects_73)
+@OPT_PAPI_STATIC_TRUE@am__objects_75 = $(am__objects_74)
+am__objects_76 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
+@OPT_ENABLE_UPC_TRUE@am__objects_77 = $(am__objects_76)
+am__objects_78 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
 	unwind/common/libhpcrun_o-unw-throw.$(OBJEXT)
-am__objects_77 = $(am__objects_76) \
+am__objects_79 = $(am__objects_78) \
 	unwind/common/libhpcrun_o-binarytree_uwi.$(OBJEXT) \
 	unwind/common/libhpcrun_o-interval_t.$(OBJEXT) \
 	unwind/common/libhpcrun_o-libunw_intervals.$(OBJEXT) \
 	unwind/common/libhpcrun_o-stack_troll.$(OBJEXT) \
 	unwind/common/libhpcrun_o-uw_hash.$(OBJEXT) \
 	unwind/common/libhpcrun_o-uw_recipe_map.$(OBJEXT)
-am__objects_78 = $(am__objects_77) \
+am__objects_80 = $(am__objects_79) \
 	unwind/generic-libunwind/libhpcrun_o-libunw-unwind.$(OBJEXT) \
 	unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_79 = $(am__objects_77) \
+am__objects_81 = $(am__objects_79) \
 	unwind/ppc64/libhpcrun_o-ppc64-unwind.$(OBJEXT) \
 	unwind/ppc64/libhpcrun_o-ppc64-unwind-interval.$(OBJEXT) \
 	unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_80 = $(am__objects_77) \
+am__objects_82 = $(am__objects_79) \
 	unwind/x86-family/libhpcrun_o-x86-all.$(OBJEXT) \
 	unwind/x86-family/libhpcrun_o-amd-xop.$(OBJEXT) \
 	unwind/x86-family/libhpcrun_o-x86-cold-path.$(OBJEXT) \
@@ -1264,14 +1268,14 @@ am__objects_80 = $(am__objects_77) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-32bit-icc-variant.$(OBJEXT) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-fail-intervals.$(OBJEXT) \
 	unwind/x86-family/manual-intervals/libhpcrun_o-x86-pgi-mp_pexit.$(OBJEXT)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_81 = $(am__objects_80)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_81 = $(am__objects_79)
-@UNW_LIBUNW_TRUE@am__objects_81 = $(am__objects_78)
-am_libhpcrun_o_OBJECTS = $(am__objects_57) $(am__objects_58) \
-	$(am__objects_60) $(am__objects_62) $(am__objects_64) \
-	$(am__objects_66) $(am__objects_68) $(am__objects_70) \
-	$(am__objects_73) $(am__objects_75) $(am__objects_35) \
-	$(am__objects_81) utilities/libhpcrun_o-last_func.$(OBJEXT)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_83 = $(am__objects_82)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_83 = $(am__objects_81)
+@UNW_LIBUNW_TRUE@am__objects_83 = $(am__objects_80)
+am_libhpcrun_o_OBJECTS = $(am__objects_58) $(am__objects_59) \
+	$(am__objects_61) $(am__objects_63) $(am__objects_65) \
+	$(am__objects_67) $(am__objects_69) $(am__objects_71) \
+	$(am__objects_75) $(am__objects_77) $(am__objects_36) \
+	$(am__objects_83) utilities/libhpcrun_o-last_func.$(OBJEXT)
 libhpcrun_o_OBJECTS = $(am_libhpcrun_o_OBJECTS)
 @HOST_CPU_X86_FAMILY_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1)
 @OPT_PAPI_STATIC_TRUE@am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1)
@@ -1777,10 +1781,10 @@ bin_SCRIPTS = $(am__append_4) $(am__append_6)
 pkglibexec_SCRIPTS = $(am__append_1)
 include_HEADERS = $(am__append_2)
 pkglib_LIBRARIES = $(am__append_5)
-pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_17) \
-	$(am__append_18) $(am__append_140) $(am__append_141)
-BUILT_SOURCES = $(am__append_22)
-CLEANFILES = $(am__append_23)
+pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_18) \
+	$(am__append_19) $(am__append_141) $(am__append_142)
+BUILT_SOURCES = $(am__append_23)
+CLEANFILES = $(am__append_24)
 @OPT_ENABLE_HPCRUN_DYNAMIC_TRUE@noinst_LTLIBRARIES = libhpcrun.la
 PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@ 
 PAPI_LD_FLGS = @OPT_PAPI_LDFLAGS@
@@ -1867,10 +1871,10 @@ UNW_MIPS_INCLUDE_DIRS = \
 
 UNW_MIPS_LD_FLAGS = 
 MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
-	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_19) \
-	$(am__append_102) $(am__append_106) $(am__append_108) \
-	$(am__append_112) $(am__append_127) $(am__append_131) \
-	$(am__append_135) $(am__append_139)
+	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_20) \
+	$(am__append_103) $(am__append_107) $(am__append_109) \
+	$(am__append_113) $(am__append_128) $(am__append_132) \
+	$(am__append_136) $(am__append_140)
 MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -1965,10 +1969,10 @@ MY_AARCH64_FILES = \
 	utilities/arch/libunwind/libunwind-context-pc.c
 
 @OPT_PAPI_COMPONENT_FALSE@MY_PAPI_FILES = sample-sources/papi.c \
-@OPT_PAPI_COMPONENT_FALSE@	$(am__append_16)
+@OPT_PAPI_COMPONENT_FALSE@	$(am__append_16) $(am__append_17)
 @OPT_PAPI_COMPONENT_TRUE@MY_PAPI_FILES = sample-sources/papi-c.c \
 @OPT_PAPI_COMPONENT_TRUE@	sample-sources/papi-c-extended-info.c \
-@OPT_PAPI_COMPONENT_TRUE@	$(am__append_16)
+@OPT_PAPI_COMPONENT_TRUE@	$(am__append_16) $(am__append_17)
 @OPT_ENABLE_CUPTI_TRUE@MY_CUPTI_FILES = sample-sources/nvidia.c	\
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cubin-hash-map.c		\
 @OPT_ENABLE_CUPTI_TRUE@	gpu/nvidia/cubin-id-map.c		\
@@ -2002,8 +2006,7 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c 	\
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-debug-api.c \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-binary-processing.c \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/hip-api.c
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-binary-processing.c 
 
 @OPT_ENABLE_LEVEL0_TRUE@MY_LEVEL0_FILES = \
 @OPT_ENABLE_LEVEL0_TRUE@	sample-sources/level0.c \
@@ -2054,11 +2057,11 @@ MY_AARCH64_INCLUDE_DIRS = \
 	-I$(srcdir)/utilities/arch/aarch64
 
 libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \
-	$(am__append_24) $(am__append_25) $(am__append_38) \
-	$(am__append_53) $(am__append_71) $(am__append_84) \
-	$(am__append_99) $(am__append_103) $(am__append_113) \
-	$(am__append_120) $(am__append_124) $(am__append_128) \
-	$(am__append_132) $(am__append_136) $(UNW_SOURCE_FILES) \
+	$(am__append_25) $(am__append_26) $(am__append_39) \
+	$(am__append_54) $(am__append_72) $(am__append_85) \
+	$(am__append_100) $(am__append_104) $(am__append_114) \
+	$(am__append_121) $(am__append_125) $(am__append_129) \
+	$(am__append_133) $(am__append_137) $(UNW_SOURCE_FILES) \
 	utilities/last_func.c
 libhpcrun_fake_audit_la_SOURCES = \
 	audit/fake-auditor.c
@@ -2067,9 +2070,9 @@ libhpcrun_audit_la_SOURCES = \
 	audit/auditor.c
 
 libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \
-	$(am__append_26) $(am__append_39) $(am__append_54) \
-	$(am__append_72) $(am__append_85) $(am__append_104) \
-	$(am__append_109) $(am__append_114) $(am__append_123) \
+	$(am__append_27) $(am__append_40) $(am__append_55) \
+	$(am__append_73) $(am__append_86) $(am__append_105) \
+	$(am__append_110) $(am__append_115) $(am__append_124) \
 	$(UNW_SOURCE_FILES) utilities/last_func.c
 libhpcrun_wrap_a_SOURCES = \
 	monitor-exts/openmp.c
@@ -2114,12 +2117,12 @@ libhpctoolkit_a_SOURCES = \
 # cppflags
 #-----------------------------------------------------------
 libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_20) $(am__append_27) $(am__append_40) \
-	$(am__append_55) $(am__append_73) $(am__append_86) \
-	$(am__append_100) $(am__append_105) $(am__append_107) \
-	$(am__append_115) $(am__append_118) $(am__append_121) \
-	$(am__append_125) $(am__append_129) $(am__append_133) \
-	$(am__append_137) $(UNW_INCLUDE_DIRS)
+	$(am__append_21) $(am__append_28) $(am__append_41) \
+	$(am__append_56) $(am__append_74) $(am__append_87) \
+	$(am__append_101) $(am__append_106) $(am__append_108) \
+	$(am__append_116) $(am__append_119) $(am__append_122) \
+	$(am__append_126) $(am__append_130) $(am__append_134) \
+	$(am__append_138) $(UNW_INCLUDE_DIRS)
 libhpcrun_fake_audit_la_CPPFLAGS = \
 	$(MY_CPP_DEFINES)		\
 	$(MY_INCLUDE_DIRS)
@@ -2129,51 +2132,51 @@ libhpcrun_audit_la_CPPFLAGS = \
 	$(MY_INCLUDE_DIRS)
 
 libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_21) $(am__append_28) \
-	$(am__append_41) $(am__append_56) $(am__append_74) \
-	$(am__append_87) $(am__append_110) $(am__append_116) \
-	$(am__append_119) $(UNW_INCLUDE_DIRS)
+	$(MY_INCLUDE_DIRS) $(am__append_22) $(am__append_29) \
+	$(am__append_42) $(am__append_57) $(am__append_75) \
+	$(am__append_88) $(am__append_111) $(am__append_117) \
+	$(am__append_120) $(UNW_INCLUDE_DIRS)
 libhpcrun_wrap_a_CPPFLAGS = \
 	-DHPCRUN_STATIC_LINK		\
 	$(MY_CPP_DEFINES)		\
 	$(MY_INCLUDE_DIRS)
 
 libhpcrun_ga_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_29) $(am__append_42) $(am__append_59) \
-	$(am__append_75) $(am__append_88) $(UNW_INCLUDE_DIRS)
+	$(am__append_30) $(am__append_43) $(am__append_60) \
+	$(am__append_76) $(am__append_89) $(UNW_INCLUDE_DIRS)
 libhpcrun_ga_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_30) $(am__append_43) \
-	$(am__append_60) $(am__append_76) $(am__append_89) \
+	$(MY_INCLUDE_DIRS) $(am__append_31) $(am__append_44) \
+	$(am__append_61) $(am__append_77) $(am__append_90) \
 	$(UNW_INCLUDE_DIRS)
 libhpcrun_gprof_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_44) $(am__append_61) $(am__append_90)
+	$(am__append_45) $(am__append_62) $(am__append_91)
 libhpcrun_gprof_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_45) \
-	$(am__append_62) $(am__append_91)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_46) \
+	$(am__append_63) $(am__append_92)
 libhpcrun_io_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_31) $(am__append_46) $(am__append_63) \
-	$(am__append_77) $(am__append_92) $(UNW_INCLUDE_DIRS)
+	$(am__append_32) $(am__append_47) $(am__append_64) \
+	$(am__append_78) $(am__append_93) $(UNW_INCLUDE_DIRS)
 libhpcrun_io_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
-	$(MY_INCLUDE_DIRS) $(am__append_32) $(am__append_47) \
-	$(am__append_64) $(am__append_78) $(am__append_93) \
+	$(MY_INCLUDE_DIRS) $(am__append_33) $(am__append_48) \
+	$(am__append_65) $(am__append_79) $(am__append_94) \
 	$(UNW_INCLUDE_DIRS)
 libhpcrun_memleak_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_33) $(am__append_48) $(am__append_65) \
-	$(am__append_79) $(am__append_94) $(UNW_INCLUDE_DIRS)
+	$(am__append_34) $(am__append_49) $(am__append_66) \
+	$(am__append_80) $(am__append_95) $(UNW_INCLUDE_DIRS)
 libhpcrun_memleak_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_34) \
-	$(am__append_49) $(am__append_66) $(am__append_80) \
-	$(am__append_95) $(UNW_INCLUDE_DIRS)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_35) \
+	$(am__append_50) $(am__append_67) $(am__append_81) \
+	$(am__append_96) $(UNW_INCLUDE_DIRS)
 libhpcrun_pthread_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
-	$(am__append_35) $(am__append_50) $(am__append_67) \
-	$(am__append_81) $(am__append_96) $(UNW_INCLUDE_DIRS)
+	$(am__append_36) $(am__append_51) $(am__append_68) \
+	$(am__append_82) $(am__append_97) $(UNW_INCLUDE_DIRS)
 libhpcrun_pthread_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
-	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_36) \
-	$(am__append_51) $(am__append_68) $(am__append_82) \
-	$(am__append_97) $(UNW_INCLUDE_DIRS)
+	$(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_37) \
+	$(am__append_52) $(am__append_69) $(am__append_83) \
+	$(am__append_98) $(UNW_INCLUDE_DIRS)
 libhpcrun_mpi_la_CPPFLAGS = $(MY_CPP_DEFINES) -I$(MPI_INC) \
-	$(MY_INCLUDE_DIRS) $(am__append_37) $(am__append_52) \
-	$(am__append_69) $(am__append_83) $(am__append_98) \
+	$(MY_INCLUDE_DIRS) $(am__append_38) $(am__append_53) \
+	$(am__append_70) $(am__append_84) $(am__append_99) \
 	$(UNW_INCLUDE_DIRS)
 libhpctoolkit_la_CPPFLAGS = \
 	$(MY_CPP_DEFINES)		\
@@ -2189,8 +2192,8 @@ libhpctoolkit_a_CPPFLAGS = \
 # cflags
 #-----------------------------------------------------------
 libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
-	$(am__append_122) $(am__append_126) $(am__append_130) \
-	$(am__append_134) $(am__append_138) $(GOTCHA_IFLAGS)
+	$(am__append_123) $(am__append_127) $(am__append_131) \
+	$(am__append_135) $(am__append_139) $(GOTCHA_IFLAGS)
 libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
 libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2222,8 +2225,8 @@ OUR_LIBUNWIND_A = $(top_builddir)/src/extern/libunwind/libunwind.a
 OUR_LZMA_A = $(top_builddir)/src/extern/lzma/liblzma.a
 libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \
 	-lpthread -lrt -L$(LIBELF_LIB) -lelf $(PERFMON_LDFLAGS_DYN) \
-	$(OPT_ROCM_LDFLAGS) $(am__append_57) $(am__append_101) \
-	$(am__append_117) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
+	$(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \
+	$(am__append_118) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
 libhpcrun_fake_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
 libhpcrun_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
 libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic
@@ -2249,9 +2252,9 @@ libhpcrun_la_LIBADD = \
 
 libhpcrun_o_LDADD = $(PROF_LEAN_A) $(SUPPORT_LEAN_A) \
 	$(PERFMON_LDFLAGS_STAT) $(MBEDTLS_LIBS) $(OUR_LIBUNWIND_A) \
-	$(OUR_LZMA_A) $(am__append_58) $(am__append_111) \
+	$(OUR_LZMA_A) $(am__append_59) $(am__append_112) \
 	$(UNW_STATIC_LD_FLAGS)
-MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_70) \
+MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \
 	$(UNW_INCLUDE_DIRS)
 @HOST_CPU_AARCH64_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
 @HOST_CPU_PPC_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
@@ -2956,6 +2959,9 @@ sample-sources/libhpcrun_la-papi.lo: sample-sources/$(am__dirstamp) \
 sample-sources/libhpcrun_la-papi-c-cupti.lo:  \
 	sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-papi-c-rocm.lo:  \
+	sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
 sample-sources/libhpcrun_la-papi-c.lo: sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
 sample-sources/libhpcrun_la-papi-c-extended-info.lo:  \
@@ -3007,8 +3013,6 @@ gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/$(am__dirstamp) \
 	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-rocm-binary-processing.lo:  \
 	gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
-gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/$(am__dirstamp) \
-	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 sample-sources/libhpcrun_la-level0.lo: sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
 gpu/level0/$(am__dirstamp):
@@ -3579,6 +3583,9 @@ sample-sources/libhpcrun_o-papi.$(OBJEXT):  \
 sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT):  \
 	sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_o-papi-c-rocm.$(OBJEXT):  \
+	sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
 sample-sources/libhpcrun_o-papi-c.$(OBJEXT):  \
 	sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
@@ -4000,7 +4007,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
@@ -4116,6 +4122,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-cupti.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-extended-info.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-pthread-blame.Plo@am__quote@
@@ -4138,6 +4145,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-mutex.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-cupti.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-extended-info.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-pthread-blame.Po@am__quote@
@@ -5551,6 +5559,13 @@ sample-sources/libhpcrun_la-papi-c-cupti.lo: sample-sources/papi-c-cupti.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-papi-c-cupti.lo `test -f 'sample-sources/papi-c-cupti.c' || echo '$(srcdir)/'`sample-sources/papi-c-cupti.c
 
+sample-sources/libhpcrun_la-papi-c-rocm.lo: sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-papi-c-rocm.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_la-papi-c-rocm.lo `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_la-papi-c-rocm.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-papi-c-rocm.lo `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+
 sample-sources/libhpcrun_la-papi-c.lo: sample-sources/papi-c.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-papi-c.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Tpo -c -o sample-sources/libhpcrun_la-papi-c.lo `test -f 'sample-sources/papi-c.c' || echo '$(srcdir)/'`sample-sources/papi-c.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo
@@ -5677,13 +5692,6 @@ gpu/amd/libhpcrun_la-rocm-binary-processing.lo: gpu/amd/rocm-binary-processing.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocm-binary-processing.lo `test -f 'gpu/amd/rocm-binary-processing.c' || echo '$(srcdir)/'`gpu/amd/rocm-binary-processing.c
 
-gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/hip-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-hip-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/amd/hip-api.c' object='gpu/amd/libhpcrun_la-hip-api.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
-
 sample-sources/libhpcrun_la-level0.lo: sample-sources/level0.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-level0.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-level0.Tpo -c -o sample-sources/libhpcrun_la-level0.lo `test -f 'sample-sources/level0.c' || echo '$(srcdir)/'`sample-sources/level0.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-level0.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-level0.Plo
@@ -8323,6 +8331,20 @@ sample-sources/libhpcrun_o-papi-c-cupti.obj: sample-sources/papi-c-cupti.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-cupti.obj `if test -f 'sample-sources/papi-c-cupti.c'; then $(CYGPATH_W) 'sample-sources/papi-c-cupti.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-cupti.c'; fi`
 
+sample-sources/libhpcrun_o-papi-c-rocm.o: sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c-rocm.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_o-papi-c-rocm.o `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_o-papi-c-rocm.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-rocm.o `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+
+sample-sources/libhpcrun_o-papi-c-rocm.obj: sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c-rocm.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_o-papi-c-rocm.obj `if test -f 'sample-sources/papi-c-rocm.c'; then $(CYGPATH_W) 'sample-sources/papi-c-rocm.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-rocm.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_o-papi-c-rocm.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-rocm.obj `if test -f 'sample-sources/papi-c-rocm.c'; then $(CYGPATH_W) 'sample-sources/papi-c-rocm.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-rocm.c'; fi`
+
 sample-sources/libhpcrun_o-papi-c.o: sample-sources/papi-c.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Tpo -c -o sample-sources/libhpcrun_o-papi-c.o `test -f 'sample-sources/papi-c.c' || echo '$(srcdir)/'`sample-sources/papi-c.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index cfffa8512d..1530dedf63 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -48,7 +48,7 @@
 #include "roctracer-api.h"
 #include "roctracer-activity-translate.h"
 
-#include "hip-api.h"
+// #include "hip-api.h"
 #include "rocm-debug-api.h"
 #include "rocm-binary-processing.h"
 #include "tool_state.h"
@@ -450,13 +450,11 @@ roctracer_subscriber_callback
     uint64_t cpu_submit_time = hpcrun_nanotime();
 
     PRINT("\nACTIVITY_API_PHASE_ENTER -----------------| cct = %p \n", api_node);
-    int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
     gpu_monitors_apply(api_node, gpu_monitor_type_enter);
 
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
     PRINT("\nACTIVITY_API_PHASE_EXIT -----------------| \n");
-    int (*hip_gpu_sync_ptr)(void) = hip_dev_sync;
     gpu_monitors_apply(NULL, gpu_monitor_type_exit);
 
   }else{
diff --git a/src/tool/hpcrun/messages/debug-flag.c b/src/tool/hpcrun/messages/debug-flag.c
index a300554a61..fe416711c6 100644
--- a/src/tool/hpcrun/messages/debug-flag.c
+++ b/src/tool/hpcrun/messages/debug-flag.c
@@ -159,7 +159,7 @@ static pmsg_category all_list_entries [] = {
  // E(CSP_MALLOC),
  // E(MEM__ALLOC),
  E(NORM_IP),
- E(PARTIAL_UNW)
+ E(PARTIAL_UNW) 
 };
 
 
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index bb5fc402fe..bc84140db0 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -43,7 +43,7 @@
 #include <hpcrun/control-knob.h>
 #include <hpcrun/device-finalizers.h>
 #include <hpcrun/gpu/amd/roctracer-api.h>
-#include <hpcrun/gpu/amd/hip-api.h>
+// #include <hpcrun/gpu/amd/hip-api.h>
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-trace.h>
@@ -152,12 +152,12 @@ METHOD_FN(process_event_list, int lush_metrics)
     TMSG(CUDA,"nevents = %d", nevents);
 
 
-#ifndef HPCRUN_STATIC_LINK
-  if (hip_bind()) {
-    EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror());
-    monitor_real_exit(-1);
-  }
-#endif
+// #ifndef HPCRUN_STATIC_LINK
+//   if (hip_bind()) {
+//     EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror());
+//     monitor_real_exit(-1);
+//   }
+// #endif
 }
 
 static void
diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
index d2621ed7f6..92120ef7a2 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-rocm.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
@@ -1,387 +1,201 @@
-// ******************* System Includes ********************
-#include <ucontext.h> 
-#include <dlfcn.h>
+// -*-Mode: C++;-*- // technically C99
 
-#include <stdbool.h>
-#include <string.h>
-#include <stdint.h>
-// *********************************************************
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
 
+//***************************************************************************
+//
+// File:
+//   papi-c-rocm.c
+//
+// Purpose:
+//   implementation of wrapper around AMD's ROCm performance tools API
+//
+//***************************************************************************
 
-// ******************** PAPI *******************************
-#include <papi.h>
-// *********************************************************
+//***************************************************************************
+// system includes
+//***************************************************************************
 
-// ******************** MONITOR *******************************
+#include <papi.h>
 #include <monitor.h>
-// *********************************************************
 
-// ******************** GPU includes ***********************
-#include <hip-api.h>
 
-// *********************************************************
 
-// ******* HPCToolkit Includes *********************************
-#include <lib/prof-lean/spinlock.h>
+//***************************************************************************
+// local includes
+//***************************************************************************
 
-#include <hpcrun/thread_data.h>
 #include <messages/messages.h>
-#include <hpcrun/sample_event.h>
-#include <hpcrun/safe-sampling.h>
-#include <hpcrun/sample_sources_all.h>
-#include <sample-sources/common.h>
 #include <sample-sources/ss-obj-name.h>
-// *********************************************************
-
-// ******** local includes ***********
 #include "papi-c.h"
 #include "papi-c-extended-info.h"
-// ***********************************
-
-// ****************** Convenience macros *******************
-
-#define CUPTI_LAUNCH_CALLBACK_DEPTH 7
-
-#define Cupti_call(fn, ...)                                    \
-{                                                              \
-  int ret = fn(__VA_ARGS__);                                   \
-  if (ret != CUPTI_SUCCESS) {                                  \
-    const char* errstr;                                        \
-    dcuptiGetResultString(ret, &errstr);                        \
-    hpcrun_abort("error: CUDA/CUPTI API "                      \
-                 #fn " failed w error code %d ==> '%s'\n",     \
-                 ret, errstr);                                 \
-  }                                                            \
-}
-
-#define Cupti_call_silent(fn, ...)                             \
-{                                                              \
-  (void) fn(__VA_ARGS__);                                      \
-}
-
-#define Chk_dlopen(v, lib, flags)                     \
-  void* v = monitor_real_dlopen(lib, flags);          \
-  if (! v) {                                          \
-    fprintf(stderr, "gpu dlopen %s failed\n", lib);   \
-    return;                                           \
-  }                                                   \
-
-#define Chk_dlsym(h, fn) {                                \
-  dlerror();                                              \
-  d ## fn = dlsym(h, #fn);                                \
-  char* e = dlerror();                                    \
-  if (e) {                                                \
-    fprintf(stderr, "dlsym(%s) fails w '%s'\n", #fn, e);  \
-    return;                                               \
-  }                                                       \
-}
-// ***********************************************************
-
-typedef struct {
-  int nevents;
-  int event_set;
-  sample_source_t* self;
-} papi_cuda_data_t;
-
-static bool event_set_created = false;
-static bool event_set_finalized = false;
-
-static papi_cuda_data_t local = {};
 
-static spinlock_t cupti_lock = SPINLOCK_UNLOCKED;
-static spinlock_t setup_lock = SPINLOCK_UNLOCKED;
 
-// ******************** cuda/cupti functions ***********************
-// Some cuda/cupti functions must not be wrapped! So, we fetch them via dlopen.
-// NOTE: naming convention is to prepend the letter "d" to the actual function
-// The indirect functions are below.
-//
-cudaError_t (*dcudaThreadSynchronize)(void);
-
-CUptiResult (*dcuptiGetResultString)(CUptiResult result, const char** str); 
-
-CUptiResult (*dcuptiSubscribe)(CUpti_SubscriberHandle* subscriber,
-                               CUpti_CallbackFunc callback, 
-                               void* userdata);
 
-CUptiResult (*dcuptiEnableCallback)(uint32_t enable,
-                                    CUpti_SubscriberHandle subscriber, 
-                                    CUpti_CallbackDomain domain,
-                                    CUpti_CallbackId cbid);
+//******************************************************************************
+// static data
+//******************************************************************************
 
-CUptiResult (*dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber); 
+static __thread bool event_set_created = false;
+static __thread bool event_set_finalized = false;
+static __thread int my_event_set = PAPI_NULL;
 
 
-// *****************************************************************
-typedef struct cuda_callback_t {
-  sample_source_t* ss;
-  int event_set;
-} cuda_callback_t;
-
-//
-// populate the cuda/cupti functions via dlopen
-//
 
-static void
-dlgpu(void)
-{
-  // only use dlfunctions in NON static case
-#ifndef HPCRUN_STATIC_LINK
-  Chk_dlopen(cudart, "libcudart.so", RTLD_NOW | RTLD_GLOBAL);
-  Chk_dlsym(cudart, cudaThreadSynchronize);
-
-  Chk_dlopen(cupti, "libcupti.so", RTLD_NOW | RTLD_GLOBAL);
-  Chk_dlsym(cupti, cuptiGetResultString);
-  Chk_dlsym(cupti, cuptiSubscribe);
-  Chk_dlsym(cupti, cuptiEnableCallback);
-  Chk_dlsym(cupti, cuptiUnsubscribe);
-#endif // ! HPCRUN_STATIC_LINK
-}
+//******************************************************************************
+// private operations
+//******************************************************************************
 
-//
-// noop routine
-//
 static void
 papi_c_no_action(void)
 {
   ;
 }
 
-//
-// Predicate to determine if this component is being referenced
-//
+
 static bool
 is_papi_c_rocm(const char* name)
 {
   return strstr(name, "rocm") == name;
 }
 
-static void CUPTIAPI
-hpcrun_cuda_kernel_callback(void* userdata,
-			    CUpti_CallbackDomain domain,
-			    CUpti_CallbackId cbid, 
-			    const CUpti_CallbackData* cbInfo)
-{
-  TMSG(CUDA, "Got Kernel Callback");
-
-  papi_cuda_data_t* cuda_data = userdata;
-  int nevents = cuda_data->nevents;
-  int cudaEventSet = cuda_data->event_set;
-  sample_source_t* self = cuda_data->self;
-
-
-  TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet);
 
-  // This callback is enabled only for kernel launch; anything else is an error.
-  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
-    hpcrun_abort("CUDA CUPTI callback seen for unexpected "
-		 "interface operation: callback id  %d\n", cbid); 
-  }
-
-  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-    TMSG(CUDA, "Cupti API -ENTER- portion");
-    // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize
-      // exclusive access to launcher
-    spinlock_lock(&cupti_lock);
-    TMSG(CUPTI, "-ACQ-lock");
-    dcudaThreadSynchronize();
-
-    TMSG(CUPTI,"-- PRE launch callback");
-    TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet);
-    int ret = PAPI_start(cudaEventSet);
-    if (ret != PAPI_OK){
-      EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)", 
-	   PAPI_strerror(ret), ret);
-    }
-  }
-  TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion");
-
-
-  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
-    TMSG(CUDA, "Cupti API -EXIT- portion");
-    // MC recommends Use cudaDeviceSynchronize
-    dcudaThreadSynchronize();
-    TMSG(CUPTI, "-- POST launch callback");
-    long_long eventValues[nevents+2];
-    
-    TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
-    int ret = PAPI_stop(cudaEventSet, eventValues);
-    if (ret != PAPI_OK){
-      EMSG("CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)", 
-	   PAPI_strerror(ret), ret);
-    }  
-    TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
-
-    ucontext_t uc;
-    TMSG(CUDA,"getting context in CUDA event handler");
-    getcontext(&uc);
-    TMSG(CUDA,"got context in CUDA event handler");
-    bool safe = hpcrun_safe_enter();
-    TMSG(CUDA,"blocked async event in CUDA event handler");
-    {
-      int i;
-      for (i = 0; i < nevents; i++) 
-	{
-	  int metric_id = hpcrun_event2metric(self, i);
-
-	  TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
-	  hpcrun_sample_callpath(&uc, metric_id, eventValues[i]/*metricIncr*/, 
-				 CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/, 
-				 0/*isSync*/, NULL);
-	  TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
-	}
+// Get or create a rocm event set
+static void
+papi_c_rocm_get_event_set(int* event_set)
+{
+  TMSG(ROCM, "Get event set");
+  if (! event_set_created) {
+    TMSG(ROCM, "No event set created, so create one");
+    int ret = PAPI_create_eventset(&my_event_set);
+    if (ret != PAPI_OK) {
+      hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
+                   ret, PAPI_strerror(ret));
     }
-    TMSG(CUDA,"unblocking async event in CUDA event handler");
-    if (safe) hpcrun_safe_exit();
-    TMSG(CUDA,"unblocked async event in CUDA event handler");
-
-    spinlock_unlock(&cupti_lock);
-    TMSG(CUPTI,"-REL-lock\n");
+    *event_set = my_event_set;
+    event_set_created = true;
+    TMSG(ROCM, "Event set %d created", my_event_set);
   }
-  TMSG(CUDA, "At end (past -EXIT-)");
 }
 
-static CUpti_SubscriberHandle subscriber;
 
-//
-// sync setup for cuda/cupti
-//
-static void
-papi_c_cupti_setup(void)
+// Add event to my_event_set
+void
+papi_c_rocm_add_event(int event_set, int evcode)
 {
-  // FIXME: Remove local definition
-  // CUpti_SubscriberHandle subscriber;
-
-  static bool one_time = false;
+  assert(event_set == my_event_set);
 
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "CUPTI setup acquire lock");
-  if (one_time) {
-    spinlock_unlock(&setup_lock);
-    TMSG(CUDA, "CUPTI setup release lock (setup already called)");
-    return;
-  }
-
-  TMSG(CUDA,"sync setup called");
-
-  thread_data_t* td = hpcrun_get_thread_data();
-  local.self = hpcrun_fetch_source_by_name("papi");
-
-  local.nevents  = local.self->evl.nevents;
-
-  // get cuda event set
-
-  int cuda_component_idx;
-  int n_components = PAPI_num_components();
-
-  for (int i = 0; i < n_components; i++) {
-    if (is_papi_c_cuda(PAPI_get_component_info(i)->name)) {
-      cuda_component_idx = i;
-      break;
+  int rv = PAPI_OK;
+  if (! event_set_finalized) {
+    TMSG(ROCM, "Adding event %x to rocm event set", evcode);
+    rv = PAPI_add_event(my_event_set, evcode);
+    if (rv != PAPI_OK) {
+      hpcrun_abort("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
+                   PAPI_strerror(rv), rv);
     }
+    TMSG(ROCM, "Added event %d, to rocm event set %d", evcode, my_event_set);
   }
-
-  papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
-  local.event_set = get_component_event_set( &(psi->component_info[cuda_component_idx]) );
-
-  Cupti_call(dcuptiSubscribe, &subscriber,
-             (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback, 
-             &local);
-             
-  Cupti_call(dcuptiEnableCallback, 1, subscriber,
-             CUPTI_CB_DOMAIN_RUNTIME_API,
-             CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
-
-  one_time = true;
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "CUPTI setup release lock");
 }
 
-//
-// Get or create a cupti event set --- but only ONCE per process
-//
+// No adding new events after this point
 void
-papi_c_cupti_get_event_set(int* event_set)
+papi_c_rocm_finalize_event_set(void)
 {
-  TMSG(CUDA, "Get event set");
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
-  if (! event_set_created) {
-    TMSG(CUDA, "No event set created, so create one");
-    int ret = PAPI_create_eventset(event_set);
-    if (ret != PAPI_OK) {
-      hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", 
-                   ret, PAPI_strerror(ret));
-    }
-    local.event_set = *event_set;
-    event_set_created = true;
-    TMSG(CUDA, "Event set %d created", local.event_set);
-  }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
+  event_set_finalized = true;
 }
 
-int
-papi_c_cupti_add_event(int event_set, int ev)
+
+void
+papi_c_rocm_start()
 {
-  int rv = PAPI_OK;
-  TMSG(CUDA, "Adding event to cupti event set");
-  spinlock_lock(&setup_lock);
-  TMSG(CUDA, "Cupti lock acquired");
-  if (! event_set_finalized) {
-    TMSG(CUDA, "Really add event %x to cupti event set", ev);
-    rv = PAPI_add_event(local.event_set, ev);
-    TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", event_set, local.event_set);
+  int ret = PAPI_start(my_event_set);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_start of event set %d failed with %s (%d)",
+         my_event_set, PAPI_strerror(ret), ret);
   }
-  spinlock_unlock(&setup_lock);
-  TMSG(CUDA, "Cupti lock released");
-  return rv;
 }
 
+
 void
-papi_c_cupti_finalize_event_set(void)
+papi_c_rocm_read(long long *values)
 {
-  spinlock_lock(&setup_lock);
-  event_set_finalized = true;
-  spinlock_unlock(&setup_lock);
+  // hip_dev_sync(); // TODO:Dejan check this out
+  int ret = PAPI_read(my_event_set, values);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_read of event set %d failed with %s (%d)",
+         my_event_set, PAPI_strerror(ret), ret);
+  }
 }
 
 
-//
-// sync teardown for cuda/cupti
-//
-static void
-papi_c_cupti_teardown(void)
+void
+papi_c_rocm_stop(long long *values)
 {
-  static bool one_time = false;
-  spinlock_lock(&setup_lock);
-  if (one_time) return;
-
-  TMSG(CUDA,"sync teardown called (=unsubscribe)");
-  
-  Cupti_call(dcuptiUnsubscribe, subscriber);
-  one_time = true;
-  spinlock_unlock(&setup_lock);
+  int ret = PAPI_stop(my_event_set, values);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_stop of event set %d failed with %s (%d)",
+         my_event_set, PAPI_strerror(ret), ret);
+  }
 }
 
-static sync_info_list_t cuda_component = {
-  .pred = is_papi_c_cuda,
-  .get_event_set = papi_c_cupti_get_event_set,
-  .add_event = papi_c_cupti_add_event,
-  .finalize_event_set = papi_c_cupti_finalize_event_set,
-  .setup = papi_c_cupti_setup,
-  .teardown = papi_c_cupti_teardown,
-  .start = papi_c_no_action,
-  .stop = papi_c_no_action,
+
+static sync_info_list_t rocm_component = {
+  .pred = is_papi_c_rocm,
+  .get_event_set = papi_c_rocm_get_event_set,
+  .add_event = papi_c_rocm_add_event,
+  .finalize_event_set = papi_c_rocm_finalize_event_set,
+  .is_gpu_sync = true,
+  .setup = papi_c_no_action,
+  .teardown = papi_c_no_action,
+  .start = papi_c_rocm_start,
+  .read = papi_c_rocm_read,
+  .stop = papi_c_rocm_stop,
   .process_only = true,
   .next = NULL,
 };
 
 
 void
-SS_OBJ_CONSTRUCTOR(papi_c_cupti)(void)
+SS_OBJ_CONSTRUCTOR(papi_c_rocm)(void)
 {
-  // fetch actual cuda/cupti functions
-  dlgpu();
-  papi_c_sync_register(&cuda_component);
-}
+  papi_c_sync_register(&rocm_component);
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 20a77d7893..2815654187 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -352,7 +352,7 @@ METHOD_FN(init)
       }
     }
   }
-  int ret = PAPI_library_init(PAPI_VER_CURRENT);
+  int ret = 0; //PAPI_library_init(PAPI_VER_CURRENT);
   monitor_enable_new_threads();
 
   TMSG(PAPI_C,"PAPI_library_init = %d", ret);

From 69af41dce634ba9f3555f944b5c07329cc24717b Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Fri, 5 Feb 2021 09:34:17 -0600
Subject: [PATCH 126/177] hpcrun main adapted to libmonitor pthread_create
 start

---
 src/tool/hpcrun/main.c                      | 75 ++++++++++++---------
 src/tool/hpcrun/module-ignore-map.c         | 12 ++--
 src/tool/hpcrun/sample-sources/papi-c.c     |  2 +-
 src/tool/hpcrun/sample_sources_registered.c |  4 ++
 4 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 6dacaf3fe8..4a481a3859 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -78,6 +78,7 @@
 #include <include/uint.h>
 
 #include <include/hpctoolkit-config.h>
+#include <hpcrun/hpcrun-placeholders.h>
 
 #include "main.h"
 
@@ -112,7 +113,6 @@
 #include "device-initializers.h"
 #include "device-finalizers.h"
 #include "module-ignore-map.h"
-#include "control-knob.h"
 #include "epoch.h"
 #include "thread_data.h"
 #include "threadmgr.h"
@@ -221,7 +221,7 @@ bool hpcrun_no_unwind = false;
  * (public declaration) thread-local variables
  *****************************************************************************/
 static __thread bool hpcrun_thread_suppress_sample = true;
-
+static atomic_bool is_partially_initialized = ATOMIC_VAR_INIT(false);
 
 //***************************************************************************
 // local variables 
@@ -888,43 +888,56 @@ monitor_init_process(int *argc, char **argv, void* data)
     }
   }
 
-  hpcrun_set_using_threads(false);
+  if (atomic_fetch_add(&is_partially_initialized, 1) == 0){
 
-  copy_execname(process_name);
-  hpcrun_files_set_executable(process_name);
+    hpcrun_set_using_threads(false);
 
-  // We initialize the load map and fnbounds before registering sample source.
-  // This is because sample source init (such as PAPI)  may dlopen other libraries,
-  // which will trigger our library monitoring code and fnbound queries
-  hpcrun_initLoadmap();
+    copy_execname(process_name);
+    hpcrun_files_set_executable(process_name);
 
-  // We need to initialize messages related functions and set up measurement directory,
-  // so that we can write vdso and prevent fnbounds print messages to the terminal.
-  messages_init();
-  if (!hpcrun_get_disabled()) {
-    hpcrun_files_set_directory();
-  }
-  messages_logfile_create();
+    // We initialize the load map and fnbounds before registering sample source.
+    // This is because sample source init (such as PAPI)  may dlopen other libraries,
+    // which will trigger our library monitoring code and fnbound queries
+    hpcrun_initLoadmap();
+
+    // We need to initialize messages related functions and set up measurement directory,
+    // so that we can write vdso and prevent fnbounds print messages to the terminal.
+    messages_init();
+    if (!hpcrun_get_disabled()) {
+      hpcrun_files_set_directory();
+    }
+    messages_logfile_create();
 
-  // must initialize unwind recipe map before initializing fnbounds
-  // because mapping of load modules affects the recipe map.
-  hpcrun_unw_init();
+    // must initialize unwind recipe map before initializing fnbounds
+    // because mapping of load modules affects the recipe map.
+    hpcrun_unw_init();
 
-  // We need to save vdso before initializing fnbounds this
-  // is because fnbounds_init will iterate over the load map
-  // and will invoke analysis on vdso
-  hpcrun_save_vdso();
+    // We need to save vdso before initializing fnbounds this
+    // is because fnbounds_init will iterate over the load map
+    // and will invoke analysis on vdso
+    hpcrun_save_vdso();
 
-  // init callbacks for each device //Module_ignore_map is here
-  hpcrun_initializer_init();
+    // init callbacks for each device //Module_ignore_map is here
+    hpcrun_initializer_init();
 
-  // fnbounds must be after module_ignore_map
-  fnbounds_init();
-#ifndef HPCRUN_STATIC_LINK
-  auditor_exports->mainlib_connected(get_saved_vdso_path());
-#endif
+    // fnbounds must be after module_ignore_map
+    fnbounds_init();
+    #ifndef HPCRUN_STATIC_LINK
+      auditor_exports->mainlib_connected(get_saved_vdso_path());
+    #endif
+  }
+
+  struct monitor_thread_info mti;
+  if (monitor_get_new_thread_info(&mti) == 0){
+    // we end up here only if we called from pthread create
+
+    // Check if thread is on the clean spot for initializing sample source
+    load_module_t *lm_mti = pc_to_lm(mti.mti_create_return_addr);
+    if(module_ignore_map_ignore(lm_mti)){
+      return data;
+    }
+  }
 
-  control_knob_init();
 
   hpcrun_registered_sources_init();
 
diff --git a/src/tool/hpcrun/module-ignore-map.c b/src/tool/hpcrun/module-ignore-map.c
index cdd13efff5..86cafc73d0 100644
--- a/src/tool/hpcrun/module-ignore-map.c
+++ b/src/tool/hpcrun/module-ignore-map.c
@@ -108,7 +108,7 @@
 // where any GPU can indicate that its functions should be added to
 // the module ignore map when that type of GPU is being monitored.
 
-#define NUM_FNS 7
+#define NUM_FNS 8
 
 
 
@@ -135,8 +135,10 @@ static const char *IGNORE_FNS[NUM_FNS] = {
   "roctracer_set_properties",  // amd roctracer library
   "amd_dbgapi_initialize",     // amd debug library
   "hipKernelNameRefByPtr",     // amd hip runtime
-  "hsa_queue_create"           // amd hsa runtime
+  "hsa_queue_create",           // amd hsa runtime
+  "hsa_init"
 };
+
 static module_ignore_entry_t modules[NUM_FNS];
 static pfq_rwlock_t modules_lock;
 
@@ -250,7 +252,7 @@ module_ignore_map_lookup
 }
 
 int
-serach_functions_in_module(Elf *e, GElf_Shdr* secHead, Elf_Scn *section)
+search_functions_in_module(Elf *e, GElf_Shdr* secHead, Elf_Scn *section)
 {
   Elf_Data *data;
   char *symName;
@@ -287,6 +289,8 @@ module_ignore_map_ignore
   load_module_t* lm
 )
 {
+  if (lm == NULL) return false;
+  
   // Update path
   // Only one thread could update the flag,
   // Guarantee dlopen modules before notification are updated.
@@ -332,7 +336,7 @@ module_ignore_map_ignore
       gelf_getshdr(scn, &secHead);
       // Only search .dynsym section
       if (secHead.sh_type != SHT_DYNSYM) continue;
-      int module_ignore_index = serach_functions_in_module(elf, &secHead, scn);
+      int module_ignore_index = search_functions_in_module(elf, &secHead, scn);
       if (module_ignore_index != -1) {
         modules[module_ignore_index].module = module;
         modules[module_ignore_index].empty = false;
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 2815654187..20a77d7893 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -352,7 +352,7 @@ METHOD_FN(init)
       }
     }
   }
-  int ret = 0; //PAPI_library_init(PAPI_VER_CURRENT);
+  int ret = PAPI_library_init(PAPI_VER_CURRENT);
   monitor_enable_new_threads();
 
   TMSG(PAPI_C,"PAPI_library_init = %d", ret);
diff --git a/src/tool/hpcrun/sample_sources_registered.c b/src/tool/hpcrun/sample_sources_registered.c
index 99bc99fd87..262aa7c8b5 100644
--- a/src/tool/hpcrun/sample_sources_registered.c
+++ b/src/tool/hpcrun/sample_sources_registered.c
@@ -49,6 +49,7 @@
 #include <string.h>
 #include <unistd.h>
 
+#include "control-knob.h"
 #include "sample_sources_registered.h"
 #include <sample-sources/sample_source_obj.h>
 #include <sample-sources/ss-obj-name.h>
@@ -142,6 +143,9 @@ hpcrun_registered_sources_init(void)
     METHOD_CALL(ss, init);
     TMSG(SS_COMMON, "sample source \"%s\": init", ss->name);
   }
+
+  // set user-defined control_knobs for the sample sources
+  control_knob_init();
 }
 
 void

From a2309f528a4b43e63efaccd6ad8c67a9cf593db9 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Fri, 5 Feb 2021 12:42:37 -0600
Subject: [PATCH 127/177] added HPCRUN_SS_PAPI_C_ROCM to MY_CPP_DEFINES: this
 is needed for papi-rocm-registration

---
 src/tool/hpcrun/Makefile.am |  5 +-
 src/tool/hpcrun/Makefile.in | 93 +++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index eafd99d391..538cf09fda 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -998,6 +998,10 @@ libhpcrun_la_CPPFLAGS += $(CUPTI_INC_FLGS)
 MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_CUPTI
 endif
 
+if OPT_PAPI_ROCM
+MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_ROCM
+endif
+
 if OPT_PAPI_STATIC
   libhpcrun_o_SOURCES   += $(MY_PAPI_FILES)
   libhpcrun_o_CPPFLAGS  += $(PAPI_INC_FLGS)
@@ -1045,7 +1049,6 @@ if OPT_ENABLE_CUDA
   libhpcrun_o_SOURCES += $(MY_CUDA_FILES)
 endif
 
-
 if OPT_ENABLE_ROCM
   libhpcrun_la_SOURCES  += $(MY_ROCM_FILES)
   libhpcrun_la_CPPFLAGS += -DENABLE_ROCM
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 291f7f7a96..7322efbc5e 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -275,39 +275,40 @@ host_triplet = @host@
 @OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA
 @OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS)
 @OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI
-@OPT_PAPI_STATIC_TRUE@am__append_110 = $(MY_PAPI_FILES)
-@OPT_PAPI_STATIC_TRUE@am__append_111 = $(PAPI_INC_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_112 = $(OPT_PAPI_LIBS_STAT)
-@OPT_PAPI_STATIC_TRUE@am__append_113 = -DHPCRUN_SS_PAPI
-@OPT_ENABLE_UPC_TRUE@am__append_114 = $(MY_UPC_FILES)
+@OPT_PAPI_ROCM_TRUE@am__append_110 = -DHPCRUN_SS_PAPI_C_ROCM
+@OPT_PAPI_STATIC_TRUE@am__append_111 = $(MY_PAPI_FILES)
+@OPT_PAPI_STATIC_TRUE@am__append_112 = $(PAPI_INC_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_113 = $(OPT_PAPI_LIBS_STAT)
+@OPT_PAPI_STATIC_TRUE@am__append_114 = -DHPCRUN_SS_PAPI
 @OPT_ENABLE_UPC_TRUE@am__append_115 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_116 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_116 = $(MY_UPC_FILES)
 @OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_LDFLAGS)
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_119 = -DLUSH_PTHREADS
+@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_119 = $(OPT_UPC_LDFLAGS)
 @OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_120 = -DLUSH_PTHREADS
-@OPT_ENABLE_CUDA_TRUE@am__append_121 = $(MY_CUDA_FILES)
-@OPT_ENABLE_CUDA_TRUE@am__append_122 = -DENABLE_CUDA
-@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(OPT_CUDA_IFLAGS)
-@OPT_ENABLE_CUDA_TRUE@am__append_124 = $(MY_CUDA_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_125 = $(MY_ROCM_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_126 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(OPT_ROCM_IFLAGS)
-@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DHPCRUN_SS_AMD
-@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = $(MY_LEVEL0_FILES)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = -DENABLE_LEVEL0
-@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(OPT_LEVEL0_IFLAGS)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DHPCRUN_SS_LEVEL0
-@OPT_ENABLE_OPENCL_TRUE@am__append_133 = $(MY_OPENCL_FILES)
-@OPT_ENABLE_OPENCL_TRUE@am__append_134 = -DENABLE_OPENCL
-@OPT_ENABLE_OPENCL_TRUE@am__append_135 = $(OPT_OPENCL_IFLAGS)
-@OPT_ENABLE_OPENCL_TRUE@am__append_136 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_GTPIN_TRUE@am__append_137 = $(MY_GTPIN_FILES)
-@OPT_ENABLE_GTPIN_TRUE@am__append_138 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
-@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(OPT_GTPIN_IFLAGS)
-@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DHPCRUN_SS_GTPIN
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_141 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_142 = libagent-pthread.la \
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_121 = -DLUSH_PTHREADS
+@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(MY_CUDA_FILES)
+@OPT_ENABLE_CUDA_TRUE@am__append_123 = -DENABLE_CUDA
+@OPT_ENABLE_CUDA_TRUE@am__append_124 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUDA_TRUE@am__append_125 = $(MY_CUDA_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(MY_ROCM_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DENABLE_ROCM
+@OPT_ENABLE_ROCM_TRUE@am__append_128 = $(OPT_ROCM_IFLAGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_129 = -DHPCRUN_SS_AMD
+@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(MY_LEVEL0_FILES)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DENABLE_LEVEL0
+@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = $(OPT_LEVEL0_IFLAGS)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_133 = -DHPCRUN_SS_LEVEL0
+@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(MY_OPENCL_FILES)
+@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DENABLE_OPENCL
+@OPT_ENABLE_OPENCL_TRUE@am__append_136 = $(OPT_OPENCL_IFLAGS)
+@OPT_ENABLE_OPENCL_TRUE@am__append_137 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(MY_GTPIN_FILES)
+@OPT_ENABLE_GTPIN_TRUE@am__append_139 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
+@OPT_ENABLE_GTPIN_TRUE@am__append_140 = $(OPT_GTPIN_IFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_141 = -DHPCRUN_SS_GTPIN
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_142 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_143 = libagent-pthread.la \
 @OPT_ENABLE_LUSH_TRUE@	libagent-tbb.la
 subdir = src/tool/hpcrun
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -1783,7 +1784,7 @@ pkglibexec_SCRIPTS = $(am__append_1)
 include_HEADERS = $(am__append_2)
 pkglib_LIBRARIES = $(am__append_5)
 pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_18) \
-	$(am__append_19) $(am__append_141) $(am__append_142)
+	$(am__append_19) $(am__append_142) $(am__append_143)
 BUILT_SOURCES = $(am__append_23)
 CLEANFILES = $(am__append_24)
 @OPT_ENABLE_HPCRUN_DYNAMIC_TRUE@noinst_LTLIBRARIES = libhpcrun.la
@@ -1874,8 +1875,8 @@ UNW_MIPS_LD_FLAGS =
 MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
 	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_20) \
 	$(am__append_103) $(am__append_107) $(am__append_109) \
-	$(am__append_113) $(am__append_128) $(am__append_132) \
-	$(am__append_136) $(am__append_140)
+	$(am__append_110) $(am__append_114) $(am__append_129) \
+	$(am__append_133) $(am__append_137) $(am__append_141)
 MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -2060,9 +2061,9 @@ MY_AARCH64_INCLUDE_DIRS = \
 libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \
 	$(am__append_25) $(am__append_26) $(am__append_39) \
 	$(am__append_54) $(am__append_72) $(am__append_85) \
-	$(am__append_100) $(am__append_104) $(am__append_114) \
-	$(am__append_121) $(am__append_125) $(am__append_129) \
-	$(am__append_133) $(am__append_137) $(UNW_SOURCE_FILES) \
+	$(am__append_100) $(am__append_104) $(am__append_115) \
+	$(am__append_122) $(am__append_126) $(am__append_130) \
+	$(am__append_134) $(am__append_138) $(UNW_SOURCE_FILES) \
 	utilities/last_func.c
 libhpcrun_fake_audit_la_SOURCES = \
 	audit/fake-auditor.c
@@ -2073,7 +2074,7 @@ libhpcrun_audit_la_SOURCES = \
 libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \
 	$(am__append_27) $(am__append_40) $(am__append_55) \
 	$(am__append_73) $(am__append_86) $(am__append_105) \
-	$(am__append_110) $(am__append_115) $(am__append_124) \
+	$(am__append_111) $(am__append_116) $(am__append_125) \
 	$(UNW_SOURCE_FILES) utilities/last_func.c
 libhpcrun_wrap_a_SOURCES = \
 	monitor-exts/openmp.c
@@ -2121,9 +2122,9 @@ libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
 	$(am__append_21) $(am__append_28) $(am__append_41) \
 	$(am__append_56) $(am__append_74) $(am__append_87) \
 	$(am__append_101) $(am__append_106) $(am__append_108) \
-	$(am__append_116) $(am__append_119) $(am__append_122) \
-	$(am__append_126) $(am__append_130) $(am__append_134) \
-	$(am__append_138) $(UNW_INCLUDE_DIRS)
+	$(am__append_117) $(am__append_120) $(am__append_123) \
+	$(am__append_127) $(am__append_131) $(am__append_135) \
+	$(am__append_139) $(UNW_INCLUDE_DIRS)
 libhpcrun_fake_audit_la_CPPFLAGS = \
 	$(MY_CPP_DEFINES)		\
 	$(MY_INCLUDE_DIRS)
@@ -2135,8 +2136,8 @@ libhpcrun_audit_la_CPPFLAGS = \
 libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
 	$(MY_INCLUDE_DIRS) $(am__append_22) $(am__append_29) \
 	$(am__append_42) $(am__append_57) $(am__append_75) \
-	$(am__append_88) $(am__append_111) $(am__append_117) \
-	$(am__append_120) $(UNW_INCLUDE_DIRS)
+	$(am__append_88) $(am__append_112) $(am__append_118) \
+	$(am__append_121) $(UNW_INCLUDE_DIRS)
 libhpcrun_wrap_a_CPPFLAGS = \
 	-DHPCRUN_STATIC_LINK		\
 	$(MY_CPP_DEFINES)		\
@@ -2193,8 +2194,8 @@ libhpctoolkit_a_CPPFLAGS = \
 # cflags
 #-----------------------------------------------------------
 libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
-	$(am__append_123) $(am__append_127) $(am__append_131) \
-	$(am__append_135) $(am__append_139) $(GOTCHA_IFLAGS)
+	$(am__append_124) $(am__append_128) $(am__append_132) \
+	$(am__append_136) $(am__append_140) $(GOTCHA_IFLAGS)
 libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
 libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2227,7 +2228,7 @@ OUR_LZMA_A = $(top_builddir)/src/extern/lzma/liblzma.a
 libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \
 	-lpthread -lrt -L$(LIBELF_LIB) -lelf $(PERFMON_LDFLAGS_DYN) \
 	$(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \
-	$(am__append_118) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
+	$(am__append_119) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
 libhpcrun_fake_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
 libhpcrun_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
 libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic
@@ -2253,7 +2254,7 @@ libhpcrun_la_LIBADD = \
 
 libhpcrun_o_LDADD = $(PROF_LEAN_A) $(SUPPORT_LEAN_A) \
 	$(PERFMON_LDFLAGS_STAT) $(MBEDTLS_LIBS) $(OUR_LIBUNWIND_A) \
-	$(OUR_LZMA_A) $(am__append_59) $(am__append_112) \
+	$(OUR_LZMA_A) $(am__append_59) $(am__append_113) \
 	$(UNW_STATIC_LD_FLAGS)
 MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \
 	$(UNW_INCLUDE_DIRS)

From 5ec69fbda51eec51f6d1f05bfc308cf8a8e515fa Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Fri, 5 Feb 2021 18:34:21 -0600
Subject: [PATCH 128/177] hpcrun/main.c monitor_init_process split into two
 phases

---
 src/tool/hpcrun/main.c | 118 ++++++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 21323e1aa1..6bfe869e37 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -221,7 +221,6 @@ bool hpcrun_no_unwind = false;
  * (public declaration) thread-local variables
  *****************************************************************************/
 static __thread bool hpcrun_thread_suppress_sample = true;
-static atomic_bool is_partially_initialized = ATOMIC_VAR_INIT(false);
 
 //***************************************************************************
 // local variables 
@@ -844,7 +843,7 @@ hpcrun_wait()
 
 
 //***************************************************************************
-// process control (via libmonitor)
+// hpcrun initialization ( process control via libmonitor)
 //***************************************************************************
 
 void*
@@ -855,9 +854,6 @@ monitor_init_process(int *argc, char **argv, void* data)
 
   hpcrun_thread_suppress_sample = false;
 
-  fork_data_t* fork_data = (fork_data_t*) data;
-  bool is_child = data && fork_data->is_child;
-
   hpcrun_wait();
 
 #ifndef HPCRUN_STATIC_LINK
@@ -888,66 +884,62 @@ monitor_init_process(int *argc, char **argv, void* data)
     }
   }
 
-  if (atomic_fetch_add(&is_partially_initialized, 1) == 0){
+  hpcrun_set_using_threads(false);
 
-    hpcrun_set_using_threads(false);
+  copy_execname(process_name);
+  hpcrun_files_set_executable(process_name);
 
-    copy_execname(process_name);
-    hpcrun_files_set_executable(process_name);
+  TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name);
 
-    // We initialize the load map and fnbounds before registering sample source.
-    // This is because sample source init (such as PAPI)  may dlopen other libraries,
-    // which will trigger our library monitoring code and fnbound queries
-    hpcrun_initLoadmap();
+  // We initialize the load map and fnbounds before registering sample source.
+  // This is because sample source init (such as PAPI)  may dlopen other libraries,
+  // which will trigger our library monitoring code and fnbound queries
+  hpcrun_initLoadmap();
 
-    // We do not want creating the measurement directory when
-    // the user only wants to see the complete event list
-    if (getenv("HPCRUN_LIST_EVENT")) {
-      hpcrun_set_disabled();
-    }
-    // We need to initialize messages related functions and set up measurement directory,
-    // so that we can write vdso and prevent fnbounds print messages to the terminal.
-    messages_init();
-    if (!hpcrun_get_disabled()) {
-      hpcrun_files_set_directory();
-      messages_logfile_create();
-
-      // must initialize unwind recipe map before initializing fnbounds
-      // because mapping of load modules affects the recipe map.
-      hpcrun_unw_init();
-
-      // We need to save vdso before initializing fnbounds this
-      // is because fnbounds_init will iterate over the load map
-      // and will invoke analysis on vdso
-      hpcrun_save_vdso();
-
-      // init callbacks for each device //Module_ignore_map is here
-      hpcrun_initializer_init();
-
-      // fnbounds must be after module_ignore_map
-      fnbounds_init();
-      #ifndef HPCRUN_STATIC_LINK
-        auditor_exports->mainlib_connected(get_saved_vdso_path());
-      #endif
-    }
+  // We do not want creating the measurement directory when
+  // the user only wants to see the complete event list
+  if (getenv("HPCRUN_LIST_EVENT")) {
+    hpcrun_set_disabled();
   }
-
-  struct monitor_thread_info mti;
-  if (monitor_get_new_thread_info(&mti) == 0){
-    // we end up here only if we called from pthread create
-
-    // Check if thread is on the clean spot for initializing sample source
-    load_module_t *lm_mti = pc_to_lm(mti.mti_create_return_addr);
-    if(module_ignore_map_ignore(lm_mti)){
-      return data;
-    }
+  // We need to initialize messages related functions and set up measurement directory,
+  // so that we can write vdso and prevent fnbounds print messages to the terminal.
+  messages_init();
+  if (!hpcrun_get_disabled()) {
+    hpcrun_files_set_directory();
+    messages_logfile_create();
+
+    // must initialize unwind recipe map before initializing fnbounds
+    // because mapping of load modules affects the recipe map.
+    hpcrun_unw_init();
+
+    // We need to save vdso before initializing fnbounds this
+    // is because fnbounds_init will iterate over the load map
+    // and will invoke analysis on vdso
+    hpcrun_save_vdso();
+
+    // init callbacks for each device //Module_ignore_map is here
+    hpcrun_initializer_init();
+
+    // fnbounds must be after module_ignore_map
+    fnbounds_init();
+    #ifndef HPCRUN_STATIC_LINK
+      auditor_exports->mainlib_connected(get_saved_vdso_path());
+    #endif
   }
+  
+  return data;
+}
+
 
+static
+void monitor_init_process_deferred()
+{
+  bool is_child = false;
+  
   hpcrun_registered_sources_init();
 
   hpcrun_do_custom_init();
 
-
   // for debugging, limit the life of the execution with an alarm.
   char* life  = getenv("HPCRUN_LIFETIME");
   if (life != NULL){
@@ -969,13 +961,11 @@ monitor_init_process(int *argc, char **argv, void* data)
 
   hpcrun_process_sample_source_none();
 
-  TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name);
-
-  TMSG(PROCESS,"init");
+  TMSG(PROCESS,"hpcrun outer initialization");
 
   hpcrun_sample_prob_mesg();
 
-  TMSG(PROCESS, "I am a %s process", is_child ? "child" : "parent");
+  TMSG(PROCESS, "I am a %s process parent");
 
   hpcrun_init_internal(is_child);
 
@@ -986,8 +976,13 @@ monitor_init_process(int *argc, char **argv, void* data)
 
 
   hpcrun_safe_exit();
+}
 
-  return data;
+
+void
+monitor_at_main()
+{  
+    monitor_init_process_deferred();
 }
 
 
@@ -1147,6 +1142,7 @@ monitor_init_thread_support(void)
   hpcrun_safe_exit();
 }
 
+
 void*
 monitor_thread_pre_create(void)
 {
@@ -1164,6 +1160,10 @@ monitor_thread_pre_create(void)
     return NULL;
   }
   
+  // outer initialization
+  monitor_init_process_deferred();
+
+
   hpcrun_safe_enter();
   local_thread_data_t* rv = hpcrun_malloc(sizeof(local_thread_data_t));
 

From 96d911f2760e512d7f499f6c66a8d030e5f9b728 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Fri, 5 Feb 2021 19:08:38 -0600
Subject: [PATCH 129/177] refactoring monitor_init_process_deferred

---
 src/tool/hpcrun/main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 6bfe869e37..22b601df46 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -932,7 +932,7 @@ monitor_init_process(int *argc, char **argv, void* data)
 
 
 static
-void monitor_init_process_deferred()
+void  hpcrun_prepare_measurement_subsystem()
 {
   bool is_child = false;
   
@@ -974,7 +974,6 @@ void monitor_init_process_deferred()
     STDERR_MSG("Std Err message appears");
   }
 
-
   hpcrun_safe_exit();
 }
 
@@ -982,7 +981,7 @@ void monitor_init_process_deferred()
 void
 monitor_at_main()
 {  
-    monitor_init_process_deferred();
+     hpcrun_prepare_measurement_subsystem();
 }
 
 
@@ -1161,7 +1160,7 @@ monitor_thread_pre_create(void)
   }
   
   // outer initialization
-  monitor_init_process_deferred();
+   hpcrun_prepare_measurement_subsystem();
 
 
   hpcrun_safe_enter();

From ce51ae991d80322ca0bb3228dbe0b19c67845377 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Tue, 9 Feb 2021 14:17:34 -0600
Subject: [PATCH 130/177] hpcrun_prepare_measurement_subsystem must be executed
 exactly once

---
 src/tool/hpcrun/main.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 22b601df46..386b30f54d 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -244,6 +244,8 @@ static hpcrun_aux_cleanup_t * hpcrun_aux_cleanup_free_list_head = NULL;
 static char execname[PATH_MAX] = {'\0'};
 
 static int monitor_fini_process_how = 0;
+static atomic_int is_ms_initialized = ATOMIC_VAR_INIT(0);
+
 
 //***************************************************************************
 // Interface functions for suppressing samples
@@ -936,6 +938,9 @@ void  hpcrun_prepare_measurement_subsystem()
 {
   bool is_child = false;
   
+  if (atomic_fetch_add(&is_ms_initialized, 1) != 0)
+    return;
+
   hpcrun_registered_sources_init();
 
   hpcrun_do_custom_init();

From 33084d0ac57cc1a3a1e8375140b8c3fea777d396 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Sun, 14 Feb 2021 00:06:33 -0600
Subject: [PATCH 131/177] hpcrun_prepare_measurement_subsystem handles forks
 (is_child = true)

---
 src/tool/hpcrun/main.c | 94 ++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 40 deletions(-)

diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index 386b30f54d..d367c2af81 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -244,7 +244,8 @@ static hpcrun_aux_cleanup_t * hpcrun_aux_cleanup_free_list_head = NULL;
 static char execname[PATH_MAX] = {'\0'};
 
 static int monitor_fini_process_how = 0;
-static atomic_int is_ms_initialized = ATOMIC_VAR_INIT(0);
+static atomic_int ms_init_started = ATOMIC_VAR_INIT(0);
+static atomic_int ms_init_completed = ATOMIC_VAR_INIT(0);
 
 
 //***************************************************************************
@@ -848,6 +849,9 @@ hpcrun_wait()
 // hpcrun initialization ( process control via libmonitor)
 //***************************************************************************
 
+static
+void  hpcrun_prepare_measurement_subsystem(bool is_child);
+
 void*
 monitor_init_process(int *argc, char **argv, void* data)
 {
@@ -929,64 +933,74 @@ monitor_init_process(int *argc, char **argv, void* data)
     #endif
   }
   
+ fork_data_t* fork_data = (fork_data_t*) data;
+  bool is_child = data && fork_data->is_child;
+  if (is_child){
+    hpcrun_prepare_measurement_subsystem(is_child);
+  }
+
   return data;
 }
 
 
-static
-void  hpcrun_prepare_measurement_subsystem()
-{
+void
+monitor_at_main()
+{  
   bool is_child = false;
-  
-  if (atomic_fetch_add(&is_ms_initialized, 1) != 0)
-    return;
+  hpcrun_prepare_measurement_subsystem(is_child);
+}
 
-  hpcrun_registered_sources_init();
 
-  hpcrun_do_custom_init();
+static
+void  hpcrun_prepare_measurement_subsystem(bool is_child)
+{  
+  if (atomic_fetch_add(&ms_init_started, 1) == 0){
+    hpcrun_registered_sources_init();
 
-  // for debugging, limit the life of the execution with an alarm.
-  char* life  = getenv("HPCRUN_LIFETIME");
-  if (life != NULL){
-    int seconds = atoi(life);
-    if (seconds > 0) alarm((unsigned int) seconds);
-  }
+    hpcrun_do_custom_init();
 
-  // see if unwinding has been turned off
-  // the same setting governs whether or not fnbounds is needed or used.
-  hpcrun_no_unwind = hpcrun_get_env_bool("HPCRUN_NO_UNWIND");
+    // for debugging, limit the life of the execution with an alarm.
+    char* life  = getenv("HPCRUN_LIFETIME");
+    if (life != NULL){
+      int seconds = atoi(life);
+      if (seconds > 0) alarm((unsigned int) seconds);
+    }
 
-  char* s = getenv(HPCRUN_EVENT_LIST);
+    // see if unwinding has been turned off
+    // the same setting governs whether or not fnbounds is needed or used.
+    hpcrun_no_unwind = hpcrun_get_env_bool("HPCRUN_NO_UNWIND");
 
-  if (! is_child) {
-    hpcrun_sample_sources_from_eventlist(s);
-  }
+    char* s = getenv(HPCRUN_EVENT_LIST);
 
-  hpcrun_set_abort_timeout();
+    if (! is_child) {
+      hpcrun_sample_sources_from_eventlist(s);
+    }
 
-  hpcrun_process_sample_source_none();
+    hpcrun_set_abort_timeout();
 
-  TMSG(PROCESS,"hpcrun outer initialization");
+    hpcrun_process_sample_source_none();
 
-  hpcrun_sample_prob_mesg();
+    TMSG(PROCESS,"hpcrun outer initialization");
 
-  TMSG(PROCESS, "I am a %s process parent");
+    hpcrun_sample_prob_mesg();
 
-  hpcrun_init_internal(is_child);
+    TMSG(PROCESS, "I am a %s process", is_child ? "child" : "parent");
 
-  if (ENABLED(TST)){
-    EEMSG("TST debug ctl is active!");
-    STDERR_MSG("Std Err message appears");
-  }
+    hpcrun_init_internal(is_child);
 
-  hpcrun_safe_exit();
-}
+    if (ENABLED(TST)){
+      EEMSG("TST debug ctl is active!");
+      STDERR_MSG("Std Err message appears");
+    }
 
+    hpcrun_safe_exit();
 
-void
-monitor_at_main()
-{  
-     hpcrun_prepare_measurement_subsystem();
+    atomic_store(&ms_init_completed, 1);
+
+  }else{
+    while(! atomic_load(&ms_init_completed));
+  }
+    
 }
 
 
@@ -1163,9 +1177,9 @@ monitor_thread_pre_create(void)
   if (module_ignore_map_inrange_lookup(thread_pre_create_address)) {
     return NULL;
   }
-  
+  bool is_child = false;
   // outer initialization
-   hpcrun_prepare_measurement_subsystem();
+   hpcrun_prepare_measurement_subsystem(is_child);
 
 
   hpcrun_safe_enter();

From 2ed48d228ad4df44da32ee2314961a650aa45c23 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Wed, 3 Mar 2021 20:08:10 -0600
Subject: [PATCH 132/177] Tell libmonitor not to track monitoring and
 tracing threads; however, libmonitor still destroys tracing threads for some reason

---
 .../hpcrun/gpu/gpu-operation-multiplexer.c    |  5 +++-
 src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c |  5 ++++
 src/tool/hpcrun/gpu/gpu-trace.c               | 11 ++++-----
 src/tool/hpcrun/gpu/nvidia/cupti-api.c        |  5 ++++
 src/tool/hpcrun/thread_data.c                 | 24 +++++++++++++++++++
 src/tool/hpcrun/thread_data.h                 |  2 +-
 6 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
index 491d07e2c3..268b1bd5eb 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
@@ -93,6 +93,7 @@ gpu_init_operation_channel(){
 }
 
 
+// OpenCL Monitoring thread
 static void *
 gpu_operation_record
 (
@@ -133,9 +134,11 @@ gpu_operation_multiplexer_create
 
   gpu_operation_channel_set_alloc(max_completion_cb_threads);
 
-  // You are the first to create monitor thread
+  monitor_disable_new_threads();
+  // Create monitor thread
   pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_operation_record,
                  NULL);
+  monitor_enable_new_threads();
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index ee1b7c562a..1d2e30f6b8 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -54,6 +54,8 @@
 #include "gpu-trace-demultiplexer.h"
 #include "gpu-print.h"
 
+#include <monitor.h>
+
 
 //******************************************************************************
 // type declarations
@@ -96,8 +98,11 @@ gpu_trace_channel_set_create
   new_channel_set->channel_set_ptr = gpu_trace_channel_set_alloc(streams_per_thread);
   atomic_store(&new_channel_set->channel_index, 0);
 
+  monitor_disable_new_threads();
+  // Create tracing thread
   pthread_create(&new_channel_set->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
                  new_channel_set);
+  monitor_enable_new_threads();
 
   return new_channel_set;
 }
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index ed93ea17b7..079d374902 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -337,6 +337,7 @@ gpu_trace_fini
 }
 
 
+// Tracing thread
 void *
 gpu_trace_record
 (
@@ -345,6 +346,9 @@ gpu_trace_record
 {
   gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args;
 
+  hpcrun_thread_init_mem_pool_once();
+  atomic_fetch_add(&active_streams_counter, 1);
+
   while (!atomic_load(&stop_trace_flag)) {
     //getting data from a trace channel
     gpu_trace_channel_set_process(channel_set);
@@ -365,14 +369,7 @@ gpu_trace_create
 {
   // Init variables
   gpu_trace_t *trace = gpu_trace_alloc();
-
-  // Create a new thread for the stream without libmonitor watching
-  monitor_disable_new_threads();
-
   trace->thread = gpu_trace_demultiplexer_push(trace->trace_channel);
-  atomic_fetch_add(&active_streams_counter, 1);
-
-  monitor_enable_new_threads();
 
   return trace;
 }
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index a2cbc13123..837639fbb7 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -100,6 +100,8 @@
 
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 
+#include <hpcrun/thread_data.h>
+
 #include "cuda-api.h"
 #include "cupti-api.h"
 #include "cupti-gpu-api.h"
@@ -1305,6 +1307,9 @@ cupti_buffer_completion_callback
  size_t validSize
 )
 {
+  
+  hpcrun_thread_init_mem_pool_once();
+
   // handle notifications
   cupti_buffer_completion_notify();
 
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index 9c76ab5e18..f6dd5c0913 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -200,6 +200,30 @@ hpcrun_threaded_data(void)
 }
 
 
+void
+hpcrun_thread_init_mem_pool_once(void){
+  static bool is_initialized = false;
+
+  if (is_initialized == false){
+    hpcrun_mmap_init();
+
+    // ----------------------------------------
+    // call thread manager to get a thread data. If there is unused thread data,
+    //  we can recycle it, otherwise we need to allocate a new one.
+    // If we allocate a new one, we need to initialize the data and trace file.
+    // ----------------------------------------
+
+    int id = 0;
+    thread_data_t* td = NULL;
+    bool has_trace = false;
+    bool demand_new_thread = true;
+
+    hpcrun_threadMgr_data_get_safe(id, NULL, &td, has_trace, demand_new_thread);
+    hpcrun_set_thread_data(td);
+    is_initialized = true;
+  }
+}
+
 //***************************************************************************
 // 
 //***************************************************************************
diff --git a/src/tool/hpcrun/thread_data.h b/src/tool/hpcrun/thread_data.h
index 5095e975d4..09d7bfd065 100644
--- a/src/tool/hpcrun/thread_data.h
+++ b/src/tool/hpcrun/thread_data.h
@@ -307,7 +307,7 @@ extern thread_data_t* hpcrun_safe_get_td(void);
 
 void hpcrun_unthreaded_data(void);
 void hpcrun_threaded_data(void);
-
+void hpcrun_thread_init_mem_pool_once(void);
 
 extern thread_data_t* hpcrun_allocate_thread_data(int id);
 

From 95d267f383a551b79e5d831e30c6e0040f925325 Mon Sep 17 00:00:00 2001
From: Dejan Grubisic <grubisic.dejan@yahoo.com>
Date: Wed, 7 Apr 2021 14:55:58 -0500
Subject: [PATCH 133/177] Make mem_pool_initialized a thread-local variable;
 add hpcrun_malloc to the module ignore list

---
 src/tool/hpcrun/module-ignore-map.c |  7 ++++---
 src/tool/hpcrun/thread_data.c       | 10 +++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/tool/hpcrun/module-ignore-map.c b/src/tool/hpcrun/module-ignore-map.c
index ec3ed770dd..d810be841b 100644
--- a/src/tool/hpcrun/module-ignore-map.c
+++ b/src/tool/hpcrun/module-ignore-map.c
@@ -108,7 +108,7 @@
 // where any GPU can indicate that its functions should be added to
 // the module ignore map when that type of GPU is being monitored.
 
-#define NUM_FNS 8
+#define NUM_FNS ( sizeof(IGNORE_FNS) / sizeof(char *) )
 
 
 
@@ -128,7 +128,7 @@ typedef struct module_ignore_entry {
 //***************************************************************************
 
 
-static const char *IGNORE_FNS[NUM_FNS] = {
+static const char *IGNORE_FNS[] = {
   "cuLaunchKernel",
   "cudaLaunchKernel",
   "cuptiActivityEnable",
@@ -136,7 +136,8 @@ static const char *IGNORE_FNS[NUM_FNS] = {
   "amd_dbgapi_initialize",     // amd debug library
   "hipKernelNameRefByPtr",     // amd hip runtime
   "hsa_queue_create",           // amd hsa runtime
-  "hsa_init"
+  "hsa_init",
+  "hpcrun_malloc"
 };
 
 static module_ignore_entry_t modules[NUM_FNS];
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index f6dd5c0913..41f2146a71 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -63,6 +63,7 @@
 #include "epoch.h"
 #include "handling_sample.h"
 
+#include <hpcrun/threadmgr.h>
 #include "thread_data.h"
 #include "trace.h"
 
@@ -90,7 +91,7 @@ __thread int monitor_tid = -1;
 static thread_data_t _local_td;
 static pthread_key_t _hpcrun_key;
 static int use_getspecific = 0;
-
+static __thread bool mem_pool_initialized = false;
 
 void
 hpcrun_init_pthread_key(void)
@@ -201,10 +202,9 @@ hpcrun_threaded_data(void)
 
 
 void
-hpcrun_thread_init_mem_pool_once(void){
-  static bool is_initialized = false;
+hpcrun_thread_init_mem_pool_once(void){  
 
-  if (is_initialized == false){
+  if (mem_pool_initialized == false){
     hpcrun_mmap_init();
 
     // ----------------------------------------
@@ -220,7 +220,7 @@ hpcrun_thread_init_mem_pool_once(void){
 
     hpcrun_threadMgr_data_get_safe(id, NULL, &td, has_trace, demand_new_thread);
     hpcrun_set_thread_data(td);
-    is_initialized = true;
+    mem_pool_initialized = true;
   }
 }
 

From eb635684b6231a8e7a23a34032d9e1f6c2dbaa7f Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Mon, 10 May 2021 15:26:15 -0500
Subject: [PATCH 134/177] Try to add hip_dev_sync in papi_c_rocm_read

---
 configure                                    |   1 +
 configure.ac                                 |   1 +
 src/include/hpctoolkit-config.h.in~          | 223 -------------------
 src/tool/hpcrun/Makefile.am                  |   7 +-
 src/tool/hpcrun/Makefile.in                  | 116 +++++-----
 src/tool/hpcrun/gpu/amd/hip-api.c            |   3 +-
 src/tool/hpcrun/gpu/amd/hip-api.h            |   5 +-
 src/tool/hpcrun/gpu/amd/roctracer-api.c      |   2 +-
 src/tool/hpcrun/sample-sources/amd.c         |  14 +-
 src/tool/hpcrun/sample-sources/papi-c-rocm.c |   4 +-
 10 files changed, 87 insertions(+), 289 deletions(-)
 delete mode 100644 src/include/hpctoolkit-config.h.in~

diff --git a/configure b/configure
index cca1253f4c..1ee1190415 100755
--- a/configure
+++ b/configure
@@ -21736,6 +21736,7 @@ case "$OPT_PAPI" in
     fi
     OPT_PAPI_IFLAGS="-I${OPT_PAPI}/include"
 
+
     for lib in $multilib_path ; do
       lib_dir="${OPT_PAPI}/$lib"
       if test -f "${lib_dir}/libpapi.so" || test -f "${lib_dir}/libpapi.a"
diff --git a/configure.ac b/configure.ac
index f4f00fae2b..78514b4edc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3562,6 +3562,7 @@ then
     OPT_PAPI_LIBPATH=
     use_papi_c=no
     use_papi_c_cupti=no
+    use_papi_c_rocm=no
   fi
 fi
 
diff --git a/src/include/hpctoolkit-config.h.in~ b/src/include/hpctoolkit-config.h.in~
deleted file mode 100644
index c0ec6d30b3..0000000000
--- a/src/include/hpctoolkit-config.h.in~
+++ /dev/null
@@ -1,223 +0,0 @@
-/* src/include/hpctoolkit-config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* binutils version 2.34 or later */
-#undef BINUTILS_234
-
-/* "Cuda install prefix" */
-#undef CUDA_INSTALL_PREFIX
-
-/* "CUPTI install prefix" */
-#undef CUPTI_INSTALL_PREFIX
-
-/* Data-centric tracing */
-#undef DATACENTRIC_TRACE
-
-/* dyninst uses Instruction::Ptr */
-#undef DYNINST_INSTRUCTION_PTR
-
-/* dyninst supports cuda */
-#undef DYNINST_USE_CUDA
-
-/* dyninst built with libdw */
-#undef DYNINST_USE_LIBDW
-
-/* Support for CLOCK_THREAD_CPUTIME_ID */
-#undef ENABLE_CLOCK_CPUTIME
-
-/* Support for CLOCK_REALTIME and SIGEV_THREAD_ID */
-#undef ENABLE_CLOCK_REALTIME
-
-/* ParseAPI supports openmp (for hpcstruct) */
-#undef ENABLE_OPENMP
-
-/* Symtab supports openmp (for fnbounds) */
-#undef ENABLE_OPENMP_SYMTAB
-
-/* Add extra annotations for debugging with Valgrind */
-#undef ENABLE_VG_ANNOTATIONS
-
-/* Support for AMD XOP instructions */
-#undef ENABLE_XOP
-
-/* The <cupti.h> header file is available. */
-#undef HAVE_CUPTI_H
-
-/* Define to 1 if you have the <cxxabi.h> header file. */
-#undef HAVE_CXXABI_H
-
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#undef HAVE_DLFCN_H
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#undef HAVE_INTTYPES_H
-
-/* Define to 1 if you have the <memory.h> header file. */
-#undef HAVE_MEMORY_H
-
-/* HOST OS: 32 and 64 bit OS libraries */
-#undef HAVE_OS_MULTILIB
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#undef HAVE_STDINT_H
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#undef HAVE_STDLIB_H
-
-/* Define to 1 if you have the <strings.h> header file. */
-#undef HAVE_STRINGS_H
-
-/* Define to 1 if you have the <string.h> header file. */
-#undef HAVE_STRING_H
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#undef HAVE_SYS_STAT_H
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#undef HAVE_SYS_TYPES_H
-
-/* Define to 1 if the system has the type `uint'. */
-#undef HAVE_UINT
-
-/* C compiler supports type "uint" */
-#undef HAVE_UINT_LANG_C
-
-/* Define to 1 if the system has the type `ulong'. */
-#undef HAVE_ULONG
-
-/* C compiler supports type "ulong" */
-#undef HAVE_ULONG_LANG_C
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#undef HAVE_UNISTD_H
-
-/* Define to 1 if the system has the type `ushort'. */
-#undef HAVE_USHORT
-
-/* C compiler supports type "ushort" */
-#undef HAVE_USHORT_LANG_C
-
-/* C compiler supports type "voidp" */
-#undef HAVE_VOIDP_LANG_C
-
-/* Host is big endian. */
-#undef HOST_BIG_ENDIAN
-
-/* HOST CPU: ARM 64 (aarch64 */
-#undef HOST_CPU_ARM64
-
-/* HOST CPU: ia64 (itanium) */
-#undef HOST_CPU_IA64
-
-/* HOST CPU: PowerPC (ppc) */
-#undef HOST_CPU_PPC
-
-/* HOST CPU: x86 (32-bit) */
-#undef HOST_CPU_x86
-
-/* HOST CPU: x86-64 */
-#undef HOST_CPU_x86_64
-
-/* Host is little endian. */
-#undef HOST_LITTLE_ENDIAN
-
-/* HOST OS: IRIX */
-#undef HOST_OS_IRIX
-
-/* HOST OS: Linux */
-#undef HOST_OS_LINUX
-
-/* HOST OS: MacOS */
-#undef HOST_OS_MACOS
-
-/* HOST OS: Solaris */
-#undef HOST_OS_SOLARIS
-
-/* HOST OS: Tru64 */
-#undef HOST_OS_TRU64
-
-/* HOST platform: MIPS64LE_LINUX */
-#undef HOST_PLATFORM_MIPS64LE_LINUX
-
-/* IBM Blue Gene support */
-#undef HOST_SYSTEM_IBM_BLUEGENE
-
-/* have custom unwinder for this platform */
-#undef HPCRUN_HAVE_CUSTOM_UNWINDER
-
-/* Git branch and commit hash, if known. */
-#undef HPCTOOLKIT_GIT_VERSION
-
-/* HPCToolkit install prefix */
-#undef HPCTOOLKIT_INSTALL_PREFIX
-
-/* Spack version and variants, if known. */
-#undef HPCTOOLKIT_SPACK_SPEC
-
-/* HPCToolkit version */
-#undef HPCTOOLKIT_VERSION
-
-/* HPCToolkit version string */
-#undef HPCTOOLKIT_VERSION_STRING
-
-/* Define to the sub-directory where libtool stores uninstalled libraries. */
-#undef LT_OBJDIR
-
-/* Standard C headers */
-#undef NO_STD_CHEADERS
-
-/* Have support for cuda */
-#undef OPT_HAVE_CUDA
-
-/* Name of package */
-#undef PACKAGE
-
-/* Define to the address where bug reports for this package should be sent. */
-#undef PACKAGE_BUGREPORT
-
-/* Define to the full name of this package. */
-#undef PACKAGE_NAME
-
-/* Define to the full name and version of this package. */
-#undef PACKAGE_STRING
-
-/* Define to the one symbol short name of this package. */
-#undef PACKAGE_TARNAME
-
-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
-/* Define to the version of this package. */
-#undef PACKAGE_VERSION
-
-/* The size of `void*', as computed by sizeof. */
-#undef SIZEOF_VOIDP
-
-/* Define to 1 if you have the ANSI C header files. */
-#undef STDC_HEADERS
-
-/* use Level Zero */
-#undef USE_LEVEL0
-
-/* use ROCM */
-#undef USE_ROCM
-
-/* Use system byteswap.h */
-#undef USE_SYSTEM_BYTESWAP
-
-/* Version number of package */
-#undef VERSION
-
-/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
-   significant byte first (like Motorola and SPARC, unlike Intel). */
-#if defined AC_APPLE_UNIVERSAL_BUILD
-# if defined __BIG_ENDIAN__
-#  define WORDS_BIGENDIAN 1
-# endif
-#else
-# ifndef WORDS_BIGENDIAN
-#  undef WORDS_BIGENDIAN
-# endif
-#endif
-
-/* Fix pthread.h */
-#undef __thread
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 538cf09fda..1d09083075 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -163,6 +163,9 @@ XED2_INC = @XED2_INC@
 XED2_HPCRUN_LIBS =  @XED2_HPCRUN_LIBS@
 XED2_HPCLINK_LIBS = @XED2_HPCLINK_LIBS@
 CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@
+
+ROCM_INC_FLGS = @OPT_ROCM_IFLAGS@
+
 OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 CUPTI_LD_FLGS = @OPT_CUPTI_LDFLAGS@
 CUPTI_BASE = @OPT_CUPTI@
@@ -548,6 +551,7 @@ endif
 if OPT_ENABLE_ROCM
 MY_ROCM_FILES =\
 	sample-sources/amd.c \
+	gpu/amd/hip-api.c \	
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c 	\
 	gpu/amd/rocm-debug-api.c \
@@ -999,6 +1003,7 @@ MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_CUPTI
 endif
 
 if OPT_PAPI_ROCM
+libhpcrun_la_CPPFLAGS   += $(ROCM_INC_FLGS)
 MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_ROCM
 endif
 
@@ -1006,7 +1011,7 @@ if OPT_PAPI_STATIC
   libhpcrun_o_SOURCES   += $(MY_PAPI_FILES)
   libhpcrun_o_CPPFLAGS  += $(PAPI_INC_FLGS)
   libhpcrun_o_LDADD     += $(OPT_PAPI_LIBS_STAT)
-
+  
   MY_CPP_DEFINES  += -DHPCRUN_SS_PAPI
 endif
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 7322efbc5e..3316d06482 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -275,40 +275,41 @@ host_triplet = @host@
 @OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA
 @OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS)
 @OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI
-@OPT_PAPI_ROCM_TRUE@am__append_110 = -DHPCRUN_SS_PAPI_C_ROCM
-@OPT_PAPI_STATIC_TRUE@am__append_111 = $(MY_PAPI_FILES)
-@OPT_PAPI_STATIC_TRUE@am__append_112 = $(PAPI_INC_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_113 = $(OPT_PAPI_LIBS_STAT)
-@OPT_PAPI_STATIC_TRUE@am__append_114 = -DHPCRUN_SS_PAPI
-@OPT_ENABLE_UPC_TRUE@am__append_115 = $(MY_UPC_FILES)
+@OPT_PAPI_ROCM_TRUE@am__append_110 = $(ROCM_INC_FLGS)
+@OPT_PAPI_ROCM_TRUE@am__append_111 = -DHPCRUN_SS_PAPI_C_ROCM
+@OPT_PAPI_STATIC_TRUE@am__append_112 = $(MY_PAPI_FILES)
+@OPT_PAPI_STATIC_TRUE@am__append_113 = $(PAPI_INC_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_114 = $(OPT_PAPI_LIBS_STAT)
+@OPT_PAPI_STATIC_TRUE@am__append_115 = -DHPCRUN_SS_PAPI
 @OPT_ENABLE_UPC_TRUE@am__append_116 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_117 = $(MY_UPC_FILES)
 @OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_119 = $(OPT_UPC_LDFLAGS)
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_120 = -DLUSH_PTHREADS
+@OPT_ENABLE_UPC_TRUE@am__append_119 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_120 = $(OPT_UPC_LDFLAGS)
 @OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_121 = -DLUSH_PTHREADS
-@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(MY_CUDA_FILES)
-@OPT_ENABLE_CUDA_TRUE@am__append_123 = -DENABLE_CUDA
-@OPT_ENABLE_CUDA_TRUE@am__append_124 = $(OPT_CUDA_IFLAGS)
-@OPT_ENABLE_CUDA_TRUE@am__append_125 = $(MY_CUDA_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(MY_ROCM_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_128 = $(OPT_ROCM_IFLAGS)
-@OPT_ENABLE_ROCM_TRUE@am__append_129 = -DHPCRUN_SS_AMD
-@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(MY_LEVEL0_FILES)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DENABLE_LEVEL0
-@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = $(OPT_LEVEL0_IFLAGS)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_133 = -DHPCRUN_SS_LEVEL0
-@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(MY_OPENCL_FILES)
-@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DENABLE_OPENCL
-@OPT_ENABLE_OPENCL_TRUE@am__append_136 = $(OPT_OPENCL_IFLAGS)
-@OPT_ENABLE_OPENCL_TRUE@am__append_137 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(MY_GTPIN_FILES)
-@OPT_ENABLE_GTPIN_TRUE@am__append_139 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
-@OPT_ENABLE_GTPIN_TRUE@am__append_140 = $(OPT_GTPIN_IFLAGS)
-@OPT_ENABLE_GTPIN_TRUE@am__append_141 = -DHPCRUN_SS_GTPIN
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_142 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_143 = libagent-pthread.la \
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_122 = -DLUSH_PTHREADS
+@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(MY_CUDA_FILES)
+@OPT_ENABLE_CUDA_TRUE@am__append_124 = -DENABLE_CUDA
+@OPT_ENABLE_CUDA_TRUE@am__append_125 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUDA_TRUE@am__append_126 = $(MY_CUDA_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(MY_ROCM_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DENABLE_ROCM
+@OPT_ENABLE_ROCM_TRUE@am__append_129 = $(OPT_ROCM_IFLAGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_130 = -DHPCRUN_SS_AMD
+@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(MY_LEVEL0_FILES)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DENABLE_LEVEL0
+@OPT_ENABLE_LEVEL0_TRUE@am__append_133 = $(OPT_LEVEL0_IFLAGS)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_134 = -DHPCRUN_SS_LEVEL0
+@OPT_ENABLE_OPENCL_TRUE@am__append_135 = $(MY_OPENCL_FILES)
+@OPT_ENABLE_OPENCL_TRUE@am__append_136 = -DENABLE_OPENCL
+@OPT_ENABLE_OPENCL_TRUE@am__append_137 = $(OPT_OPENCL_IFLAGS)
+@OPT_ENABLE_OPENCL_TRUE@am__append_138 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(MY_GTPIN_FILES)
+@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
+@OPT_ENABLE_GTPIN_TRUE@am__append_141 = $(OPT_GTPIN_IFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_142 = -DHPCRUN_SS_GTPIN
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_143 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_144 = libagent-pthread.la \
 @OPT_ENABLE_LUSH_TRUE@	libagent-tbb.la
 subdir = src/tool/hpcrun
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -541,10 +542,10 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/nvidia/cupti-activity-translate.c \
 	gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
-	sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \
-	gpu/amd/roctracer-api.c gpu/amd/rocm-debug-api.c \
-	gpu/amd/rocm-binary-processing.c sample-sources/level0.c \
-	gpu/level0/level0-api.c \
+	sample-sources/amd.c gpu/amd/hip-api.c \
+	gpu/amd/roctracer-activity-translate.c gpu/amd/roctracer-api.c \
+	gpu/amd/rocm-debug-api.c gpu/amd/rocm-binary-processing.c \
+	sample-sources/level0.c gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-context-map.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -751,6 +752,7 @@ am__objects_34 = sample-sources/libhpcrun_la-upc.lo
 am__objects_36 =
 @OPT_ENABLE_ROCM_TRUE@am__objects_37 =  \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-hip-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-debug-api.lo \
@@ -1784,13 +1786,14 @@ pkglibexec_SCRIPTS = $(am__append_1)
 include_HEADERS = $(am__append_2)
 pkglib_LIBRARIES = $(am__append_5)
 pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_18) \
-	$(am__append_19) $(am__append_142) $(am__append_143)
+	$(am__append_19) $(am__append_143) $(am__append_144)
 BUILT_SOURCES = $(am__append_23)
 CLEANFILES = $(am__append_24)
 @OPT_ENABLE_HPCRUN_DYNAMIC_TRUE@noinst_LTLIBRARIES = libhpcrun.la
 PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@ 
 PAPI_LD_FLGS = @OPT_PAPI_LDFLAGS@
 CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@
+ROCM_INC_FLGS = @OPT_ROCM_IFLAGS@
 OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
 CUPTI_LD_FLGS = @OPT_CUPTI_LDFLAGS@
 CUPTI_BASE = @OPT_CUPTI@
@@ -1875,8 +1878,8 @@ UNW_MIPS_LD_FLAGS =
 MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
 	-D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_20) \
 	$(am__append_103) $(am__append_107) $(am__append_109) \
-	$(am__append_110) $(am__append_114) $(am__append_129) \
-	$(am__append_133) $(am__append_137) $(am__append_141)
+	$(am__append_111) $(am__append_115) $(am__append_130) \
+	$(am__append_134) $(am__append_138) $(am__append_142)
 MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	closure-registry.c cct_insert_backtrace.c \
 	cct_backtrace_finalize.c env.c epoch.c files.c \
@@ -2005,6 +2008,7 @@ MY_AARCH64_FILES = \
 
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/hip-api.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c 	\
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-debug-api.c \
@@ -2061,9 +2065,9 @@ MY_AARCH64_INCLUDE_DIRS = \
 libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \
 	$(am__append_25) $(am__append_26) $(am__append_39) \
 	$(am__append_54) $(am__append_72) $(am__append_85) \
-	$(am__append_100) $(am__append_104) $(am__append_115) \
-	$(am__append_122) $(am__append_126) $(am__append_130) \
-	$(am__append_134) $(am__append_138) $(UNW_SOURCE_FILES) \
+	$(am__append_100) $(am__append_104) $(am__append_116) \
+	$(am__append_123) $(am__append_127) $(am__append_131) \
+	$(am__append_135) $(am__append_139) $(UNW_SOURCE_FILES) \
 	utilities/last_func.c
 libhpcrun_fake_audit_la_SOURCES = \
 	audit/fake-auditor.c
@@ -2074,7 +2078,7 @@ libhpcrun_audit_la_SOURCES = \
 libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \
 	$(am__append_27) $(am__append_40) $(am__append_55) \
 	$(am__append_73) $(am__append_86) $(am__append_105) \
-	$(am__append_111) $(am__append_116) $(am__append_125) \
+	$(am__append_112) $(am__append_117) $(am__append_126) \
 	$(UNW_SOURCE_FILES) utilities/last_func.c
 libhpcrun_wrap_a_SOURCES = \
 	monitor-exts/openmp.c
@@ -2122,9 +2126,9 @@ libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
 	$(am__append_21) $(am__append_28) $(am__append_41) \
 	$(am__append_56) $(am__append_74) $(am__append_87) \
 	$(am__append_101) $(am__append_106) $(am__append_108) \
-	$(am__append_117) $(am__append_120) $(am__append_123) \
-	$(am__append_127) $(am__append_131) $(am__append_135) \
-	$(am__append_139) $(UNW_INCLUDE_DIRS)
+	$(am__append_110) $(am__append_118) $(am__append_121) \
+	$(am__append_124) $(am__append_128) $(am__append_132) \
+	$(am__append_136) $(am__append_140) $(UNW_INCLUDE_DIRS)
 libhpcrun_fake_audit_la_CPPFLAGS = \
 	$(MY_CPP_DEFINES)		\
 	$(MY_INCLUDE_DIRS)
@@ -2136,8 +2140,8 @@ libhpcrun_audit_la_CPPFLAGS = \
 libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
 	$(MY_INCLUDE_DIRS) $(am__append_22) $(am__append_29) \
 	$(am__append_42) $(am__append_57) $(am__append_75) \
-	$(am__append_88) $(am__append_112) $(am__append_118) \
-	$(am__append_121) $(UNW_INCLUDE_DIRS)
+	$(am__append_88) $(am__append_113) $(am__append_119) \
+	$(am__append_122) $(UNW_INCLUDE_DIRS)
 libhpcrun_wrap_a_CPPFLAGS = \
 	-DHPCRUN_STATIC_LINK		\
 	$(MY_CPP_DEFINES)		\
@@ -2194,8 +2198,8 @@ libhpctoolkit_a_CPPFLAGS = \
 # cflags
 #-----------------------------------------------------------
 libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
-	$(am__append_124) $(am__append_128) $(am__append_132) \
-	$(am__append_136) $(am__append_140) $(GOTCHA_IFLAGS)
+	$(am__append_125) $(am__append_129) $(am__append_133) \
+	$(am__append_137) $(am__append_141) $(GOTCHA_IFLAGS)
 libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
 libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
 libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2228,7 +2232,7 @@ OUR_LZMA_A = $(top_builddir)/src/extern/lzma/liblzma.a
 libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \
 	-lpthread -lrt -L$(LIBELF_LIB) -lelf $(PERFMON_LDFLAGS_DYN) \
 	$(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \
-	$(am__append_119) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
+	$(am__append_120) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
 libhpcrun_fake_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
 libhpcrun_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
 libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic
@@ -2254,7 +2258,7 @@ libhpcrun_la_LIBADD = \
 
 libhpcrun_o_LDADD = $(PROF_LEAN_A) $(SUPPORT_LEAN_A) \
 	$(PERFMON_LDFLAGS_STAT) $(MBEDTLS_LIBS) $(OUR_LIBUNWIND_A) \
-	$(OUR_LZMA_A) $(am__append_59) $(am__append_113) \
+	$(OUR_LZMA_A) $(am__append_59) $(am__append_114) \
 	$(UNW_STATIC_LD_FLAGS)
 MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \
 	$(UNW_INCLUDE_DIRS)
@@ -3007,6 +3011,8 @@ gpu/amd/$(am__dirstamp):
 gpu/amd/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) gpu/amd/$(DEPDIR)
 	@: > gpu/amd/$(DEPDIR)/$(am__dirstamp)
+gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/$(am__dirstamp) \
+	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-roctracer-activity-translate.lo:  \
 	gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/$(am__dirstamp) \
@@ -4009,6 +4015,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
@@ -5666,6 +5673,13 @@ sample-sources/libhpcrun_la-amd.lo: sample-sources/amd.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd.lo `test -f 'sample-sources/amd.c' || echo '$(srcdir)/'`sample-sources/amd.c
 
+gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/hip-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-hip-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/amd/hip-api.c' object='gpu/amd/libhpcrun_la-hip-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
+
 gpu/amd/libhpcrun_la-roctracer-activity-translate.lo: gpu/amd/roctracer-activity-translate.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-roctracer-activity-translate.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo -c -o gpu/amd/libhpcrun_la-roctracer-activity-translate.lo `test -f 'gpu/amd/roctracer-activity-translate.c' || echo '$(srcdir)/'`gpu/amd/roctracer-activity-translate.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.c b/src/tool/hpcrun/gpu/amd/hip-api.c
index f7ac18a832..3a85e9a9f7 100644
--- a/src/tool/hpcrun/gpu/amd/hip-api.c
+++ b/src/tool/hpcrun/gpu/amd/hip-api.c
@@ -61,8 +61,7 @@
 #include <string.h>    // memset
 
 #include <roctracer_hip.h>
-
-
+// #include <hip/hip_runtime.h>
 
 //*****************************************************************************
 // local include files
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.h b/src/tool/hpcrun/gpu/amd/hip-api.h
index 459499e638..cf9b7b03e5 100644
--- a/src/tool/hpcrun/gpu/amd/hip-api.h
+++ b/src/tool/hpcrun/gpu/amd/hip-api.h
@@ -57,10 +57,11 @@
 
 
 //*****************************************************************************
-// nvidia includes
+// rocm includes
 //*****************************************************************************
 
 #include <roctracer_hip.h>
+// #include <hip/hip_runtime.h>
 
 
 
@@ -106,6 +107,6 @@ hip_device_property_query
 );
 
 int
-hip_dev_sync ();
+hip_dev_sync();
 
 #endif //hip_api_h
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 06c92f233e..893a4402bc 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -48,7 +48,7 @@
 #include "roctracer-api.h"
 #include "roctracer-activity-translate.h"
 
-// #include "hip-api.h"
+#include "hip-api.h"
 #include "rocm-debug-api.h"
 #include "rocm-binary-processing.h"
 #include "tool_state.h"
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index bc84140db0..bb5fc402fe 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -43,7 +43,7 @@
 #include <hpcrun/control-knob.h>
 #include <hpcrun/device-finalizers.h>
 #include <hpcrun/gpu/amd/roctracer-api.h>
-// #include <hpcrun/gpu/amd/hip-api.h>
+#include <hpcrun/gpu/amd/hip-api.h>
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-trace.h>
@@ -152,12 +152,12 @@ METHOD_FN(process_event_list, int lush_metrics)
     TMSG(CUDA,"nevents = %d", nevents);
 
 
-// #ifndef HPCRUN_STATIC_LINK
-//   if (hip_bind()) {
-//     EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror());
-//     monitor_real_exit(-1);
-//   }
-// #endif
+#ifndef HPCRUN_STATIC_LINK
+  if (hip_bind()) {
+    EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror());
+    monitor_real_exit(-1);
+  }
+#endif
 }
 
 static void
diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
index 92120ef7a2..0aca13a1e1 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-rocm.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
@@ -68,7 +68,7 @@
 #include <sample-sources/ss-obj-name.h>
 #include "papi-c.h"
 #include "papi-c-extended-info.h"
-
+#include <hpcrun/gpu/amd/hip-api.h>
 
 
 //******************************************************************************
@@ -158,7 +158,7 @@ papi_c_rocm_start()
 void
 papi_c_rocm_read(long long *values)
 {
-  // hip_dev_sync(); // TODO:Dejan check this out
+  hip_dev_sync(); // TODO:Dejan check this out
   int ret = PAPI_read(my_event_set, values);
   if (ret != PAPI_OK) {
     hpcrun_abort("PAPI_read of event set %d failed with %s (%d)",

From af1a0faa7b9441b04d7f60a682b1279e0dd84380 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 18 May 2021 09:10:23 -0500
Subject: [PATCH 135/177] rocm-component synchronised on read with hip_dev_sync

---
 configure                         | 2 +-
 src/tool/hpcrun/Makefile.am       | 3 ++-
 src/tool/hpcrun/Makefile.in       | 3 ++-
 src/tool/hpcrun/gpu/amd/hip-api.c | 6 +++---
 src/tool/hpcrun/gpu/amd/hip-api.h | 4 ++--
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index 1ee1190415..7701819f48 100755
--- a/configure
+++ b/configure
@@ -21736,7 +21736,6 @@ case "$OPT_PAPI" in
     fi
     OPT_PAPI_IFLAGS="-I${OPT_PAPI}/include"
 
-
     for lib in $multilib_path ; do
       lib_dir="${OPT_PAPI}/$lib"
       if test -f "${lib_dir}/libpapi.so" || test -f "${lib_dir}/libpapi.a"
@@ -22306,6 +22305,7 @@ $as_echo "$as_me: WARNING: disable papi due to possible conflict with perfmon" >
     OPT_PAPI_LIBPATH=
     use_papi_c=no
     use_papi_c_cupti=no
+    use_papi_c_rocm=no
   fi
 fi
 
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 1d09083075..c0d3e6f9fe 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -601,6 +601,7 @@ MY_INCLUDE_DIRS =			\
 	-I$(HPCFNBOUNDS_INC)		\
         $(OPT_CUDA_IFLAGS)             \
         $(OPT_CUPTI_IFLAGS)             \
+		$(ROCM_INC_FLGS) \
 	-I$(LIBELF_INC)			\
 	-I$(LIBMONITOR_INC)	\
 	$(GOTCHA_IFLAGS)	\
@@ -1057,7 +1058,7 @@ endif
 if OPT_ENABLE_ROCM
   libhpcrun_la_SOURCES  += $(MY_ROCM_FILES)
   libhpcrun_la_CPPFLAGS += -DENABLE_ROCM
-  libhpcrun_la_CFLAGS   += $(OPT_ROCM_IFLAGS)
+  libhpcrun_la_CFLAGS   += $(ROCM_INC_FLGS)
 
   MY_CPP_DEFINES  += -DHPCRUN_SS_AMD
 endif
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 3316d06482..3109e55e9d 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -294,7 +294,7 @@ host_triplet = @host@
 @OPT_ENABLE_CUDA_TRUE@am__append_126 = $(MY_CUDA_FILES)
 @OPT_ENABLE_ROCM_TRUE@am__append_127 = $(MY_ROCM_FILES)
 @OPT_ENABLE_ROCM_TRUE@am__append_128 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_129 = $(OPT_ROCM_IFLAGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_129 = $(ROCM_INC_FLGS)
 @OPT_ENABLE_ROCM_TRUE@am__append_130 = -DHPCRUN_SS_AMD
 @OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(MY_LEVEL0_FILES)
 @OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DENABLE_LEVEL0
@@ -2042,6 +2042,7 @@ MY_INCLUDE_DIRS = \
 	-I$(HPCFNBOUNDS_INC)		\
         $(OPT_CUDA_IFLAGS)             \
         $(OPT_CUPTI_IFLAGS)             \
+		$(ROCM_INC_FLGS) \
 	-I$(LIBELF_INC)			\
 	-I$(LIBMONITOR_INC)	\
 	$(GOTCHA_IFLAGS)	\
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.c b/src/tool/hpcrun/gpu/amd/hip-api.c
index 3a85e9a9f7..9dd8188c0f 100644
--- a/src/tool/hpcrun/gpu/amd/hip-api.c
+++ b/src/tool/hpcrun/gpu/amd/hip-api.c
@@ -60,8 +60,8 @@
 #include <stdio.h>
 #include <string.h>    // memset
 
-#include <roctracer_hip.h>
-// #include <hip/hip_runtime.h>
+// #include <roctracer_hip.h>
+#include <hip/hip_runtime.h>
 
 //*****************************************************************************
 // local include files
@@ -164,7 +164,7 @@ void
 {
 #ifndef HPCRUN_STATIC_LINK
   // dynamic libraries only availabile in non-static case
-  CHK_DLOPEN(hip, "libhip_hcc.so", RTLD_NOW | RTLD_GLOBAL);
+  CHK_DLOPEN(hip, "libamdhip64.so", RTLD_NOW | RTLD_GLOBAL);
 
 #define HIP_BIND(fn) \
   CHK_DLSYM(hip, fn);
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.h b/src/tool/hpcrun/gpu/amd/hip-api.h
index cf9b7b03e5..5d21ac5d6f 100644
--- a/src/tool/hpcrun/gpu/amd/hip-api.h
+++ b/src/tool/hpcrun/gpu/amd/hip-api.h
@@ -60,8 +60,8 @@
 // rocm includes
 //*****************************************************************************
 
-#include <roctracer_hip.h>
-// #include <hip/hip_runtime.h>
+// #include <roctracer_hip.h>
+#include <hip/hip_runtime.h>
 
 
 

From 6d4149af6082d89066207b51cc4a6a9fd12fc335 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 15 Sep 2021 18:30:01 -0500
Subject: [PATCH 136/177] counters works on rocm-4.3.1. ufront

---
 src/tool/hpcrun/fnbounds/fnbounds_client.c | 16 +++++++++-------
 src/tool/hpcrun/main.c                     |  2 ++
 src/tool/hpcrun/module-ignore-map.c        |  3 ++-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/tool/hpcrun/fnbounds/fnbounds_client.c b/src/tool/hpcrun/fnbounds/fnbounds_client.c
index bdacf2c534..d3fed24289 100644
--- a/src/tool/hpcrun/fnbounds/fnbounds_client.c
+++ b/src/tool/hpcrun/fnbounds/fnbounds_client.c
@@ -458,7 +458,7 @@ launch_server(void)
   struct {
     int sendfd[2], recvfd[2];
   } fds;
-  bool sampling_is_running;
+  bool sampling_is_running = false;
   pid_t pid;
 
   // already running
@@ -476,13 +476,15 @@ launch_server(void)
     return -1;
   }
 
-  // some sample sources need to be stopped in the parent, or else
-  // they cause problems in the child.
-  sampling_is_running = SAMPLE_SOURCES(started);
-  if (sampling_is_running) {
-    SAMPLE_SOURCES(stop);
+  if (hpcrun_is_initialized()){
+    // some sample sources need to be stopped in the parent, or else
+    // they cause problems in the child.
+    sampling_is_running = SAMPLE_SOURCES(started);
+    if (sampling_is_running) {
+      SAMPLE_SOURCES(stop);
+    }
   }
-
+  
   // For safety, we don't assume the direction of stack growth
   pid = auditor_exports->clone(hpcfnbounds_child,
     &server_stack[SERVER_STACK_SIZE * 1024], SIGCHLD, &fds);
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index d367c2af81..634a16d40f 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -931,6 +931,8 @@ monitor_init_process(int *argc, char **argv, void* data)
     #ifndef HPCRUN_STATIC_LINK
       auditor_exports->mainlib_connected(get_saved_vdso_path());
     #endif
+  }else{
+    hpcrun_initializer_init();
   }
   
  fork_data_t* fork_data = (fork_data_t*) data;
diff --git a/src/tool/hpcrun/module-ignore-map.c b/src/tool/hpcrun/module-ignore-map.c
index d810be841b..6f5c11ed0a 100644
--- a/src/tool/hpcrun/module-ignore-map.c
+++ b/src/tool/hpcrun/module-ignore-map.c
@@ -108,7 +108,7 @@
 // where any GPU can indicate that its functions should be added to
 // the module ignore map when that type of GPU is being monitored.
 
-#define NUM_FNS ( sizeof(IGNORE_FNS) / sizeof(char *) )
+#define NUM_FNS 10 //( sizeof(IGNORE_FNS) / sizeof(char *) )
 
 
 
@@ -132,6 +132,7 @@ static const char *IGNORE_FNS[] = {
   "cuLaunchKernel",
   "cudaLaunchKernel",
   "cuptiActivityEnable",
+  "rocprofiler_iterate_info",
   "roctracer_set_properties",  // amd roctracer library
   "amd_dbgapi_initialize",     // amd debug library
   "hipKernelNameRefByPtr",     // amd hip runtime

From fbbc0db6f233ee1c0a9b68f8be6940dda1f96005 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Thu, 7 Oct 2021 18:14:55 -0500
Subject: [PATCH 137/177] simplify registration of ompt callbacks

---
 src/tool/hpcrun/ompt/ompt-interface.c | 45 +++++++++++++--------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/tool/hpcrun/ompt/ompt-interface.c b/src/tool/hpcrun/ompt/ompt-interface.c
index 1c66637f85..2aa707197c 100644
--- a/src/tool/hpcrun/ompt/ompt-interface.c
+++ b/src/tool/hpcrun/ompt/ompt-interface.c
@@ -477,11 +477,9 @@ init_threads
  void
 )
 {
-  ompt_set_callback_fn
-    (ompt_callback_thread_begin, (ompt_callback_t)ompt_thread_begin);
+  ompt_set_callback(ompt_callback_thread_begin, ompt_thread_begin);
 
-  ompt_set_callback_fn
-    (ompt_callback_thread_end, (ompt_callback_t) ompt_thread_end);
+  ompt_set_callback(ompt_callback_thread_end, ompt_thread_end);
 }
 
 
@@ -491,7 +489,7 @@ init_parallel_regions
  void
 )
 {
-  ompt_parallel_region_register_callbacks(ompt_set_callback_fn);
+  ompt_parallel_region_register_callbacks(ompt_set_callback_internal);
   ompt_regions_init();
 }
 
@@ -502,7 +500,7 @@ init_tasks
  void
 )
 {
-  ompt_task_register_callbacks(ompt_set_callback_fn);
+  ompt_task_register_callbacks(ompt_set_callback_internal);
 }
 
 
@@ -520,8 +518,8 @@ init_mutex_blame_shift
 
   if (!ompt_mutex_blame_requested) return;
 
-  retval = ompt_set_callback_fn(ompt_callback_mutex_released,
-                                (ompt_callback_t) ompt_mutex_blame_accept);
+  retval = ompt_set_callback(ompt_callback_mutex_released, 
+                             ompt_mutex_blame_accept);
   mutex_blame_shift_avail |= ompt_event_may_occur(retval);
 
 
@@ -557,13 +555,11 @@ init_idle_blame_shift
 #if 0
   ompt_idle_blame_shift_request();
 
-  retval = ompt_set_callback_fn(ompt_callback_idle,
-                                (ompt_callback_t)ompt_idle);
+  retval = ompt_set_callback(ompt_callback_idle, ompt_idle);
   idle_blame_shift_avail |= ompt_event_may_occur(retval);
 #endif
 
-  retval = ompt_set_callback_fn(ompt_callback_sync_region_wait,
-                                (ompt_callback_t)ompt_sync);
+  retval = ompt_set_callback(ompt_callback_sync_region_wait, ompt_sync);
   idle_blame_shift_avail |= ompt_event_may_occur(retval);
 
 
@@ -1029,18 +1025,19 @@ hpcrun_ompt_get_parent_region_data
 int
 hpcrun_ompt_get_thread_num(int level)
 {
-    if (ompt_initialized) {
-        int task_type_flags;
-        ompt_data_t *task_data = NULL;
-        ompt_data_t *parallel_data = NULL;
-        ompt_frame_t *task_frame = NULL;
-        int thread_num = 0;
-
-        ompt_get_task_info_fn(level, &task_type_flags, &task_data, &task_frame, &parallel_data, &thread_num);
-        //printf("Task frame pointer = %p\n", task_frame);
-        return thread_num;
-    }
-    return -1;
+  if (ompt_initialized) {
+    int task_type_flags;
+    ompt_data_t *task_data = NULL;
+    ompt_data_t *parallel_data = NULL;
+    ompt_frame_t *task_frame = NULL;
+    int thread_num = 0;
+
+    ompt_get_task_info_fn(level, &task_type_flags, &task_data,
+			  &task_frame, &parallel_data, &thread_num);
+    //printf("Task frame pointer = %p\n", task_frame);
+    return thread_num;
+  }
+  return -1;
 }
 
 

From 54d4ec608097c0a602e439d8dd6cba9739f1d85b Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Fri, 8 Oct 2021 15:44:22 -0500
Subject: [PATCH 138/177] add initial handshake from fnbounds2 to application

We have seen the first message from the application
to fnbounds2 get lost. Adding this initial handshake
in the reverse direction avoids that problem.
---
 src/tool/hpcfnbounds2/server.c             | 2 ++
 src/tool/hpcrun/fnbounds/fnbounds_client.c | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/src/tool/hpcfnbounds2/server.c b/src/tool/hpcfnbounds2/server.c
index ff3a2b1cf8..903c2c723b 100644
--- a/src/tool/hpcfnbounds2/server.c
+++ b/src/tool/hpcfnbounds2/server.c
@@ -135,6 +135,8 @@ init_server (DiscoverFnTy fn_discovery, int fd1, int fd2)
   }
   signal_handler_init();
 
+  write_mesg(77, 0);
+
   for (;;) {
     int ret = read_mesg(&mesg);
 
diff --git a/src/tool/hpcrun/fnbounds/fnbounds_client.c b/src/tool/hpcrun/fnbounds/fnbounds_client.c
index aee0855e56..b605318c7d 100644
--- a/src/tool/hpcrun/fnbounds/fnbounds_client.c
+++ b/src/tool/hpcrun/fnbounds/fnbounds_client.c
@@ -555,6 +555,10 @@ launch_server(void)
 
   TMSG(FNBOUNDS_CLIENT, "syserv launch: success, child shim: %d, server: %d", (int) child_pid, (int) server_pid);
 
+  // Server talks first, but we don't care about the actual message
+  struct syserv_mesg mesg;
+  read_mesg(&mesg);
+
   // restart sample sources
   if (sampling_is_running) {
     SAMPLE_SOURCES(start);

From 5b333e2d863544d742135e4d6770c74f6601f381 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Wed, 20 Oct 2021 22:25:13 -0500
Subject: [PATCH 139/177] 1st draft of hpctoolkit ompt support for AMD gpus

---
 doc/man/Makefile.in                           |  13 +-
 src/tool/hpcrun/Makefile.am                   |   4 +
 src/tool/hpcrun/Makefile.in                   | 149 +++++--
 src/tool/hpcrun/gpu/gpu-activity-process.c    |  72 +++-
 src/tool/hpcrun/gpu/gpu-activity.c            |   2 +-
 src/tool/hpcrun/gpu/gpu-activity.h            |   2 +
 .../hpcrun/gpu/gpu-host-correlation-map.c     |  10 +-
 src/tool/hpcrun/ompt/omp-tools.h              | 364 ++++++++++++++++--
 src/tool/hpcrun/ompt/ompt-device.c            | 333 +++++++++++-----
 src/tool/hpcrun/ompt/ompt-device.h            |  10 -
 src/tool/hpcrun/ompt/ompt-interface.c         |  11 +-
 src/tool/hpcrun/sample-sources/ss-list.h      |   2 +
 12 files changed, 807 insertions(+), 165 deletions(-)

diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in
index 9a6e4d5e70..67cb362dc1 100644
--- a/doc/man/Makefile.in
+++ b/doc/man/Makefile.in
@@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.16.1 from Makefile.am.
+# Makefile.in generated by automake 1.15.1 from Makefile.am.
 # @configure_input@
 
-# Copyright (C) 1994-2018 Free Software Foundation, Inc.
+# Copyright (C) 1994-2017 Free Software Foundation, Inc.
 
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -490,8 +490,8 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	  *config.status*) \
 	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
 	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
 	esac;
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
@@ -581,10 +581,7 @@ ctags CTAGS:
 cscope cscopelist:
 
 
-distdir: $(BUILT_SOURCES)
-	$(MAKE) $(AM_MAKEFLAGS) distdir-am
-
-distdir-am: $(DISTFILES)
+distdir: $(DISTFILES)
 	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
 	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
 	list='$(DISTFILES)'; \
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 81ef198492..3a8a7f196c 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -387,6 +387,10 @@ MY_BASE_FILES =				\
 	gpu/gpu-trace-channel-set.c	\
 	gpu/gpu-trace-demultiplexer.c	\
 	\
+	gpu/ompt/ompt-gpu-api.c 			\
+	gpu/ompt/ompt-activity-translate.c 		\
+	sample-sources/openmp-target.c                  \
+	\
 	ompt/ompt-callstack.c           \
 	ompt/ompt-defer.c               \
 	ompt/ompt-device.c              \
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index e9125a1814..d15faaebc2 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -495,13 +495,14 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
 	gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
-	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
-	ompt/ompt-defer-write.c ompt/ompt-interface.c \
-	ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
-	ompt/ompt-placeholders.c ompt/ompt-device-map.c \
-	ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \
-	extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \
-	syscalls/select.c syscalls/sysv_signal.c \
+	gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \
+	sample-sources/openmp-target.c ompt/ompt-callstack.c \
+	ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
+	ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
+	ompt/ompt-region-debug.c ompt/ompt-placeholders.c \
+	ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
+	extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \
+	syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \
 	utilities/executable-path.h utilities/executable-path.c \
 	utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
 	utilities/ip-normalized.h utilities/ip-normalized.c \
@@ -677,6 +678,9 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
 	gpu/libhpcrun_la-gpu-trace-item.lo \
 	gpu/libhpcrun_la-gpu-trace-channel-set.lo \
 	gpu/libhpcrun_la-gpu-trace-demultiplexer.lo \
+	gpu/ompt/libhpcrun_la-ompt-gpu-api.lo \
+	gpu/ompt/libhpcrun_la-ompt-activity-translate.lo \
+	sample-sources/libhpcrun_la-openmp-target.lo \
 	ompt/libhpcrun_la-ompt-callstack.lo \
 	ompt/libhpcrun_la-ompt-defer.lo \
 	ompt/libhpcrun_la-ompt-device.lo \
@@ -988,13 +992,14 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
 	gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
-	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
-	ompt/ompt-defer-write.c ompt/ompt-interface.c \
-	ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
-	ompt/ompt-placeholders.c ompt/ompt-device-map.c \
-	ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \
-	extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \
-	syscalls/select.c syscalls/sysv_signal.c \
+	gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \
+	sample-sources/openmp-target.c ompt/ompt-callstack.c \
+	ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
+	ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
+	ompt/ompt-region-debug.c ompt/ompt-placeholders.c \
+	ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
+	extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \
+	syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \
 	utilities/executable-path.h utilities/executable-path.c \
 	utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
 	utilities/ip-normalized.h utilities/ip-normalized.c \
@@ -1164,6 +1169,9 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace-item.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT) \
 	gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT) \
+	gpu/ompt/libhpcrun_o-ompt-gpu-api.$(OBJEXT) \
+	gpu/ompt/libhpcrun_o-ompt-activity-translate.$(OBJEXT) \
+	sample-sources/libhpcrun_o-openmp-target.$(OBJEXT) \
 	ompt/libhpcrun_o-ompt-callstack.$(OBJEXT) \
 	ompt/libhpcrun_o-ompt-defer.$(OBJEXT) \
 	ompt/libhpcrun_o-ompt-device.$(OBJEXT) \
@@ -1919,13 +1927,14 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
 	gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
 	gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
 	gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
-	ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
-	ompt/ompt-defer-write.c ompt/ompt-interface.c \
-	ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
-	ompt/ompt-placeholders.c ompt/ompt-device-map.c \
-	ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \
-	extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \
-	syscalls/select.c syscalls/sysv_signal.c \
+	gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \
+	sample-sources/openmp-target.c ompt/ompt-callstack.c \
+	ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
+	ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
+	ompt/ompt-region-debug.c ompt/ompt-placeholders.c \
+	ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
+	extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \
+	syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \
 	utilities/executable-path.h utilities/executable-path.c \
 	utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
 	utilities/ip-normalized.h utilities/ip-normalized.c \
@@ -2745,6 +2754,19 @@ gpu/libhpcrun_la-gpu-trace-channel-set.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/$(am__dirstamp):
+	@$(MKDIR_P) gpu/ompt
+	@: > gpu/ompt/$(am__dirstamp)
+gpu/ompt/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) gpu/ompt/$(DEPDIR)
+	@: > gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_la-ompt-gpu-api.lo: gpu/ompt/$(am__dirstamp) \
+	gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_la-ompt-activity-translate.lo:  \
+	gpu/ompt/$(am__dirstamp) gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-openmp-target.lo:  \
+	sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
 ompt/$(am__dirstamp):
 	@$(MKDIR_P) ompt
 	@: > ompt/$(am__dirstamp)
@@ -3435,6 +3457,13 @@ gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT): gpu/$(am__dirstamp) \
 	gpu/$(DEPDIR)/$(am__dirstamp)
 gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT):  \
 	gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_o-ompt-gpu-api.$(OBJEXT): gpu/ompt/$(am__dirstamp) \
+	gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_o-ompt-activity-translate.$(OBJEXT):  \
+	gpu/ompt/$(am__dirstamp) gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_o-openmp-target.$(OBJEXT):  \
+	sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
 ompt/libhpcrun_o-ompt-callstack.$(OBJEXT): ompt/$(am__dirstamp) \
 	ompt/$(DEPDIR)/$(am__dirstamp)
 ompt/libhpcrun_o-ompt-defer.$(OBJEXT): ompt/$(am__dirstamp) \
@@ -3788,6 +3817,8 @@ mostlyclean-compile:
 	-rm -f gpu/level0/*.lo
 	-rm -f gpu/nvidia/*.$(OBJEXT)
 	-rm -f gpu/nvidia/*.lo
+	-rm -f gpu/ompt/*.$(OBJEXT)
+	-rm -f gpu/ompt/*.lo
 	-rm -f gpu/opencl/*.$(OBJEXT)
 	-rm -f gpu/opencl/*.lo
 	-rm -f lush-agents/*.$(OBJEXT)
@@ -4038,6 +4069,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-analysis.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo@am__quote@
@@ -4118,6 +4153,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-omp-idle.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-omp-mutex.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-cupti.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-extended-info.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo@am__quote@
@@ -4140,6 +4176,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-nvidia.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-idle.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-mutex.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-cupti.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-extended-info.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po@am__quote@
@@ -5202,6 +5239,27 @@ gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/gpu-trace-demultiplexer.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-trace-demultiplexer.lo `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c
 
+gpu/ompt/libhpcrun_la-ompt-gpu-api.lo: gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_la-ompt-gpu-api.lo -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_la-ompt-gpu-api.lo `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_la-ompt-gpu-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_la-ompt-gpu-api.lo `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+
+gpu/ompt/libhpcrun_la-ompt-activity-translate.lo: gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_la-ompt-activity-translate.lo -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_la-ompt-activity-translate.lo `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_la-ompt-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_la-ompt-activity-translate.lo `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+
+sample-sources/libhpcrun_la-openmp-target.lo: sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-openmp-target.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Tpo -c -o sample-sources/libhpcrun_la-openmp-target.lo `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_la-openmp-target.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-openmp-target.lo `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+
 ompt/libhpcrun_la-ompt-callstack.lo: ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_la-ompt-callstack.lo -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo -c -o ompt/libhpcrun_la-ompt-callstack.lo `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Plo
@@ -7512,6 +7570,48 @@ gpu/libhpcrun_o-gpu-trace-demultiplexer.obj: gpu/gpu-trace-demultiplexer.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.obj `if test -f 'gpu/gpu-trace-demultiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-trace-demultiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-trace-demultiplexer.c'; fi`
 
+gpu/ompt/libhpcrun_o-ompt-gpu-api.o: gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-gpu-api.o -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.o `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_o-ompt-gpu-api.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.o `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+
+gpu/ompt/libhpcrun_o-ompt-gpu-api.obj: gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-gpu-api.obj -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.obj `if test -f 'gpu/ompt/ompt-gpu-api.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-gpu-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-gpu-api.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_o-ompt-gpu-api.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.obj `if test -f 'gpu/ompt/ompt-gpu-api.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-gpu-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-gpu-api.c'; fi`
+
+gpu/ompt/libhpcrun_o-ompt-activity-translate.o: gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-activity-translate.o -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.o `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_o-ompt-activity-translate.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.o `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+
+gpu/ompt/libhpcrun_o-ompt-activity-translate.obj: gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-activity-translate.obj -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.obj `if test -f 'gpu/ompt/ompt-activity-translate.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-activity-translate.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_o-ompt-activity-translate.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.obj `if test -f 'gpu/ompt/ompt-activity-translate.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-activity-translate.c'; fi`
+
+sample-sources/libhpcrun_o-openmp-target.o: sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-openmp-target.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo -c -o sample-sources/libhpcrun_o-openmp-target.o `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_o-openmp-target.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-openmp-target.o `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+
+sample-sources/libhpcrun_o-openmp-target.obj: sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-openmp-target.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo -c -o sample-sources/libhpcrun_o-openmp-target.obj `if test -f 'sample-sources/openmp-target.c'; then $(CYGPATH_W) 'sample-sources/openmp-target.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/openmp-target.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_o-openmp-target.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-openmp-target.obj `if test -f 'sample-sources/openmp-target.c'; then $(CYGPATH_W) 'sample-sources/openmp-target.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/openmp-target.c'; fi`
+
 ompt/libhpcrun_o-ompt-callstack.o: ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_o-ompt-callstack.o -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo -c -o ompt/libhpcrun_o-ompt-callstack.o `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Po
@@ -8842,6 +8942,7 @@ clean-libtool:
 	-rm -rf gpu/instrumentation/.libs gpu/instrumentation/_libs
 	-rm -rf gpu/level0/.libs gpu/level0/_libs
 	-rm -rf gpu/nvidia/.libs gpu/nvidia/_libs
+	-rm -rf gpu/ompt/.libs gpu/ompt/_libs
 	-rm -rf gpu/opencl/.libs gpu/opencl/_libs
 	-rm -rf lush/.libs lush/_libs
 	-rm -rf lush-agents/.libs lush-agents/_libs
@@ -9104,6 +9205,8 @@ distclean-generic:
 	-rm -f gpu/level0/$(am__dirstamp)
 	-rm -f gpu/nvidia/$(DEPDIR)/$(am__dirstamp)
 	-rm -f gpu/nvidia/$(am__dirstamp)
+	-rm -f gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+	-rm -f gpu/ompt/$(am__dirstamp)
 	-rm -f gpu/opencl/$(DEPDIR)/$(am__dirstamp)
 	-rm -f gpu/opencl/$(am__dirstamp)
 	-rm -f lush-agents/$(DEPDIR)/$(am__dirstamp)
@@ -9171,7 +9274,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
 	clean-pkglibLTLIBRARIES mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/ompt/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
@@ -9220,7 +9323,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/ompt/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index b499bc43cf..e554838b80 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -73,7 +73,7 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 0
+#define DEBUG 1
 
 #include "gpu-print.h"
 
@@ -140,9 +140,57 @@ attribute_activity
   gpu_activity_channel_t *channel =
     gpu_host_correlation_map_entry_channel_get(hc);
   activity->cct_node = cct_node;
+
+  PRINT("attributing activity to %p time = [%lu,%lu)\n",
+	cct_node, activity->details.interval.start, activity->details.interval.end);
+
   gpu_activity_channel_produce(channel, activity);
 }
 
+// Resolve the host CCT node for a memcpy activity from its copy kind, trace
+// the activity on its context/stream, then pass it to attribute_activity.
+static void
+gpu_memcpy_process_helper
+(
+ gpu_activity_t *activity,
+ gpu_host_correlation_map_entry_t *host_op_entry
+)
+{
+  gpu_placeholder_type_t mct;
+  switch (activity->details.memcpy.copyKind) {
+  case GPU_MEMCPY_H2D:
+    mct = gpu_placeholder_type_copyin;
+    break;
+  case GPU_MEMCPY_D2H:
+    mct = gpu_placeholder_type_copyout;
+    break;
+  default:
+    mct = gpu_placeholder_type_copy;
+    break;
+  }
+  cct_node_t *host_op_node =
+    gpu_host_correlation_map_entry_op_cct_get(host_op_entry, mct);
+  if (host_op_node == NULL) {
+    // No exact match for this copy kind (e.g. cuMemcpy): use the generic copy
+    host_op_node = gpu_host_correlation_map_entry_op_cct_get(host_op_entry,
+							     gpu_placeholder_type_copy);
+  }
+  assert(host_op_node != NULL);
+  gpu_trace_item_t entry_trace;
+  trace_item_set(&entry_trace, activity, host_op_entry, host_op_node);
+  gpu_context_stream_trace
+    (activity->details.memcpy.context_id, activity->details.memcpy.stream_id,
+     &entry_trace);
+  uint32_t correlation_id = activity->details.memcpy.correlation_id;
+  PRINT("attributing memcpy activity %u host_op_entry %p time = [%llu,%llu)\n",
+	correlation_id, (void *) host_op_entry,
+	(unsigned long long) activity->details.interval.start,
+	(unsigned long long) activity->details.interval.end);
+  attribute_activity(host_op_entry, activity, host_op_node);
+  //FIXME(keren): In OpenMP, an external_id may map to multiple cct_nodes
+  //gpu_host_correlation_map_delete(external_id);
+}
+
 
 static void
 gpu_memcpy_process
@@ -159,6 +207,7 @@ gpu_memcpy_process
     gpu_host_correlation_map_entry_t *host_op_entry =
       gpu_host_correlation_map_lookup(external_id);
     if (host_op_entry != NULL) {
+#if 0
       gpu_placeholder_type_t mct;
       switch (activity->details.memcpy.copyKind) {
         case GPU_MEMCPY_H2D:
@@ -191,10 +240,20 @@ gpu_memcpy_process
       attribute_activity(host_op_entry, activity, host_op_node);
       //FIXME(keren): In OpenMP, an external_id may maps to multiple cct_nodes
       //gpu_host_correlation_map_delete(external_id);
+#else
+      gpu_memcpy_process_helper(activity, host_op_entry);
+#endif
     }
     gpu_correlation_id_map_delete(correlation_id);
   } else {
-    PRINT("Memcpy copy correlation_id %u cannot be found\n", correlation_id);
+    gpu_host_correlation_map_entry_t *host_op_entry =
+      gpu_host_correlation_map_lookup(correlation_id);
+    if (host_op_entry != NULL) {
+      PRINT("INVOKING memcpy helper %u\n", correlation_id);
+      gpu_memcpy_process_helper(activity, host_op_entry);
+    } else {
+      PRINT("Memcpy copy correlation_id %u cannot be found\n", correlation_id);
+    }
   }
   PRINT("Memcpy copy CorrelationId %u\n", correlation_id);
   PRINT("Memcpy copy kind %u\n", activity->details.memcpy.copyKind);
@@ -597,6 +656,15 @@ gpu_memory_process
       assert(host_op_node != NULL);
       // Memory allocation does not always happen on the device
       // Do not send it to trace channels
+
+      gpu_trace_item_t entry_trace;
+      trace_item_set(&entry_trace, activity, host_op_entry, host_op_node);
+
+      gpu_context_stream_trace
+	(activity->details.memory.context_id,
+	 activity->details.memory.stream_id,
+	 &entry_trace);
+
       attribute_activity(host_op_entry, activity, host_op_node);
     }
     gpu_correlation_id_map_delete(correlation_id);
diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c
index 5e50a32e3c..a05a5d8f74 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.c
+++ b/src/tool/hpcrun/gpu/gpu-activity.c
@@ -66,7 +66,7 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 0
+#define DEBUG 1
 
 
 #define FORALL_OPENCL_KINDS(macro)					\
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index d098880708..a8d3963f31 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -231,6 +231,8 @@ typedef struct gpu_memory_t {
   uint64_t end;
   uint64_t bytes;
   uint32_t correlation_id;
+  uint32_t context_id;
+  uint32_t stream_id;
   gpu_mem_type_t memKind;
 } gpu_memory_t;
 
diff --git a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
index 981c780367..b30fc59993 100644
--- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
+++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
@@ -68,7 +68,7 @@
 // macros
 //******************************************************************************
 
-#define DEBUG 0
+#define DEBUG 1
 
 #include "gpu-print.h"
 
@@ -200,7 +200,8 @@ gpu_host_correlation_map_lookup
 {
   gpu_host_correlation_map_entry_t *result = st_lookup(&map_root, host_correlation_id);
 
-  PRINT("host_correlation_map lookup: id=0x%lx (entry %p)\n", host_correlation_id, result);
+  PRINT("host_correlation_map lookup: id=0x%lx (entry %p) (&map_root=%p) tid=%llu\n",
+	host_correlation_id, result, &map_root, (unsigned long long) pthread_self());
 
   return result;
 }
@@ -234,8 +235,9 @@ gpu_host_correlation_map_insert
     st_insert(&map_root, entry);
 
     PRINT("host_correlation_map insert: correlation_id=0x%lx "
-	 "activity_channel=%p (entry=%p)\n",
-	  host_correlation_id, activity_channel, entry);
+	 "activity_channel=%p (entry=%p) (&map_root=%p) tid=%llu\n",
+	  host_correlation_id, activity_channel, entry, &map_root,
+	  (unsigned long long) pthread_self());
   }
 }
 
diff --git a/src/tool/hpcrun/ompt/omp-tools.h b/src/tool/hpcrun/ompt/omp-tools.h
index 43788206d2..ffa406ab86 100644
--- a/src/tool/hpcrun/ompt/omp-tools.h
+++ b/src/tool/hpcrun/ompt/omp-tools.h
@@ -1,5 +1,5 @@
 /*
- * include/50/omp-tools.h.var
+ * include/omp-tools.h.var
  */
 
 //===----------------------------------------------------------------------===//
@@ -20,6 +20,16 @@
 #include <stdint.h>
 #include <stddef.h>
 
+#ifdef DEPRECATION_WARNINGS /* opt-in: otherwise DEPRECATED_51 expands to nothing */
+# ifdef __cplusplus
+# define DEPRECATED_51 [[deprecated("as of 5.1")]]
+# else /* C: GNU attribute syntax */
+# define DEPRECATED_51 __attribute__((deprecated("as of 5.1")))
+#endif /* __cplusplus */
+#else /* !DEPRECATION_WARNINGS */
+#define DEPRECATED_51
+#endif /* DEPRECATION_WARNINGS */
+
 /*****************************************************************************
  * iteration macros
  *****************************************************************************/
@@ -133,7 +143,7 @@
                                                                                                                          \
     macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
                                                                                                                          \
-    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+    macro (ompt_callback_masked,            ompt_callback_masked_t,            21) /* task at masked begin or end     */ \
                                                                                                                          \
     macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
                                                                                                                          \
@@ -153,7 +163,26 @@
                                                                                                                          \
     macro (ompt_callback_reduction,         ompt_callback_sync_region_t,       31) /* reduction                       */ \
                                                                                                                          \
-    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */
+    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */ \
+    macro (ompt_callback_target_emi,        ompt_callback_target_emi_t,        33) /* target                          */ \
+    macro (ompt_callback_target_data_op_emi,ompt_callback_target_data_op_emi_t,34) /* target data op                  */ \
+    macro (ompt_callback_target_submit_emi, ompt_callback_target_submit_emi_t, 35) /* target submit                   */ \
+    macro (ompt_callback_target_map_emi,    ompt_callback_target_map_emi_t,    36) /* target map                      */ \
+    macro (ompt_callback_error,             ompt_callback_error_t,             37) /* error                           */
+
+#define FOREACH_OMPT_TARGET_CALLBACK(macro) \
+  macro(ompt_callback_device_initialize)    \
+  macro(ompt_callback_device_finalize)      \
+  macro(ompt_callback_device_load)          \
+  macro(ompt_callback_device_unload)        \
+  macro(ompt_callback_target)               \
+  macro(ompt_callback_target_map)           \
+  macro(ompt_callback_target_data_op)       \
+  macro(ompt_callback_target_submit)        \
+  macro(ompt_callback_target_data_op_emi)   \
+  macro(ompt_callback_target_emi)           \
+  macro(ompt_callback_target_map_emi)       \
+  macro(ompt_callback_target_submit_emi)
 
 /*****************************************************************************
  * implementation specific types
@@ -190,7 +219,8 @@ typedef enum ompt_callbacks_t {
   ompt_callback_dependences              = 18,
   ompt_callback_task_dependence          = 19,
   ompt_callback_work                     = 20,
-  ompt_callback_master                   = 21,
+  ompt_callback_master     DEPRECATED_51 = 21,
+  ompt_callback_masked                   = 21,
   ompt_callback_target_map               = 22,
   ompt_callback_sync_region              = 23,
   ompt_callback_lock_init                = 24,
@@ -201,7 +231,12 @@ typedef enum ompt_callbacks_t {
   ompt_callback_flush                    = 29,
   ompt_callback_cancel                   = 30,
   ompt_callback_reduction                = 31,
-  ompt_callback_dispatch                 = 32
+  ompt_callback_dispatch                 = 32,
+  ompt_callback_target_emi               = 33,
+  ompt_callback_target_data_op_emi       = 34,
+  ompt_callback_target_submit_emi        = 35,
+  ompt_callback_target_map_emi           = 36,
+  ompt_callback_error                    = 37
 } ompt_callbacks_t;
 
 typedef enum ompt_record_t {
@@ -239,7 +274,8 @@ typedef enum ompt_thread_t {
 
 typedef enum ompt_scope_endpoint_t {
   ompt_scope_begin                    = 1,
-  ompt_scope_end                      = 2
+  ompt_scope_end                      = 2,
+  ompt_scope_beginend                 = 3
 } ompt_scope_endpoint_t;
 
 typedef enum ompt_dispatch_t {
@@ -248,22 +284,29 @@ typedef enum ompt_dispatch_t {
 } ompt_dispatch_t;
 
 typedef enum ompt_sync_region_t {
-  ompt_sync_region_barrier                = 1,
-  ompt_sync_region_barrier_implicit       = 2,
+  ompt_sync_region_barrier                DEPRECATED_51 = 1,
+  ompt_sync_region_barrier_implicit       DEPRECATED_51 = 2,
   ompt_sync_region_barrier_explicit       = 3,
   ompt_sync_region_barrier_implementation = 4,
   ompt_sync_region_taskwait               = 5,
   ompt_sync_region_taskgroup              = 6,
-  ompt_sync_region_reduction              = 7
+  ompt_sync_region_reduction              = 7,
+  ompt_sync_region_barrier_implicit_workshare = 8,
+  ompt_sync_region_barrier_implicit_parallel = 9,
+  ompt_sync_region_barrier_teams = 10
 } ompt_sync_region_t;
 
 typedef enum ompt_target_data_op_t {
-  ompt_target_data_alloc                = 1,
-  ompt_target_data_transfer_to_device   = 2,
-  ompt_target_data_transfer_from_device = 3,
-  ompt_target_data_delete               = 4,
-  ompt_target_data_associate            = 5,
-  ompt_target_data_disassociate         = 6
+  ompt_target_data_alloc                      = 1,
+  ompt_target_data_transfer_to_device         = 2,
+  ompt_target_data_transfer_from_device       = 3,
+  ompt_target_data_delete                     = 4,
+  ompt_target_data_associate                  = 5,
+  ompt_target_data_disassociate               = 6,
+  ompt_target_data_alloc_async                = 17,
+  ompt_target_data_transfer_to_device_async   = 18,
+  ompt_target_data_transfer_from_device_async = 19,
+  ompt_target_data_delete_async               = 20
 } ompt_target_data_op_t;
 
 typedef enum ompt_work_t {
@@ -273,7 +316,8 @@ typedef enum ompt_work_t {
   ompt_work_single_other       = 4,
   ompt_work_workshare          = 5,
   ompt_work_distribute         = 6,
-  ompt_work_taskloop           = 7
+  ompt_work_taskloop           = 7,
+  ompt_work_scope              = 8
 } ompt_work_t;
 
 typedef enum ompt_mutex_t {
@@ -302,6 +346,7 @@ typedef enum ompt_task_flag_t {
   ompt_task_implicit                  = 0x00000002,
   ompt_task_explicit                  = 0x00000004,
   ompt_task_target                    = 0x00000008,
+  ompt_task_taskwait                  = 0x00000010,
   ompt_task_undeferred                = 0x08000000,
   ompt_task_untied                    = 0x10000000,
   ompt_task_final                     = 0x20000000,
@@ -316,14 +361,19 @@ typedef enum ompt_task_status_t {
   ompt_task_detach        = 4,
   ompt_task_early_fulfill = 5,
   ompt_task_late_fulfill  = 6,
-  ompt_task_switch        = 7
+  ompt_task_switch        = 7,
+  ompt_taskwait_complete  = 8
 } ompt_task_status_t;
 
 typedef enum ompt_target_t {
   ompt_target                         = 1,
   ompt_target_enter_data              = 2,
   ompt_target_exit_data               = 3,
-  ompt_target_update                  = 4
+  ompt_target_update                  = 4,
+  ompt_target_nowait                  = 9,
+  ompt_target_enter_data_nowait       = 10,
+  ompt_target_exit_data_nowait        = 11,
+  ompt_target_update_nowait           = 12
 } ompt_target_t;
 
 typedef enum ompt_parallel_flag_t {
@@ -348,9 +398,15 @@ typedef enum ompt_dependence_type_t {
   ompt_dependence_type_inout           = 3,
   ompt_dependence_type_mutexinoutset   = 4,
   ompt_dependence_type_source          = 5,
-  ompt_dependence_type_sink            = 6
+  ompt_dependence_type_sink            = 6,
+  ompt_dependence_type_inoutset        = 7
 } ompt_dependence_type_t;
 
+typedef enum ompt_severity_t {
+  ompt_warning                         = 1,
+  ompt_fatal                           = 2
+} ompt_severity_t;
+
 typedef enum ompt_cancel_flag_t {
   ompt_cancel_parallel       = 0x01,
   ompt_cancel_sections       = 0x02,
@@ -371,18 +427,20 @@ typedef enum ompt_frame_flag_t {
   ompt_frame_cfa            = 0x10,
   ompt_frame_framepointer   = 0x20,
   ompt_frame_stackaddress   = 0x30
-} ompt_frame_flag_t; 
+} ompt_frame_flag_t;
 
 typedef enum ompt_state_t {
   ompt_state_work_serial                      = 0x000,
   ompt_state_work_parallel                    = 0x001,
   ompt_state_work_reduction                   = 0x002,
 
-  ompt_state_wait_barrier                     = 0x010,
+  ompt_state_wait_barrier                     DEPRECATED_51 = 0x010,
   ompt_state_wait_barrier_implicit_parallel   = 0x011,
   ompt_state_wait_barrier_implicit_workshare  = 0x012,
-  ompt_state_wait_barrier_implicit            = 0x013,
+  ompt_state_wait_barrier_implicit            DEPRECATED_51 = 0x013,
   ompt_state_wait_barrier_explicit            = 0x014,
+  ompt_state_wait_barrier_implementation      = 0x015,
+  ompt_state_wait_barrier_teams               = 0x016,
 
   ompt_state_wait_taskwait                    = 0x020,
   ompt_state_wait_taskgroup                   = 0x021,
@@ -439,6 +497,8 @@ typedef enum ompd_rc_t {
   ompd_rc_device_read_error = 8,
   ompd_rc_device_write_error = 9,
   ompd_rc_nomem = 10,
+  ompd_rc_incomplete = 11,
+  ompd_rc_callback_error = 12
 } ompd_rc_t;
 
 typedef void (*ompt_interface_fn_t) (void);
@@ -720,14 +780,14 @@ typedef void (*ompt_callback_dispatch_t) (
   ompt_data_t *parallel_data,
   ompt_data_t *task_data,
   ompt_dispatch_t kind,
-  ompt_data_t instance 
+  ompt_data_t instance
 );
 
 typedef struct ompt_record_dispatch_t {
   ompt_id_t parallel_id;
   ompt_id_t task_id;
   ompt_dispatch_t kind;
-  ompt_data_t instance; 
+  ompt_data_t instance;
 } ompt_record_dispatch_t;
 
 typedef void (*ompt_callback_task_create_t) (
@@ -799,19 +859,21 @@ typedef struct ompt_record_implicit_task_t {
   int flags;
 } ompt_record_implicit_task_t;
 
-typedef void (*ompt_callback_master_t) (
+typedef void (*ompt_callback_masked_t) (
   ompt_scope_endpoint_t endpoint,
   ompt_data_t *parallel_data,
   ompt_data_t *task_data,
   const void *codeptr_ra
 );
 
-typedef struct ompt_record_master_t {
+typedef ompt_callback_masked_t ompt_callback_master_t DEPRECATED_51;
+
+typedef struct ompt_record_masked_t {
   ompt_scope_endpoint_t endpoint;
   ompt_id_t parallel_id;
   ompt_id_t task_id;
   const void *codeptr_ra;
-} ompt_record_master_t;
+} ompt_record_masked_t;
 
 typedef void (*ompt_callback_sync_region_t) (
   ompt_sync_region_t kind,
@@ -918,6 +980,20 @@ typedef void (*ompt_callback_device_unload_t) (
   uint64_t module_id
 );
 
+typedef void (*ompt_callback_target_data_op_emi_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *target_task_data,
+  ompt_data_t *target_data,
+  ompt_id_t *host_op_id,
+  ompt_target_data_op_t optype,
+  void *src_addr,
+  int src_device_num,
+  void *dest_addr,
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
+);
+
 typedef void (*ompt_callback_target_data_op_t) (
   ompt_scope_endpoint_t endpoint,
   ompt_id_t target_id,
@@ -943,6 +1019,16 @@ typedef struct ompt_record_target_data_op_t {
   const void *codeptr_ra;
 } ompt_record_target_data_op_t;
 
+typedef void (*ompt_callback_target_emi_t) (
+  ompt_target_t kind,
+  ompt_scope_endpoint_t endpoint,
+  int device_num,
+  ompt_data_t *task_data,
+  ompt_data_t *target_task_data,
+  ompt_data_t *target_data,
+  const void *codeptr_ra
+);
+
 typedef void (*ompt_callback_target_t) (
   ompt_target_t kind,
   ompt_scope_endpoint_t endpoint,
@@ -961,6 +1047,16 @@ typedef struct ompt_record_target_t {
   const void *codeptr_ra;
 } ompt_record_target_t;
 
+typedef void (*ompt_callback_target_map_emi_t) (
+  ompt_data_t *target_data,
+  unsigned int nitems,
+  void **host_addr,
+  void **device_addr,
+  size_t *bytes,
+  unsigned int *mapping_flags,
+  const void *codeptr_ra
+);
+
 typedef void (*ompt_callback_target_map_t) (
   ompt_id_t target_id,
   unsigned int nitems,
@@ -981,6 +1077,13 @@ typedef struct ompt_record_target_map_t {
   const void *codeptr_ra;
 } ompt_record_target_map_t;
 
+typedef void (*ompt_callback_target_submit_emi_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *target_data,
+  ompt_id_t *host_op_id,
+  unsigned int requested_num_teams
+);
+
 typedef void (*ompt_callback_target_submit_t) (
   ompt_scope_endpoint_t endpoint,
   ompt_id_t target_id,
@@ -1008,6 +1111,19 @@ typedef struct ompt_record_control_tool_t {
   const void *codeptr_ra;
 } ompt_record_control_tool_t;
 
+typedef void (*ompt_callback_error_t) (
+  ompt_severity_t severity,
+  const char *message, size_t length,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_error_t {
+  ompt_severity_t severity;
+  const char *message;
+  size_t length;
+  const void *codeptr_ra;
+} ompt_record_error_t;
+
 typedef struct ompd_address_t {
   ompd_seg_t segment;
   ompd_addr_t address;
@@ -1035,6 +1151,198 @@ typedef struct ompd_device_type_sizes_t {
   uint8_t sizeof_pointer;
 } ompd_device_type_sizes_t;
 
+void ompd_dll_locations_valid(void);
+
+typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)(ompd_size_t nbytes,
+                                                     void **ptr);
+
+typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)(void *ptr);
+
+typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)(
+    ompd_address_space_context_t *address_space_context, ompd_thread_id_t kind,
+    ompd_size_t sizeof_thread_id, const void *thread_id,
+    ompd_thread_context_t **thread_context);
+
+typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)(
+    ompd_address_space_context_t *address_space_context,
+    ompd_device_type_sizes_t *sizes);
+
+typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)(
+    ompd_address_space_context_t *address_space_context,
+    ompd_thread_context_t *thread_context, const char *symbol_name,
+    ompd_address_t *symbol_addr, const char *file_name);
+
+typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)(
+    ompd_address_space_context_t *address_space_context,
+    ompd_thread_context_t *thread_context, const ompd_address_t *addr,
+    ompd_size_t nbytes, void *buffer);
+
+typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)(
+    ompd_address_space_context_t *address_space_context,
+    ompd_thread_context_t *thread_context, const ompd_address_t *addr,
+    ompd_size_t nbytes, const void *buffer);
+
+typedef ompd_rc_t (*ompd_callback_device_host_fn_t)(
+    ompd_address_space_context_t *address_space_context, const void *input,
+    ompd_size_t unit_size, ompd_size_t count, void *output);
+
+typedef ompd_rc_t (*ompd_callback_print_string_fn_t)(const char *string,
+                                                     int category);
+
+typedef struct ompd_callbacks_t {
+  ompd_callback_memory_alloc_fn_t alloc_memory;
+  ompd_callback_memory_free_fn_t free_memory;
+  ompd_callback_print_string_fn_t print_string;
+  ompd_callback_sizeof_fn_t sizeof_type;
+  ompd_callback_symbol_addr_fn_t symbol_addr_lookup;
+  ompd_callback_memory_read_fn_t read_memory;
+  ompd_callback_memory_write_fn_t write_memory;
+  ompd_callback_memory_read_fn_t read_string;
+  ompd_callback_device_host_fn_t device_to_host;
+  ompd_callback_device_host_fn_t host_to_device;
+  ompd_callback_get_thread_context_for_thread_id_fn_t
+      get_thread_context_for_thread_id;
+} ompd_callbacks_t;
+
+void ompd_bp_parallel_begin(void);
+
+void ompd_bp_parallel_end(void);
+
+void ompd_bp_task_begin(void);
+
+void ompd_bp_task_end(void);
+
+void ompd_bp_thread_begin(void);
+
+void ompd_bp_thread_end(void);
+
+void ompd_bp_device_begin(void);
+
+void ompd_bp_device_end(void);
+
+ompd_rc_t ompd_initialize(ompd_word_t api_version,
+                          const ompd_callbacks_t *callbacks);
+
+ompd_rc_t ompd_get_api_version(ompd_word_t *version);
+
+ompd_rc_t ompd_get_version_string(const char **string);
+
+ompd_rc_t ompd_finalize(void);
+
+ompd_rc_t ompd_process_initialize(ompd_address_space_context_t *context,
+                                  ompd_address_space_handle_t **handle);
+
+ompd_rc_t ompd_device_initialize(ompd_address_space_handle_t *process_handle,
+                                 ompd_address_space_context_t *device_context,
+                                 ompd_device_t kind, ompd_size_t sizeof_id,
+                                 void *id,
+                                 ompd_address_space_handle_t **device_handle);
+
+ompd_rc_t ompd_rel_address_space_handle(ompd_address_space_handle_t *handle);
+
+ompd_rc_t ompd_get_omp_version(ompd_address_space_handle_t *address_space,
+                               ompd_word_t *omp_version);
+
+ompd_rc_t
+ompd_get_omp_version_string(ompd_address_space_handle_t *address_space,
+                            const char **string);
+
+ompd_rc_t ompd_get_thread_in_parallel(ompd_parallel_handle_t *parallel_handle,
+                                      int thread_num,
+                                      ompd_thread_handle_t **thread_handle);
+
+ompd_rc_t ompd_get_thread_handle(ompd_address_space_handle_t *handle,
+                                 ompd_thread_id_t kind,
+                                 ompd_size_t sizeof_thread_id,
+                                 const void *thread_id,
+                                 ompd_thread_handle_t **thread_handle);
+
+ompd_rc_t ompd_rel_thread_handle(ompd_thread_handle_t *thread_handle);
+
+ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1,
+                                     ompd_thread_handle_t *thread_handle_2,
+                                     int *cmp_value);
+
+ompd_rc_t ompd_get_thread_id(ompd_thread_handle_t *thread_handle,
+                             ompd_thread_id_t kind,
+                             ompd_size_t sizeof_thread_id, void *thread_id);
+
+ompd_rc_t
+ompd_get_curr_parallel_handle(ompd_thread_handle_t *thread_handle,
+                              ompd_parallel_handle_t **parallel_handle);
+
+ompd_rc_t ompd_get_enclosing_parallel_handle(
+    ompd_parallel_handle_t *parallel_handle,
+    ompd_parallel_handle_t **enclosing_parallel_handle);
+
+ompd_rc_t
+ompd_get_task_parallel_handle(ompd_task_handle_t *task_handle,
+                              ompd_parallel_handle_t **task_parallel_handle);
+
+ompd_rc_t ompd_rel_parallel_handle(ompd_parallel_handle_t *parallel_handle);
+
+ompd_rc_t
+ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1,
+                             ompd_parallel_handle_t *parallel_handle_2,
+                             int *cmp_value);
+
+ompd_rc_t ompd_get_curr_task_handle(ompd_thread_handle_t *thread_handle,
+                                    ompd_task_handle_t **task_handle);
+
+ompd_rc_t
+ompd_get_generating_task_handle(ompd_task_handle_t *task_handle,
+                                ompd_task_handle_t **generating_task_handle);
+
+ompd_rc_t
+ompd_get_scheduling_task_handle(ompd_task_handle_t *task_handle,
+                                ompd_task_handle_t **scheduling_task_handle);
+
+ompd_rc_t ompd_get_task_in_parallel(ompd_parallel_handle_t *parallel_handle,
+                                    int thread_num,
+                                    ompd_task_handle_t **task_handle);
+
+ompd_rc_t ompd_rel_task_handle(ompd_task_handle_t *task_handle);
+
+ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1,
+                                   ompd_task_handle_t *task_handle_2,
+                                   int *cmp_value);
+
+ompd_rc_t ompd_get_task_function(ompd_task_handle_t *task_handle,
+                                 ompd_address_t *entry_point);
+
+ompd_rc_t ompd_get_task_frame(ompd_task_handle_t *task_handle,
+                              ompd_frame_info_t *exit_frame,
+                              ompd_frame_info_t *enter_frame);
+
+ompd_rc_t
+ompd_enumerate_states(ompd_address_space_handle_t *address_space_handle,
+                      ompd_word_t current_state, ompd_word_t *next_state,
+                      const char **next_state_name, ompd_word_t *more_enums);
+
+ompd_rc_t ompd_get_state(ompd_thread_handle_t *thread_handle,
+                         ompd_word_t *state, ompt_wait_id_t *wait_id);
+
+ompd_rc_t
+ompd_get_display_control_vars(ompd_address_space_handle_t *address_space_handle,
+                              const char *const **control_vars);
+
+ompd_rc_t ompd_rel_display_control_vars(const char *const **control_vars);
+
+ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle,
+                              ompd_icv_id_t current, ompd_icv_id_t *next_id,
+                              const char **next_icv_name,
+                              ompd_scope_t *next_scope, int *more);
+
+ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope,
+                                  ompd_icv_id_t icv_id, ompd_word_t *icv_value);
+
+ompd_rc_t ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope,
+                                         ompd_icv_id_t icv_id,
+                                         const char **icv_string);
+
+ompd_rc_t ompd_get_tool_data(void *handle, ompd_scope_t scope,
+                             ompd_word_t *value, ompd_address_t *ptr);
+
 typedef struct ompt_record_ompt_t {
   ompt_callbacks_t type;
   ompt_device_time_t time;
@@ -1051,7 +1359,7 @@ typedef struct ompt_record_ompt_t {
     ompt_record_task_dependence_t task_dependence;
     ompt_record_task_schedule_t task_schedule;
     ompt_record_implicit_task_t implicit_task;
-    ompt_record_master_t master;
+    ompt_record_masked_t masked;
     ompt_record_sync_region_t sync_region;
     ompt_record_mutex_acquire_t mutex_acquire;
     ompt_record_mutex_t mutex;
diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index c09bfb0b7d..9463305437 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -47,8 +47,6 @@
 
 #include "ompt-device.h"
 
-#if HAVE_CUPTI_H
-
 /******************************************************************************
  * global include files
  *****************************************************************************/
@@ -72,12 +70,15 @@
 #include "ompt-placeholders.h"
 
 #include "gpu/gpu-op-placeholders.h"
+#include "gpu/gpu-application-thread-api.h"
 #include "gpu/gpu-correlation-channel.h"
 #include "gpu/gpu-correlation-channel-set.h"
+#include "gpu/gpu-metrics.h"
 #include "gpu/gpu-monitoring.h"
+#include "gpu/gpu-monitoring-thread-api.h"
+#include "gpu/gpu-trace.h"
 
-#include "gpu/nvidia/cupti-api.h"
-#include "sample-sources/nvidia.h"
+#include "gpu/ompt/ompt-gpu-api.h"
 
 
 
@@ -85,7 +86,7 @@
 // macros
 //*****************************************************************************
 
-#define OMPT_ACTIVITY_DEBUG 0
+#define OMPT_ACTIVITY_DEBUG 1
 
 #if OMPT_ACTIVITY_DEBUG
 #define PRINT(...) fprintf(stderr, __VA_ARGS__)
@@ -104,45 +105,58 @@
 #define FOREACH_OMPT_TARGET_FN(macro) \
   macro(ompt_get_device_time) \
   macro(ompt_translate_time) \
-  macro(ompt_set_trace_native) \
+  macro(ompt_set_trace_ompt) \
   macro(ompt_start_trace) \
   macro(ompt_pause_trace) \
   macro(ompt_stop_trace) \
+  macro(ompt_flush_trace) \
   macro(ompt_get_record_type) \
-  macro(ompt_get_record_native) \
+  macro(ompt_get_record_ompt) \
   macro(ompt_get_record_abstract) \
-  macro(ompt_advance_buffer_cursor) \
-  macro(ompt_set_pc_sampling) \
-  macro(ompt_set_external_subscriber)
-
+  macro(ompt_advance_buffer_cursor) 
 
+#define BUFFER_EMPTY(record, buffer, bytes) (((char *) record) >= (((char *)buffer) + bytes))
 
 //*****************************************************************************
-// types 
+// type declarations
 //*****************************************************************************
 
-OMPT_TARGET_API_FUNCTION(void, ompt_set_external_subscriber, 
-(
- int enable
-));
-
+typedef struct ompt_device_entry_t {
+  int device_id;
+  ompt_device_t *device;
+  struct ompt_device_entry_t *next;
+} ompt_device_entry_t;
 
-OMPT_TARGET_API_FUNCTION(void, ompt_set_pc_sampling, 
-(
- ompt_device_t *device,
- int enable,
- int pc_sampling_frequency
-));
 
 
 //*****************************************************************************
 // static variables
 //*****************************************************************************
 
-static bool ompt_pc_sampling_enabled = false;
+static device_finalizer_fn_entry_t device_finalizer_flush;
+static device_finalizer_fn_entry_t device_finalizer_trace;
+static device_finalizer_fn_entry_t device_finalizer_shutdown;
+
+static int ompt_shutdown_complete = 0;
 
-static device_finalizer_fn_entry_t device_finalizer;
+static ompt_device_entry_t *device_list = 0;
 
+static void
+device_list_insert
+(
+ int device_id,
+ ompt_device_t *device
+)
+{
+  // FIXME: replace with splay-uint64
+  ompt_device_entry_t *e = (ompt_device_entry_t *)
+    malloc(sizeof(ompt_device_entry_t));
+  e->device_id = device_id;
+  e->device = device;
+  e->next = device_list;
+  device_list = e;
+  PRINT("device_list_insert id=%d device=%p\n", device_id, device);
+}
 
 //------------------------------------------------
 // declare function pointers for target functions
@@ -179,7 +193,12 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint,
     // Enter a ompt runtime api
     PRINT("enter ompt runtime op %lu\n", host_op_id);
     ompt_runtime_api_flag = true;
-    cupti_correlation_id_push(host_op_id);
+
+    gpu_application_thread_process_activities();
+
+#if 0
+    ompt_correlation_id_push(host_op_id);
+#endif
 
     gpu_op_ccts_t gpu_op_ccts;
     memset(&gpu_op_ccts, 0, sizeof(gpu_op_ccts_t));
@@ -199,13 +218,16 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint,
 
     // Inform the worker about the placeholders
     uint64_t cpu_submit_time = hpcrun_nanotime();
+    PRINT("producing correlation %lu\n", host_op_id);
     gpu_correlation_channel_produce(host_op_id, &gpu_op_ccts, cpu_submit_time);
   } else {
     PRINT("exit ompt runtime op %lu\n", host_op_id);
     // Enter a runtime api
     ompt_runtime_api_flag = false;
+#if 0
     // Pop the id and make a notification
-    cupti_correlation_id_pop();
+    ompt_correlation_id_pop();
+#endif
     // Clear kernel status
     trace_node = NULL;
   }
@@ -218,7 +240,8 @@ void
 ompt_bind_names(ompt_function_lookup_t lookup)
 {
 #define ompt_bind_name(fn) \
-  fn = (fn ## _t ) lookup(#fn);
+  fn = (fn ## _t ) lookup(#fn); \
+  printf("look up function %s, got %p\n", #fn, fn);
 
   FOREACH_OMPT_TARGET_FN(ompt_bind_name)
 
@@ -228,8 +251,8 @@ ompt_bind_names(ompt_function_lookup_t lookup)
 
 #define BUFFER_SIZE (1024 * 1024 * 8)
 
-void 
-ompt_callback_buffer_request
+static void 
+ompt_buffer_request
 (
  int device_id,
  ompt_buffer_t **buffer,
@@ -242,100 +265,222 @@ ompt_callback_buffer_request
 }
 
 
-void 
-ompt_callback_buffer_complete
+static void 
+ompt_buffer_release
 (
- int device_id,
- ompt_buffer_t *buffer,
- size_t bytes,
- ompt_buffer_cursor_t begin,
- int buffer_owned
+ ompt_buffer_t *buffer
 )
 {
-  // handle notifications
-  gpu_correlation_channel_set_consume();
-
-  // signal advance to return pointer to first record
-  ompt_buffer_cursor_t next = begin;
-  int status = 0;
-  do {
-    // TODO(keren): replace cupti_activity_handle with device_activity handle
-    CUpti_Activity *activity = (CUpti_Activity *)next;
-    cupti_activity_process(activity);
-    status = cupti_buffer_cursor_advance(buffer, bytes, (CUpti_Activity **)&next);
-  } while(status);
+  free(buffer);
 }
 
 
-void
-ompt_pc_sampling_enable()
+static void
+ompt_dump
+(
+ ompt_record_ompt_t *r
+)
 {
-  ompt_pc_sampling_enabled = true;
+  if (r) { 
+    printf("r=%p type=%d time=%lu thread_id=%lu target_id=%lu\n",
+	   r, r->type, r->time, r->thread_id, r->target_id);
+
+    switch (r->type) {
+    case ompt_callback_target:
+      // case ompt_callback_target_emi:
+      {
+	ompt_record_target_t target_rec = r->record.target;
+	printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=%lu codeptr=%p\n",
+	       target_rec.kind, target_rec.endpoint, target_rec.device_num,
+	       target_rec.task_id, target_rec.target_id, target_rec.codeptr_ra);
+	break;
+      }
+    case ompt_callback_target_data_op:
+      // case ompt_callback_target_data_op_emi:
+      {
+	ompt_record_target_data_op_t target_data_op_rec =
+	  r->record.target_data_op;
+	printf("\tTarget data op: host_op_id=%lu optype=%d src_addr=%p "
+	       "src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+	       "end_time=%lu duration=%luus codeptr=%p\n",
+	       target_data_op_rec.host_op_id, target_data_op_rec.optype,
+	       target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+	       target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+	       target_data_op_rec.bytes, target_data_op_rec.end_time,
+	       target_data_op_rec.end_time - r->time,
+	       target_data_op_rec.codeptr_ra);
+	break;
+      }
+    case ompt_callback_target_submit:
+      // case ompt_callback_target_submit_emi:
+      {
+	ompt_record_target_kernel_t target_kernel_rec = r->record.target_kernel;
+	printf("\tTarget kernel: host_op_id=%lu requested_num_teams=%u "
+	       "granted_num_teams=%u end_time=%lu duration=%luus\n",
+	       target_kernel_rec.host_op_id,
+	       target_kernel_rec.requested_num_teams,
+	       target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+	       target_kernel_rec.end_time - r->time);
+	break;
+      }
+    default:
+      assert(0);
+      break;
+    }
+  }
 }
 
 
-void
-ompt_pc_sampling_disable()
+static ompt_device_t *
+ompt_get_device
+(
+ int device_id
+)
 {
-  ompt_pc_sampling_enabled = false;
+  ompt_device_entry_t *e = device_list;
+  while (e) {
+    if (e->device_id == device_id) return e->device;
+    e = e->next;
+  }
+  return 0;
 }
 
 
-void
-ompt_trace_configure(ompt_device_t *device)
+static void
+ompt_finalize_flush
+(
+ void *arg,
+ int how
+)
+{
+  PRINT("ompt_finalize_flush enter\n");
+  ompt_device_entry_t *e = device_list;
+  while (e) {
+    PRINT("ompt_finalize_flush flush id=%d device=%p\n",
+	  e->device_id, e->device);
+    ompt_flush_trace(e->device);
+    e = e->next;
+  }
+  PRINT("ompt_finalize_flush exit\n");
+}
+
+
+static void
+ompt_finalize_shutdown
+(
+ void *arg,
+ int how
+)
+{
+  PRINT("ompt_finalize_shutdown enter\n");
+
+  ompt_device_entry_t *e = device_list;
+  while (e) {
+    PRINT("ompt_finalize_flush flush id=%d device=%p\n",
+	  e->device_id, e->device);
+    ompt_stop_trace(e->device);
+    e = e->next;
+  }
+  ompt_shutdown_complete = 1;
+  gpu_application_thread_process_activities();
+  PRINT("ompt_finalize_shutdown exit\n");
+}
+
+
+static void
+ompt_finalize_trace
+(
+ void *arg,
+ int how
+)
 {
-  int flags = 0;
+  PRINT("ompt_finalize_trace enter\n");
+  gpu_trace_fini(arg, how);
+  PRINT("ompt_finalize_trace exit\n");
+}
 
-  // specify desired monitoring
-  flags |= ompt_native_driver;
 
-  flags |= ompt_native_runtime;
 
-  flags |= ompt_native_kernel_invocation;
+static void 
+ompt_buffer_complete
+(
+ int device_id,
+ ompt_buffer_t *buffer,
+ size_t bytes,
+ ompt_buffer_cursor_t begin,
+ int buffer_owned
+)
+{
+  PRINT("ompt_callback_buffer_complete enter device=%d\n", device_id);
+  if (ompt_shutdown_complete == 0) {
 
-  flags |= ompt_native_kernel_execution;
+    gpu_monitoring_thread_activities_ready();
 
-  flags |= ompt_native_data_motion_explicit;
+    ompt_device_t *device = ompt_get_device(device_id);
 
-  // indicate desired monitoring
-  ompt_set_trace_native(device, 1, flags);
-  
-  // set pc sampling after other traces
-  if (ompt_pc_sampling_enabled) {
-    int freq_bits = gpu_monitoring_instruction_sample_frequency_get();
-    ompt_set_pc_sampling(device, true, freq_bits);
+    // signal advance to return pointer to first record
+    ompt_buffer_cursor_t current = begin;
+    int status = 1;
+    while(status) {
+      // extract the next record from the buffer
+      ompt_record_ompt_t *record = ompt_get_record_ompt(buffer, current);
+
+      // process the record
+      ompt_activity_process(record);
+
+      // advance the cursor to the next record
+      // status will be 0 if there is no next record
+      status = ompt_advance_buffer_cursor(device, buffer, bytes, current,
+					  &current);
+      if (BUFFER_EMPTY(record, buffer, bytes)) break;
+    }
   }
 
+  if (buffer_owned) ompt_buffer_release(buffer);
+
+  PRINT("ompt_callback_buffer_complete exit device=%d\n", device_id);
+}
+
+
+void
+ompt_trace_configure(ompt_device_t *device)
+{
+  // indicate desired monitoring
+  // ompt_set_trace_ompt(device, 1, 0);
+  
   // turn on monitoring previously indicated
-  ompt_start_trace(device, ompt_callback_buffer_request, ompt_callback_buffer_complete);
+  ompt_start_trace(device, ompt_buffer_request,
+		   ompt_buffer_complete);
 }
 
 
 void
-ompt_device_initialize(uint64_t device_num,
+ompt_device_initialize(int device_num,
                        const char *type,
                        ompt_device_t *device,
                        ompt_function_lookup_t lookup,
                        const char *documentation)
 {
-  PRINT("ompt_device_initialize->%s, %" PRIu64 "\n", type, device_num);
+  PRINT("ompt_device_initialize->%s, %d\n", type, device_num);
 
   ompt_bind_names(lookup);
 
-  //ompt_trace_configure(device);
+  ompt_trace_configure(device);
 
+  device_list_insert(device_num, device);
   ompt_device_map_insert(device_num, device, type);
 }
 
 
 void 
-ompt_device_finalize(uint64_t device_num)
+ompt_device_finalize(int device_num)
 {
+  PRINT("ompt_device_finalize id=%d\n", device_num);
 }
 
 
 void 
-ompt_device_load(uint64_t device_num,
+ompt_device_load(int device_num,
                  const char *filename,
                  int64_t file_offset,
                  const void *file_addr,
@@ -344,13 +489,16 @@ ompt_device_load(uint64_t device_num,
                  const void *device_addr,
                  uint64_t module_id)
 {
-  PRINT("ompt_device_load->%s, %" PRIu64 "\n", filename, device_num);
+  PRINT("ompt_device_load->%s, %d\n", filename, device_num);
+
+#if 0 // FIXME
   cupti_load_callback_cuda(module_id, host_addr, bytes);
+#endif
 }
 
 
 void 
-ompt_device_unload(uint64_t device_num,
+ompt_device_unload(int device_num,
                    uint64_t module_id)
 {
   //cubin_id_map_delete(module_id);
@@ -374,7 +522,7 @@ ompt_target_callback
 (
   ompt_target_t kind,
   ompt_scope_endpoint_t endpoint,
-  uint64_t device_num,
+  int device_num,
   ompt_data_t *task_data,
   ompt_id_t target_id,
   const void *codeptr_ra
@@ -414,12 +562,14 @@ ompt_target_callback
   // the load module for the runtime library that supports offloading
   int lm = get_load_module(target_node); 
 
+#if 0
   // drop nodes on the call chain until we find one that is not in the load 
   // module for runtime library that supports offloading
   for (;;) { 
     target_node = hpcrun_cct_parent(target_node);
     if (get_load_module(target_node) != lm) break;
   }
+#endif
 
   hpcrun_safe_exit();
   td->overhead--;
@@ -446,6 +596,7 @@ ompt_data_op_callback
  const void *codeptr_ra
 )
 {
+  PRINT("ompt_data_op enter->target_id %" PRIu64 "\n", target_id);
   ompt_placeholder_t op = ompt_placeholders.ompt_tgt_none;
   switch (optype) {                       
 #define ompt_op_macro(op, ompt_op_type, ompt_op_class) \
@@ -461,6 +612,7 @@ ompt_data_op_callback
   }
 
   hpcrun_ompt_op_id_notify(endpoint, host_op_id, op.pc_norm);
+  PRINT("ompt_data_op exit->target_id %" PRIu64 "\n", target_id);
 }
 
 
@@ -509,14 +661,25 @@ ompt_trace_node_get
   return trace_node;
 }
 
-
 void
-prepare_device()
+prepare_device
+(
+ void
+)
 {
   PRINT("ompt_initialize->prepare_device enter\n");
 
-  device_finalizer.fn = cupti_device_flush;
-  device_finalizer_register(device_finalizer_type_flush, &device_finalizer);
+  device_finalizer_flush.fn = ompt_finalize_flush;
+  device_finalizer_register(device_finalizer_type_flush,
+			    &device_finalizer_flush);
+
+  device_finalizer_shutdown.fn = ompt_finalize_shutdown;
+  device_finalizer_register(device_finalizer_type_shutdown,
+			    &device_finalizer_shutdown);
+
+  device_finalizer_trace.fn = ompt_finalize_trace;
+  device_finalizer_register(device_finalizer_type_shutdown,
+			    &device_finalizer_trace);
 
   ompt_set_callback(ompt_callback_device_initialize, ompt_device_initialize);
   ompt_set_callback(ompt_callback_device_finalize, ompt_device_finalize);
@@ -529,5 +692,3 @@ prepare_device()
 
   PRINT("ompt_initialize->prepare_device exit\n");
 }
-
-#endif
diff --git a/src/tool/hpcrun/ompt/ompt-device.h b/src/tool/hpcrun/ompt/ompt-device.h
index 49603b33e5..35bbe93f5a 100644
--- a/src/tool/hpcrun/ompt/ompt-device.h
+++ b/src/tool/hpcrun/ompt/ompt-device.h
@@ -51,8 +51,6 @@
 #include <include/hpctoolkit-config.h>
 #include <cct/cct.h>
 
-#if HAVE_CUPTI_H 
-
 void 
 prepare_device
 (
@@ -112,12 +110,4 @@ ompt_external_subscriber_disable
  void
 );
 
-#else
-
-// no op without a CUDA device
-#define prepare_device()
-
-#endif
-
-
 #endif // _OMPT_INTERFACE_H_
diff --git a/src/tool/hpcrun/ompt/ompt-interface.c b/src/tool/hpcrun/ompt/ompt-interface.c
index 2aa707197c..3f0c4bf40c 100644
--- a/src/tool/hpcrun/ompt/ompt-interface.c
+++ b/src/tool/hpcrun/ompt/ompt-interface.c
@@ -70,6 +70,8 @@
 #include <hpcrun/sample-sources/sample-filters.h>
 #include <hpcrun/thread_data.h>
 
+#include <monitor.h>
+
 #include "ompt-callstack.h"
 #include "ompt-defer.h"
 #include "ompt-interface.h"
@@ -89,7 +91,7 @@
 #define ompt_event_may_occur(r) \
   ((r ==  ompt_set_sometimes) | (r ==  ompt_set_always))
 
-#define OMPT_DEBUG_STARTUP 0
+#define OMPT_DEBUG_STARTUP 1
 #define OMPT_DEBUG_TASK 0
 
 
@@ -668,10 +670,13 @@ ompt_start_tool
  const char *runtime_version
 )
 {
+  // force hpctoolkit initialization
+  monitor_initialize();
+  // post-condition: hpctoolkit is initialized
 
- if (getenv("OMPT_DEBUG_WAIT")) {
+  if (getenv("OMPT_DEBUG_WAIT")) {
     while (ompt_debug_wait);
- }
+  }
  
 #if OMPT_DEBUG_STARTUP
   printf("Starting tool...\n");
diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index 4b7cf7acd0..dea2fb16c4 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -96,6 +96,8 @@ SAMPLE_SOURCE_DECL_MACRO(nvidia_gpu)
 SAMPLE_SOURCE_DECL_MACRO(amd_gpu)
 #endif
 
+SAMPLE_SOURCE_DECL_MACRO(openmp_gpu)
+
 #ifdef HPCRUN_SS_LEVEL0
 SAMPLE_SOURCE_DECL_MACRO(level0)
 #endif

From bd34914f4dd9ffea49c5d843c0ae44f72d6177eb Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 31 Oct 2021 13:38:52 -0500
Subject: [PATCH 140/177] draft of ompt emi interface implementation

---
 src/tool/hpcrun/ompt/ompt-device.c | 97 +++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 36 deletions(-)

diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index 9463305437..1463b5ca37 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -45,7 +45,6 @@
 // ******************************************************* EndRiceCopyright *
 
 
-#include "ompt-device.h"
 
 /******************************************************************************
  * global include files
@@ -68,11 +67,13 @@
 #include "ompt-interface.h"
 #include "ompt-device-map.h"
 #include "ompt-placeholders.h"
+#include "ompt-device.h"
 
 #include "gpu/gpu-op-placeholders.h"
 #include "gpu/gpu-application-thread-api.h"
 #include "gpu/gpu-correlation-channel.h"
 #include "gpu/gpu-correlation-channel-set.h"
+#include "gpu/gpu-correlation-id.h"
 #include "gpu/gpu-metrics.h"
 #include "gpu/gpu-monitoring.h"
 #include "gpu/gpu-monitoring-thread-api.h"
@@ -282,7 +283,7 @@ ompt_dump
 )
 {
   if (r) { 
-    printf("r=%p type=%d time=%lu thread_id=%lu target_id=%lu\n",
+    printf("r=%p type=%d time=%lu thread_id=%lu target_id=0x%lx\n",
 	   r, r->type, r->time, r->thread_id, r->target_id);
 
     switch (r->type) {
@@ -290,7 +291,7 @@ ompt_dump
       // case ompt_callback_target_emi:
       {
 	ompt_record_target_t target_rec = r->record.target;
-	printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=%lu codeptr=%p\n",
+	printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=0x%lx codeptr=%p\n",
 	       target_rec.kind, target_rec.endpoint, target_rec.device_num,
 	       target_rec.task_id, target_rec.target_id, target_rec.codeptr_ra);
 	break;
@@ -432,7 +433,10 @@ ompt_buffer_complete
       // status will be 0 if there is no next record
       status = ompt_advance_buffer_cursor(device, buffer, bytes, current,
 					  &current);
+#if 0
+      // obsolete with changes from AMD
       if (BUFFER_EMPTY(record, buffer, bytes)) break;
+#endif
     }
   }
 
@@ -446,7 +450,7 @@ void
 ompt_trace_configure(ompt_device_t *device)
 {
   // indicate desired monitoring
-  // ompt_set_trace_ompt(device, 1, 0);
+  ompt_set_trace_ompt(device, 1, 0);
   
   // turn on monitoring previously indicated
   ompt_start_trace(device, ompt_buffer_request,
@@ -518,23 +522,25 @@ get_load_module
 
 
 void 
-ompt_target_callback
+ompt_target_callback_emi
 (
   ompt_target_t kind,
   ompt_scope_endpoint_t endpoint,
   int device_num,
   ompt_data_t *task_data,
-  ompt_id_t target_id,
+  ompt_data_t *target_task_data,
+  ompt_data_t *target_data,
   const void *codeptr_ra
 )
 {
-  PRINT("ompt_target_callback->target_id %" PRIu64 "\n", target_id);
-
   if (endpoint == ompt_scope_end) {
     target_node = NULL;
     return;
   }
 
+  uint64_t target_id = target_data->value = gpu_correlation_id();
+  PRINT("ompt_target_callback->target_id 0x%lx\n", target_id);
+
   // XXX(Keren): Do not use openmp callbacks to consume and produce records
   // HPCToolkit always subscribes its own cupti callback
   //
@@ -582,21 +588,27 @@ ompt_target_callback
   macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout) 
 
 void
-ompt_data_op_callback
+ompt_data_op_callback_emi
 (
- ompt_scope_endpoint_t endpoint,
- ompt_id_t target_id,
- ompt_id_t host_op_id,
- ompt_target_data_op_t optype,
- void *src_addr,
- int src_device_num,
- void *dest_addr,
- int dest_device_num,
- size_t bytes,
- const void *codeptr_ra
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *target_task_data,
+  ompt_data_t *target_data,
+  ompt_id_t *host_op_id,
+  ompt_target_data_op_t optype,
+  void *src_addr,
+  int src_device_num,
+  void *dest_addr,
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
 )
 {
-  PRINT("ompt_data_op enter->target_id %" PRIu64 "\n", target_id);
+  if (endpoint == ompt_scope_end) return;
+
+  uint64_t target_id = target_data->value;
+  uint64_t op_id = *host_op_id = gpu_correlation_id(); 
+
+  PRINT("ompt_data_op enter->target_id 0x%lx\n", target_id);
   ompt_placeholder_t op = ompt_placeholders.ompt_tgt_none;
   switch (optype) {                       
 #define ompt_op_macro(op, ompt_op_type, ompt_op_class) \
@@ -611,23 +623,28 @@ ompt_data_op_callback
       break;
   }
 
-  hpcrun_ompt_op_id_notify(endpoint, host_op_id, op.pc_norm);
-  PRINT("ompt_data_op exit->target_id %" PRIu64 "\n", target_id);
+  hpcrun_ompt_op_id_notify(endpoint, op_id, op.pc_norm);
+  PRINT("ompt_data_op exit->target_id 0x%lx\n", target_id);
 }
 
 
 void
-ompt_submit_callback
+ompt_submit_callback_emi
 (
  ompt_scope_endpoint_t endpoint,
- ompt_id_t target_id,
- ompt_id_t host_op_id,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
  unsigned int requested_num_teams
 )
 {
-  PRINT("ompt_submit_callback enter->target_id %" PRIu64 "\n", target_id);
-  hpcrun_ompt_op_id_notify(endpoint, host_op_id, ompt_placeholders.ompt_tgt_kernel.pc_norm);
-  PRINT("ompt_submit_callback exit->target_id %" PRIu64 "\n", target_id);
+  uint64_t target_id = target_data->value;
+  PRINT("ompt_submit_callback enter->target_id 0x%lx\n", target_id);
+  if (endpoint == ompt_scope_begin) {
+    *host_op_id = gpu_correlation_id();
+    hpcrun_ompt_op_id_notify(endpoint, *host_op_id,
+      ompt_placeholders.ompt_tgt_kernel.pc_norm);
+  }
+  PRINT("ompt_submit_callback exit->target_id 0x%lx\n", target_id);
 }
 
 
@@ -681,14 +698,22 @@ prepare_device
   device_finalizer_register(device_finalizer_type_shutdown,
 			    &device_finalizer_trace);
 
-  ompt_set_callback(ompt_callback_device_initialize, ompt_device_initialize);
-  ompt_set_callback(ompt_callback_device_finalize, ompt_device_finalize);
-  ompt_set_callback(ompt_callback_device_load, ompt_device_load);
-  ompt_set_callback(ompt_callback_device_unload, ompt_device_unload);
-  ompt_set_callback(ompt_callback_target, ompt_target_callback);
-  ompt_set_callback(ompt_callback_target_data_op, ompt_data_op_callback);
-  ompt_set_callback(ompt_callback_target_submit, ompt_submit_callback);
-  ompt_set_callback(ompt_callback_target_map, ompt_map_callback);
+  ompt_set_callback
+    (ompt_callback_device_initialize, ompt_device_initialize);
+  ompt_set_callback
+    (ompt_callback_device_finalize, ompt_device_finalize);
+  ompt_set_callback
+    (ompt_callback_device_load, ompt_device_load);
+  ompt_set_callback
+    (ompt_callback_device_unload, ompt_device_unload);
+  ompt_set_callback
+    (ompt_callback_target_emi, ompt_target_callback_emi);
+  ompt_set_callback
+    (ompt_callback_target_data_op_emi, ompt_data_op_callback_emi);
+  ompt_set_callback
+    (ompt_callback_target_submit_emi, ompt_submit_callback_emi);
+  ompt_set_callback
+    (ompt_callback_target_map, ompt_map_callback);
 
   PRINT("ompt_initialize->prepare_device exit\n");
 }

From 1766f98dcca5ddd39faa066b93bec06b2e8e3962 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 31 Oct 2021 16:45:49 -0500
Subject: [PATCH 141/177] add new files for ompt activity processing

---
 .../hpcrun/gpu/ompt/ompt-activity-translate.c | 367 ++++++++++++++++++
 .../hpcrun/gpu/ompt/ompt-activity-translate.h |  79 ++++
 src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c       |  86 ++++
 src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h       |  76 ++++
 4 files changed, 608 insertions(+)
 create mode 100644 src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
 create mode 100644 src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h
 create mode 100644 src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c
 create mode 100644 src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h

diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
new file mode 100644
index 0000000000..3d65f00cf0
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
@@ -0,0 +1,367 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//******************************************************************************
+// Description:
+//   Read fields from an ompt_record_ompt_t and assign them to a
+//   GPU-independent gpu_activity_t.
+//
+//   This interface is only used by the GPU monitoring thread.
+//   It is thread-safe as long as it does not access data structures
+//   shared by worker threads.
+//******************************************************************************
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/cct/cct.h>
+#include <hpcrun/cct/cct_addr.h>
+#include <hpcrun/utilities/ip-normalized.h>
+#include <hpcrun/gpu/gpu-activity.h>
+#include <hpcrun/gpu/gpu-correlation-id-map.h>
+#include <hpcrun/gpu/gpu-function-id-map.h>
+#include <hpcrun/gpu/gpu-host-correlation-map.h>
+
+
+#include "ompt-activity-translate.h"
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static void
+convert_unknown
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ga->kind = GPU_ACTIVITY_UNKNOWN; // r is not inspected: always unknown
+  *cid_ptr = 0;                    // 0 is reported as the correlation id
+}
+
+
+static void
+convert_ptrop
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ga->kind = GPU_ACTIVITY_UNKNOWN; // no dedicated kind for pointer ops
+  *cid_ptr = 0;                    // 0 is reported as the correlation id
+}
+
+
+static void
+convert_target
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  // target region records are not translated: r is not inspected and an
+  // unknown activity with correlation id 0 is reported
+  ga->kind = GPU_ACTIVITY_UNKNOWN;
+  *cid_ptr = 0;
+
+#if 0
+  printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=%lu codeptr=%p\n",
+	 target_rec.kind, target_rec.endpoint, target_rec.device_num,
+	 target_rec.task_id, target_rec.target_id, 
+#endif
+}
+
+
+static void
+convert_memory
+(
+  gpu_activity_t *ga,
+  ompt_record_ompt_t *r,
+  uint64_t *cid_ptr
+)
+{
+  // a memory activity carries the data op's host op id and byte count;
+  // the host op id is also reported to the caller as the correlation id
+  ompt_record_target_data_op_t *data_op = &r->record.target_data_op;
+  *cid_ptr = data_op->host_op_id;
+  ga->kind = GPU_ACTIVITY_MEMORY;
+  ga->details.memory.correlation_id = data_op->host_op_id;
+  ga->details.memory.bytes = data_op->bytes;
+  ga->details.memory.memKind = GPU_MEM_UNKNOWN;
+}
+
+
+static void
+convert_alloc
+(
+  gpu_activity_t *ga,
+  ompt_record_ompt_t *r,
+  uint64_t *cid_ptr
+)
+{
+  convert_memory(ga,r, cid_ptr); // allocations become memory activities
+}
+
+
+static void
+convert_delete
+(
+  gpu_activity_t *ga,
+  ompt_record_ompt_t *r,
+  uint64_t *cid_ptr
+)
+{
+  convert_memory(ga,r, cid_ptr); // deletions become memory activities
+}
+
+
+static gpu_memcpy_type_t
+convert_memcpy_type
+(
+ ompt_target_data_op_t kind
+)
+{
+  // map the direction of an OMPT data transfer (synchronous or
+  // asynchronous) to the corresponding GPU-independent copy kind
+  gpu_memcpy_type_t copy_kind = GPU_MEMCPY_UNK;
+
+  if (kind == ompt_target_data_transfer_to_device ||
+      kind == ompt_target_data_transfer_to_device_async) {
+    copy_kind = GPU_MEMCPY_H2D;
+  } else if (kind == ompt_target_data_transfer_from_device ||
+             kind == ompt_target_data_transfer_from_device_async) {
+    copy_kind = GPU_MEMCPY_D2H;
+  }
+  return copy_kind;
+}
+
+
+static void
+convert_memcpy
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ompt_record_target_data_op_t *d = &r->record.target_data_op;
+
+# if 0
+  TMSG(OMPT_ACTIVITY, "Memcpy copy CorrelationId %u", r->correlationId);
+  TMSG(OMPT_ACTIVITY, "Memcpy copy kind %u", d->optype);
+  TMSG(OMPT_ACTIVITY, "Memcpy copy bytes %lu", d->bytes);
+
+  
+  ga->details.memcpy.context_id = r->contextId;
+  ga->details.memcpy.stream_id = r->streamId;
+#endif
+  // a data transfer record becomes a GPU memcpy activity
+  ga->kind = GPU_ACTIVITY_MEMCPY;
+  // the host op id is stored in the activity and reported as correlation id
+  ga->details.memcpy.correlation_id = d->host_op_id;
+  *cid_ptr = d->host_op_id;
+  // size of the transfer and its direction (see convert_memcpy_type)
+  ga->details.memcpy.bytes = d->bytes;
+  ga->details.memcpy.copyKind = convert_memcpy_type(d->optype);
+}
+
+
+static void
+convert_target_data_op
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ompt_record_target_data_op_t *d = &r->record.target_data_op;
+
+  // dispatch on the kind of data operation; the asynchronous variant of
+  // an operation is translated exactly like its synchronous counterpart
+  switch(d->optype) {
+  case ompt_target_data_transfer_to_device_async:
+  case ompt_target_data_transfer_to_device:
+  case ompt_target_data_transfer_from_device_async:
+  case ompt_target_data_transfer_from_device:
+    convert_memcpy(ga, r, cid_ptr);
+    break;
+
+  case ompt_target_data_alloc_async:
+  case ompt_target_data_alloc:
+    convert_alloc(ga, r, cid_ptr);
+    break;
+
+  case ompt_target_data_delete_async:
+  case ompt_target_data_delete:
+    convert_delete(ga, r, cid_ptr);
+    break;
+
+  case ompt_target_data_associate:
+  case ompt_target_data_disassociate:
+    convert_ptrop(ga, r, cid_ptr);
+    break;
+
+  default:
+    convert_unknown(ga, r, cid_ptr);
+    break;
+  }
+#if 0
+  ( r->thread_id, r->target_id);
+  printf("\tTarget data op: host_op_id=%lu optype=%d src_addr=%p "
+	 "src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+	 "end_time=%lu duration=%luus codeptr=%p\n",
+	 d->host_op_id, d->optype,
+	 d->src_addr, d->src_device_num,
+	 d->dest_addr, d->dest_device_num,
+#endif
+  // all kinds: the interval runs from the record's time to the op's end time
+  gpu_interval_set(&ga->details.interval, r->time, d->end_time);
+}
+
+
+void
+convert_target_submit
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ompt_record_target_kernel_t *k = &r->record.target_kernel;
+  // kernel activity; the host op id serves as its correlation id
+  ga->kind = GPU_ACTIVITY_KERNEL;
+  ga->details.kernel.correlation_id = k->host_op_id;
+  *cid_ptr = k->host_op_id;
+  // NOTE(review): the disabled code below relies on cupti_occupancy_analyze
+#if 0
+  ( r->thread_id, r->target_id);
+  printf("\tTarget kernel: host_op_id=%lu requested_num_teams=%u "
+	 "granted_num_teams=%u end_time=%lu duration=%luus\n",
+	 target_kernel_rec.host_op_id,
+	 target_kernel_rec.requested_num_teams,
+	 target_kernel_rec.granted_num_teams,
+
+  ga->details.kernel.dynamicSharedMemory = r->dynamicSharedMemory;
+  ga->details.kernel.staticSharedMemory = r->staticSharedMemory;
+  ga->details.kernel.localMemoryTotal = r->localMemoryTotal;
+  ga->details.kernel.device_id = r->deviceId;
+  ga->details.kernel.context_id = r->contextId;
+  ga->details.kernel.stream_id = r->streamId;
+  ga->details.kernel.blocks = r->blockX * r->blockY * r->blockZ;
+
+
+  uint32_t activeWarpsPerSM = 0;
+  uint32_t maxActiveWarpsPerSM = 0;
+  uint32_t threadRegisters = 0;
+  uint32_t blockThreads = 0;
+  uint32_t blockSharedMemory = 0;
+  cupti_occupancy_analyze(r, &activeWarpsPerSM, &maxActiveWarpsPerSM,
+			  &threadRegisters, &blockThreads, &blockSharedMemory);
+
+  ga->details.kernel.activeWarpsPerSM = activeWarpsPerSM;
+  ga->details.kernel.maxActiveWarpsPerSM = maxActiveWarpsPerSM;
+  ga->details.kernel.threadRegisters = threadRegisters;
+  ga->details.kernel.blockThreads = blockThreads;
+  ga->details.kernel.blockSharedMemory = blockSharedMemory;
+#endif
+  // the activity spans from the record's time to the kernel's end time
+  gpu_interval_set(&ga->details.interval, r->time, k->end_time); 
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+ompt_activity_translate
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  // zero the whole activity before a converter fills in its fields
+  memset(ga, 0, sizeof *ga);
+
+  // the converter is chosen by the type of callback that produced the
+  // record; the emi and non-emi flavors of a callback are handled alike
+  switch (r->type) {
+  case ompt_callback_target_emi:
+  case ompt_callback_target:
+    convert_target(ga, r, cid_ptr);
+    break;
+
+  case ompt_callback_target_data_op_emi:
+  case ompt_callback_target_data_op:
+    convert_target_data_op(ga, r, cid_ptr);
+    break;
+
+  case ompt_callback_target_submit_emi:
+  case ompt_callback_target_submit:
+    convert_target_submit(ga, r, cid_ptr);
+    break;
+
+  default:
+    convert_unknown(ga, r, cid_ptr);
+    break;
+  }
+
+  // clear the activity's next link
+  cstack_ptr_set(&(ga->next), 0);
+}
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h
new file mode 100644
index 0000000000..30dedb5c01
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h
@@ -0,0 +1,79 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef ompt_activity_translate_h
+#define ompt_activity_translate_h
+
+
+//******************************************************************************
+// OpenMP includes
+//******************************************************************************
+
+#include <omp-tools.h>
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef struct gpu_activity_t gpu_activity_t;
+typedef struct cct_node_t cct_node_t;
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+ompt_activity_translate
+(
+ gpu_activity_t *entry,       // out: zeroed, then filled in from record
+ ompt_record_ompt_t *record,  // in: OMPT trace record to translate
+ uint64_t *cid_ptr            // out: correlation id of the activity, or 0
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c
new file mode 100644
index 0000000000..e1dae062ec
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c
@@ -0,0 +1,86 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include <hpcrun/gpu/gpu-monitoring-thread-api.h>
+
+#include <hpcrun/gpu/gpu-activity.h>
+#include <hpcrun/gpu/gpu-activity-process.h>
+#include <hpcrun/gpu/gpu-correlation-id-map.h>
+
+#include "ompt-gpu-api.h"
+#include "ompt-activity-translate.h"
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+ompt_buffer_completion_notify
+(
+ void
+)
+{
+  gpu_monitoring_thread_activities_ready(); // defer to gpu-monitoring-thread-api
+}
+
+
+void
+ompt_activity_process
+(
+ ompt_record_ompt_t *record
+)
+{
+  gpu_activity_t gpu_activity; // GPU-independent form of record
+  uint64_t correlation_id;     // set by ompt_activity_translate
+  ompt_activity_translate(&gpu_activity, record, &correlation_id);
+  if (gpu_correlation_id_map_lookup(correlation_id) == NULL) {
+    gpu_correlation_id_map_insert(correlation_id, correlation_id);
+  } // an id without a map entry is mapped to itself
+  gpu_activity_process(&gpu_activity);
+}
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h
new file mode 100644
index 0000000000..cca8cee7af
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h
@@ -0,0 +1,76 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef ompt_gpu_api_h
+#define ompt_gpu_api_h
+
+
+
+//******************************************************************************
+// OpenMP includes
+//******************************************************************************
+
+#include <omp-tools.h>
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void
+ompt_buffer_completion_notify
+(
+ void
+);
+
+
+void
+ompt_activity_process
+(
+ ompt_record_ompt_t *record  // in: OMPT trace record to translate and process
+);
+
+
+
+#endif

From 2f7623856e0635c5f6a81f1648a7e14778c918a4 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 31 Oct 2021 17:11:42 -0500
Subject: [PATCH 142/177] convert gpu activity correlation_id to 64 bits.

---
 src/tool/hpcrun/gpu/gpu-activity-process.c   | 70 ++++++++++++--------
 src/tool/hpcrun/gpu/gpu-activity.h           | 28 ++++----
 src/tool/hpcrun/gpu/gpu-correlation-id-map.c | 14 ++--
 src/tool/hpcrun/gpu/gpu-correlation-id-map.h | 12 ++--
 4 files changed, 70 insertions(+), 54 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index b499bc43cf..f92f7faebb 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -140,6 +140,11 @@ attribute_activity
   gpu_activity_channel_t *channel =
     gpu_host_correlation_map_entry_channel_get(hc);
   activity->cct_node = cct_node;
+
+  PRINT("attributing activity to %p time = [%lu,%lu)\n",
+	cct_node, activity->details.interval.start,
+	activity->details.interval.end);
+
   gpu_activity_channel_produce(channel, activity);
 }
 
@@ -150,7 +155,7 @@ gpu_memcpy_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.memcpy.correlation_id;
+  uint64_t correlation_id = activity->details.memcpy.correlation_id;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
   if (cid_map_entry != NULL) {
@@ -196,7 +201,7 @@ gpu_memcpy_process
   } else {
     PRINT("Memcpy copy correlation_id %u cannot be found\n", correlation_id);
   }
-  PRINT("Memcpy copy CorrelationId %u\n", correlation_id);
+  PRINT("Memcpy copy correlation_id 0x%lx\n", correlation_id);
   PRINT("Memcpy copy kind %u\n", activity->details.memcpy.copyKind);
   PRINT("Memcpy copy bytes %lu\n", activity->details.memcpy.bytes);
 }
@@ -217,7 +222,7 @@ gpu_sample_process
  gpu_activity_t* sample
 )
 {
-  uint32_t correlation_id = sample->details.pc_sampling.correlation_id;
+  uint64_t correlation_id = sample->details.pc_sampling.correlation_id;
 
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
@@ -254,7 +259,7 @@ gpu_sample_process
       PRINT("host_map_entry %lu not found\n", external_id);
     }
   } else {
-    PRINT("correlation_id_map_entry %u not found\n", correlation_id);
+    PRINT("correlation_id_map_entry %lu not found\n", correlation_id);
   }
 }
 
@@ -265,7 +270,7 @@ gpu_sampling_info_process
  gpu_activity_t *sri
 )
 {
-  uint32_t correlation_id = sri->details.pc_sampling_info.correlation_id;
+  uint64_t correlation_id = sri->details.pc_sampling_info.correlation_id;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
   if (cid_map_entry != NULL) {
@@ -305,7 +310,8 @@ gpu_correlation_process
   if (gpu_correlation_id_map_lookup(gpu_correlation_id) == NULL) {
     gpu_correlation_id_map_insert(gpu_correlation_id, host_correlation_id);
   } else {
-    gpu_correlation_id_map_external_id_replace(gpu_correlation_id, host_correlation_id);
+    gpu_correlation_id_map_external_id_replace(gpu_correlation_id,
+					       host_correlation_id);
   }
   PRINT("Correlation: native_correlation %u --> host_correlation %lu\n", 
       gpu_correlation_id, host_correlation_id);
@@ -318,7 +324,7 @@ gpu_memset_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.memset.correlation_id;
+  uint64_t correlation_id = activity->details.memset.correlation_id;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
   if (cid_map_entry != NULL) {
@@ -344,7 +350,7 @@ gpu_memset_process
     }
     gpu_correlation_id_map_delete(correlation_id);
   }
-  PRINT("Memset CorrelationId %u\n", correlation_id);
+  PRINT("Memset correlation_id 0x%lx\n", correlation_id);
   PRINT("Memset kind %u\n", activity->details.memset.memKind);
   PRINT("Memset bytes %lu\n", activity->details.memset.bytes);
 }
@@ -356,7 +362,8 @@ gpu_function_process
  gpu_activity_t *activity
 )
 {
-  gpu_function_id_map_insert(activity->details.function.function_id, activity->details.function.pc);
+  gpu_function_id_map_insert(activity->details.function.function_id,
+			     activity->details.function.pc);
   PRINT("Function id %u\n", activity->details.function.function_id);
 }
 
@@ -367,7 +374,7 @@ gpu_kernel_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.kernel.correlation_id;
+  uint64_t correlation_id = activity->details.kernel.correlation_id;
 
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
@@ -417,11 +424,12 @@ gpu_kernel_process
       attribute_activity(host_op_entry, activity, kernel_node);
     }
   } else {
-    PRINT("Kernel execution correlation_id %u cannot be found\n", correlation_id);
+    PRINT("Kernel execution correlation_id 0x%lx cannot be found\n",
+	  correlation_id);
   }
 
   PRINT("Kernel execution deviceId %u\n", activity->details.kernel.device_id);
-  PRINT("Kernel execution CorrelationId %u\n", correlation_id);
+  PRINT("Kernel execution correlation_id 0x%lx\n", correlation_id);
 }
 
 
@@ -443,8 +451,9 @@ gpu_kernel_block_process
     cct_node_t *host_op_node =
       gpu_host_correlation_map_entry_op_function_get(host_op_entry);
 
-    // create a child cct node that contains 2 metrics: offset of block head wrt. original binary, dynamic execution count of block
-    cct_node_t *cct_child = hpcrun_cct_insert_ip_norm(host_op_node, ip); // how to set the ip_norm
+    // create a child cct node that contains 2 metrics: offset of block head
+    // wrt. original binary, dynamic execution count of block
+    cct_node_t *cct_child = hpcrun_cct_insert_ip_norm(host_op_node, ip);
     if (cct_child) {
       PRINT("cct_child %p\n", cct_child);
       attribute_activity(host_op_entry, activity, cct_child);
@@ -461,7 +470,7 @@ gpu_synchronization_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.synchronization.correlation_id;
+  uint64_t correlation_id = activity->details.synchronization.correlation_id;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
   if (cid_map_entry != NULL) {
@@ -500,18 +509,23 @@ gpu_synchronization_process
           case GPU_SYNC_EVENT:
             {
               // Find the corresponding stream that records the event
-              gpu_event_id_map_entry_t *event_id_entry = gpu_event_id_map_lookup(event_id);
+              gpu_event_id_map_entry_t *event_id_entry =
+		gpu_event_id_map_lookup(event_id);
               if (event_id_entry != NULL) {
-                context_id = gpu_event_id_map_entry_context_id_get(event_id_entry);
-                stream_id = gpu_event_id_map_entry_stream_id_get(event_id_entry);
-                PRINT("Add context %u stream %u event %u sync\n", context_id, stream_id, event_id);
+                context_id =
+		  gpu_event_id_map_entry_context_id_get(event_id_entry);
+                stream_id =
+		  gpu_event_id_map_entry_stream_id_get(event_id_entry);
+                PRINT("Add context %u stream %u event %u sync\n", context_id,
+		      stream_id, event_id);
                 gpu_context_stream_trace(context_id, stream_id, &entry_trace);
               }
               break;
             }
           default:
             // invalid
-            PRINT("Invalid synchronization %u\n", correlation_id);
+            PRINT("Synchronization correlation_id 0x%lx cannot be found\n",
+		  correlation_id);
         }
       }
       // TODO(Keren): handle event synchronization
@@ -520,7 +534,7 @@ gpu_synchronization_process
     }
     gpu_correlation_id_map_delete(correlation_id);
   }
-  PRINT("Synchronization CorrelationId %u\n", correlation_id);
+  PRINT("Synchronization correlation_id 0x%lx\n", correlation_id);
 }
 
 
@@ -530,7 +544,7 @@ gpu_cdpkernel_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.cdpkernel.correlation_id;
+  uint64_t correlation_id = activity->details.cdpkernel.correlation_id;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
   if (cid_map_entry != NULL) {
@@ -557,7 +571,7 @@ gpu_cdpkernel_process
     }
     gpu_correlation_id_map_delete(correlation_id);
   }
-  PRINT("Cdp Kernel CorrelationId %u\n", correlation_id);
+  PRINT("Cdp Kernel correlation_id 0x%lx\n", correlation_id);
 }
 
 
@@ -582,7 +596,7 @@ gpu_memory_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.memory.correlation_id;
+  uint64_t correlation_id = activity->details.memory.correlation_id;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
   if (cid_map_entry != NULL) {
@@ -601,9 +615,9 @@ gpu_memory_process
     }
     gpu_correlation_id_map_delete(correlation_id);
   } else {
-    PRINT("Memory correlation_id %u cannot be found\n", correlation_id);
+    PRINT("Memory correlation_id 0x%lx cannot be found\n", correlation_id);
   }
-  PRINT("Memory CorrelationId %u\n", correlation_id);
+  PRINT("Memory correlation_id 0x%lx\n", correlation_id);
   PRINT("Memory kind %u\n", activity->details.memory.memKind);
   PRINT("Memory bytes %lu\n", activity->details.memory.bytes);
 }
@@ -615,7 +629,7 @@ gpu_instruction_process
  gpu_activity_t *activity
 )
 {
-  uint32_t correlation_id = activity->details.instruction.correlation_id;
+  uint64_t correlation_id = activity->details.instruction.correlation_id;
   ip_normalized_t pc = activity->details.instruction.pc;
   gpu_correlation_id_map_entry_t *cid_map_entry =
     gpu_correlation_id_map_lookup(correlation_id);
@@ -633,7 +647,7 @@ gpu_instruction_process
       attribute_activity(host_op_entry, activity, func_ins);
     }
   }
-  PRINT("Instruction correlation_id %u\n", correlation_id);
+  PRINT("Instruction correlation_id 0x%lx\n", correlation_id);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index d098880708..bc951d8a29 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -181,7 +181,7 @@ typedef enum {
 
 // pc sampling
 typedef struct gpu_pc_sampling_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   ip_normalized_t pc;
   uint32_t samples;
   uint32_t latencySamples;
@@ -190,7 +190,7 @@ typedef struct gpu_pc_sampling_t {
 
 
 typedef struct gpu_pc_sampling_info_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint64_t droppedSamples;
   uint64_t samplingPeriodInCycles;
   uint64_t totalSamples;
@@ -217,7 +217,7 @@ typedef struct gpu_memcpy_t {
   uint64_t end;
   uint64_t bytes;
   uint64_t submit_time;
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint32_t context_id;
   uint32_t stream_id;
   gpu_memcpy_type_t copyKind;
@@ -230,7 +230,9 @@ typedef struct gpu_memory_t {
   uint64_t start;
   uint64_t end;
   uint64_t bytes;
-  uint32_t correlation_id;
+  uint64_t correlation_id;
+  uint32_t context_id;
+  uint32_t stream_id;
   gpu_mem_type_t memKind;
 } gpu_memory_t;
 
@@ -240,7 +242,7 @@ typedef struct gpu_memset_t {
   uint64_t start;
   uint64_t end;
   uint64_t bytes;
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint32_t context_id;
   uint32_t stream_id;
   gpu_mem_type_t memKind;
@@ -252,7 +254,7 @@ typedef struct gpu_kernel_t {
   uint64_t start;
   uint64_t end;
   uint64_t submit_time;
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint32_t device_id;
   uint32_t context_id;
   uint32_t stream_id;
@@ -278,7 +280,7 @@ typedef struct gpu_kernel_block_t {
 typedef struct gpu_cdpkernel_t {
   uint64_t start;
   uint64_t end;
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint32_t context_id;
   uint32_t stream_id;
 } gpu_cdpkernel_t;
@@ -298,7 +300,7 @@ typedef struct gpu_event_t {
 
 
 typedef struct gpu_global_access_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   ip_normalized_t pc;
   uint64_t l2_transactions;
   uint64_t theoreticalL2Transactions;
@@ -308,7 +310,7 @@ typedef struct gpu_global_access_t {
 
 
 typedef struct gpu_local_access_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   ip_normalized_t pc;
   uint64_t sharedTransactions;
   uint64_t theoreticalSharedTransactions;
@@ -318,7 +320,7 @@ typedef struct gpu_local_access_t {
 
 
 typedef struct gpu_branch_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   ip_normalized_t pc;
   uint32_t diverged;
   uint32_t executed;
@@ -328,7 +330,7 @@ typedef struct gpu_branch_t {
 typedef struct gpu_synchronization_t {
   uint64_t start;
   uint64_t end;
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint32_t context_id;
   uint32_t stream_id;
   uint32_t event_id;
@@ -337,7 +339,7 @@ typedef struct gpu_synchronization_t {
 
 
 typedef struct gpu_host_correlation_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   uint64_t host_correlation_id;
 } gpu_host_correlation_t;
 
@@ -352,7 +354,7 @@ typedef struct gpu_interval_t {
 
 
 typedef struct gpu_instruction_t {
-  uint32_t correlation_id;
+  uint64_t correlation_id;
   ip_normalized_t pc;
 } gpu_instruction_t;
 
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
index 2ee9ba1d59..927f9803a2 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
@@ -142,7 +142,7 @@ gpu_correlation_id_map_entry_alloc()
 static gpu_correlation_id_map_entry_t *
 gpu_correlation_id_map_entry_new
 (
- uint32_t gpu_correlation_id, 
+ uint64_t gpu_correlation_id, 
  uint64_t host_correlation_id
 )
 {
@@ -165,7 +165,7 @@ gpu_correlation_id_map_entry_new
 gpu_correlation_id_map_entry_t *
 gpu_correlation_id_map_lookup
 (
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
 )
 {
   uint64_t correlation_id = gpu_correlation_id;
@@ -181,7 +181,7 @@ gpu_correlation_id_map_lookup
 void
 gpu_correlation_id_map_insert
 (
- uint32_t gpu_correlation_id, 
+ uint64_t gpu_correlation_id, 
  uint64_t host_correlation_id
 )
 {
@@ -205,7 +205,7 @@ gpu_correlation_id_map_insert
 void
 gpu_correlation_id_map_external_id_replace
 (
- uint32_t gpu_correlation_id, 
+ uint64_t gpu_correlation_id, 
  uint64_t host_correlation_id
 )
 {
@@ -221,7 +221,7 @@ gpu_correlation_id_map_external_id_replace
 void
 gpu_correlation_id_map_delete
 (
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
 )
 {
   gpu_correlation_id_map_entry_t *node = st_delete(&map_root, gpu_correlation_id);
@@ -232,7 +232,7 @@ gpu_correlation_id_map_delete
 void
 gpu_correlation_id_map_kernel_update
 (
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
  uint32_t device_id,
  uint64_t start,
  uint64_t end
@@ -280,7 +280,7 @@ gpu_correlation_id_map_entry_end_get
 }
 
 
-uint32_t
+uint64_t
 gpu_correlation_id_map_entry_device_id_get
 (
  gpu_correlation_id_map_entry_t *entry
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.h b/src/tool/hpcrun/gpu/gpu-correlation-id-map.h
index 6eee92061f..75ccc2bf4f 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.h
@@ -71,14 +71,14 @@ typedef struct cct_node_t cct_node_t;
 gpu_correlation_id_map_entry_t *
 gpu_correlation_id_map_lookup
 (
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
 );
 
 
 void
 gpu_correlation_id_map_insert
 (
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
  uint64_t host_correlation_id
 );
 
@@ -86,14 +86,14 @@ gpu_correlation_id_map_insert
 void
 gpu_correlation_id_map_delete
 (
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
 );
 
 
 void
 gpu_correlation_id_map_external_id_replace
 (
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
  uint64_t host_correlation_id
 );
 
@@ -101,7 +101,7 @@ gpu_correlation_id_map_external_id_replace
 void
 gpu_correlation_id_map_kernel_update
 (
- uint32_t correlation_id,
+ uint64_t correlation_id,
  uint32_t device_id,
  uint64_t start,
  uint64_t end
@@ -129,7 +129,7 @@ gpu_correlation_id_map_entry_end_get
 );
 
 
-uint32_t
+uint64_t
 gpu_correlation_id_map_entry_device_id_get
 (
  gpu_correlation_id_map_entry_t *entry

From fe8dda1f70e42b52d58e052054cfaba8f7182579 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 31 Oct 2021 19:48:26 -0500
Subject: [PATCH 143/177] refine gpu memory op to alloc or delete

---
 src/tool/hpcrun/gpu/gpu-activity-process.c    | 20 +++++++++++++++++--
 src/tool/hpcrun/gpu/gpu-activity.h            |  8 ++++++++
 .../hpcrun/gpu/ompt/ompt-activity-translate.c |  6 ++++--
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index c118fb95f3..da26a331ed 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -199,7 +199,7 @@ gpu_memcpy_process
     }
     gpu_correlation_id_map_delete(correlation_id);
   } else {
-      PRINT("Memcpy copy correlation_id %u cannot be found\n", correlation_id);
+    PRINT("Memcpy copy correlation_id 0x%lx cannot be found\n", correlation_id);
   }
   PRINT("Memcpy copy correlation_id 0x%lx\n", correlation_id);
   PRINT("Memcpy copy kind %u\n", activity->details.memcpy.copyKind);
@@ -589,6 +589,22 @@ gpu_event_process
   PRINT("GPU event %u\n", event_id);
 }
 
+static gpu_placeholder_type_t
+gpu_memory_placeholder // map a memory activity's mem_op to its placeholder type
+(
+ gpu_activity_t *activity
+)
+{
+  gpu_mem_op_t mem_op = activity->details.memory.mem_op;
+  switch(mem_op) {
+  case GPU_MEM_OP_ALLOC: return gpu_placeholder_type_alloc;
+  case GPU_MEM_OP_DELETE: return gpu_placeholder_type_delete;
+  default:
+    assert(0); // any other value (e.g. GPU_MEM_OP_UNKNOWN) is treated as a bug
+  }
+  return gpu_placeholder_type_alloc; // reached only when assert is compiled out
+}
+
 
 static void
 gpu_memory_process
@@ -605,7 +621,7 @@ gpu_memory_process
     gpu_host_correlation_map_entry_t *host_op_entry =
       gpu_host_correlation_map_lookup(external_id);
     if (host_op_entry != NULL) {
-      gpu_placeholder_type_t ph = gpu_placeholder_type_alloc;
+      gpu_placeholder_type_t ph = gpu_memory_placeholder(activity);
       cct_node_t *host_op_node =
         gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph);
       assert(host_op_node != NULL);
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index bc951d8a29..32f74a7e96 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -179,6 +179,13 @@ typedef enum {
 } gpu_mem_type_t;
 
 
+typedef enum {
+  GPU_MEM_OP_ALLOC        = 0, // memory allocation operation
+  GPU_MEM_OP_DELETE       = 1, // memory deletion operation
+  GPU_MEM_OP_UNKNOWN      = 2  // operation kind is not known
+} gpu_mem_op_t;
+
+
 // pc sampling
 typedef struct gpu_pc_sampling_t {
   uint64_t correlation_id;
@@ -234,6 +241,7 @@ typedef struct gpu_memory_t {
   uint32_t context_id;
   uint32_t stream_id;
   gpu_mem_type_t memKind;
+  gpu_mem_op_t mem_op;
 } gpu_memory_t;
 
 
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
index 3d65f00cf0..d3c3657081 100644
--- a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
@@ -131,6 +131,7 @@ convert_memory
 (
   gpu_activity_t *ga,
   ompt_record_ompt_t *r,
+  gpu_mem_op_t mem_op,
   uint64_t *cid_ptr
 )
 {
@@ -139,6 +140,7 @@ convert_memory
   ga->kind = GPU_ACTIVITY_MEMORY;
   ga->details.memory.memKind = GPU_MEM_UNKNOWN;
   ga->details.memory.correlation_id = d->host_op_id;
+  ga->details.memory.mem_op = mem_op;
   *cid_ptr = d->host_op_id;
 
   ga->details.memory.bytes = d->bytes;
@@ -153,7 +155,7 @@ convert_alloc
   uint64_t *cid_ptr
 )
 {
-  convert_memory(ga,r, cid_ptr);
+  convert_memory(ga, r, GPU_MEM_OP_ALLOC, cid_ptr);
 }
 
 
@@ -165,7 +167,7 @@ convert_delete
   uint64_t *cid_ptr
 )
 {
-  convert_memory(ga,r, cid_ptr);
+  convert_memory(ga, r, GPU_MEM_OP_DELETE, cid_ptr);
 }
 
 

From 2126d7d7457fae11f396a1a21ed93cf00720e8c9 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Mon, 29 Nov 2021 23:23:11 -0600
Subject: [PATCH 144/177] add new gpu=openmp argument to set up openmp target
 monitoring

---
 .../hpcrun/sample-sources/openmp-target.c     | 194 ++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 src/tool/hpcrun/sample-sources/openmp-target.c

diff --git a/src/tool/hpcrun/sample-sources/openmp-target.c b/src/tool/hpcrun/sample-sources/openmp-target.c
new file mode 100644
index 0000000000..7aa46462e6
--- /dev/null
+++ b/src/tool/hpcrun/sample-sources/openmp-target.c
@@ -0,0 +1,194 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <alloca.h>
+#include <assert.h>
+#include <ctype.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ucontext.h>
+#include <stdbool.h>
+
+#include <pthread.h>
+
+#ifndef HPCRUN_STATIC_LINK
+#include <dlfcn.h>
+#endif
+
+
+
+//******************************************************************************
+// libmonitor
+//******************************************************************************
+
+#include <monitor.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "amd.h"
+
+#include "libdl.h"
+
+#include "simple_oo.h"
+#include "sample_source_obj.h"
+#include "common.h"
+
+#include <hpcrun/control-knob.h>
+#include <hpcrun/device-finalizers.h>
+#include <hpcrun/gpu/amd/roctracer-api.h>
+#include <hpcrun/gpu/gpu-activity.h>
+#include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-trace.h>
+#include <hpcrun/hpcrun_options.h>
+#include <hpcrun/hpcrun_stats.h>
+#include <hpcrun/metrics.h>
+#include <hpcrun/module-ignore-map.h>
+#include <hpcrun/ompt/ompt-interface.h>
+#include <hpcrun/safe-sampling.h>
+#include <hpcrun/sample_sources_registered.h>
+#include <hpcrun/sample_event.h>
+#include <hpcrun/thread_data.h>
+#include <hpcrun/trace.h>
+
+#include <utilities/tokenize.h>
+#include <messages/messages.h>
+#include <lush/lush-backtrace.h>
+#include <lib/prof-lean/hpcrun-fmt.h>
+
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define OPENMP_TARGET "gpu=openmp" // event name tested by supports_event
+
+static device_finalizer_fn_entry_t device_finalizer_shutdown; // NOTE(review): not referenced below in this file — confirm it is needed
+static device_finalizer_fn_entry_t device_trace_finalizer_shutdown; // NOTE(review): not referenced below in this file — confirm it is needed
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+static void
+METHOD_FN(init) // init hook: only records the INIT state on this sample source
+{
+    self->state = INIT;
+}
+
+
+static void
+METHOD_FN(thread_init) // thread_init hook: does nothing beyond the TMSG call
+{
+    TMSG(CUDA, "thread_init"); // NOTE(review): uses the CUDA message category in the OpenMP source
+}
+
+
+static void
+METHOD_FN(thread_init_action) // thread_init_action hook: does nothing beyond the TMSG call
+{
+    TMSG(CUDA, "thread_init_action"); // NOTE(review): uses the CUDA message category in the OpenMP source
+}
+
+
+static void
+METHOD_FN(start) // start hook: stores START in this source's ss_state slot
+{
+    TMSG(CUDA, "start");
+    TD_GET(ss_state)[self->sel_idx] = START; // slot is selected by self->sel_idx
+}
+
+
+static void
+METHOD_FN(thread_fini_action) // thread_fini_action hook: does nothing beyond the TMSG call
+{
+    TMSG(CUDA, "thread_fini_action"); // NOTE(review): uses the CUDA message category in the OpenMP source
+}
+
+
+static void
+METHOD_FN(stop) // stop hook: stores STOP in this source's ss_state slot
+{
+    hpcrun_get_thread_data(); // NOTE(review): return value is discarded — confirm this call is still needed
+
+    TD_GET(ss_state)[self->sel_idx] = STOP; // slot is selected by self->sel_idx
+}
+
+
+static void
+METHOD_FN(shutdown) // shutdown hook: only resets this sample source's state to UNINIT
+{
+    self->state = UNINIT;
+}
+
+
+static bool
+METHOD_FN(supports_event, const char *ev_str) // reports whether ev_str names the OPENMP_TARGET event
+{
+#ifndef HPCRUN_STATIC_LINK
+    return hpcrun_ev_is(ev_str, OPENMP_TARGET); // decision is delegated to hpcrun_ev_is
+#else
+    return false; // builds with HPCRUN_STATIC_LINK defined never accept the event
+#endif
+
+
+}
+
+static void
+METHOD_FN(process_event_list, int lush_metrics) // lush_metrics is not used in this body
+{
+    int nevents = (self->evl).nevents; // read only for the TMSG call below
+    gpu_metrics_default_enable();
+    hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG);
+    TMSG(CUDA,"nevents = %d", nevents);
+}
+
+static void
+METHOD_FN(finalize_event_list) // calls gpu_metrics_default_enable(), then gpu_trace_init()
+{
+    gpu_metrics_default_enable(); // NOTE(review): also called in process_event_list — confirm both calls are needed
+    gpu_trace_init();
+}
+
+
+static void
+METHOD_FN(gen_event_set,int lush_metrics) // empty hook: the body does nothing and lush_metrics is unused
+{
+
+}
+
+
+static void
+METHOD_FN(display_events) // prints the help entry for the OPENMP_TARGET event to stdout
+{
+  printf("===========================================================================\n");
+  printf("Available OpenMP target offloading events\n");
+  printf("===========================================================================\n");
+  printf("Name\t\tDescription\n");
+  printf("---------------------------------------------------------------------------\n");
+  printf("%s\t\tOperation-level monitoring of OpenMP offloading.\n"
+	 "\t\tCollect timing information on GPU kernel invocations,\n"
+	 "\t\tmemory copies, etc.\n",
+	 OPENMP_TARGET);
+  printf("\n");
+}
+
+
+
+//**************************************************************************
+// object
+//**************************************************************************
+
+#define ss_name openmp_gpu
+#define ss_cls SS_HARDWARE
+
+#include "ss_obj.h"

From 3f08f5c41107d0b5ea722d0249076842083d2e55 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Mon, 29 Nov 2021 23:24:52 -0600
Subject: [PATCH 145/177] turn off activity monitoring

---
 src/tool/hpcrun/gpu/gpu-activity-process.c     | 2 +-
 src/tool/hpcrun/gpu/gpu-activity.c             | 2 +-
 src/tool/hpcrun/gpu/gpu-host-correlation-map.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index da26a331ed..5f515a4026 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -73,7 +73,7 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 1
+#define DEBUG 0
 
 #include "gpu-print.h"
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c
index a05a5d8f74..5e50a32e3c 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.c
+++ b/src/tool/hpcrun/gpu/gpu-activity.c
@@ -66,7 +66,7 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 1
+#define DEBUG 0
 
 
 #define FORALL_OPENCL_KINDS(macro)					\
diff --git a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
index b30fc59993..462d9bd790 100644
--- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
+++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
@@ -68,7 +68,7 @@
 // macros
 //******************************************************************************
 
-#define DEBUG 1
+#define DEBUG 0
 
 #include "gpu-print.h"
 

From 227314dd7571df68662afa16fbcd31a1a4832759 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Mon, 29 Nov 2021 23:26:28 -0600
Subject: [PATCH 146/177] adjust white space

---
 src/tool/hpcrun/gpu/gpu-correlation-id-map.c  | 18 +++----
 .../hpcrun/gpu/ompt/ompt-activity-translate.c | 14 ++---
 src/tool/hpcrun/ompt/ompt-device.c            | 52 +++++++++----------
 src/tool/hpcrun/sample-sources/ss-list.h      | 14 ++---
 4 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
index 927f9803a2..0a667d696e 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
@@ -109,7 +109,7 @@ typedef struct typed_splay_node(correlation_id) {
   uint32_t device_id;
   uint64_t start;
   uint64_t end;
-} typed_splay_node(correlation_id); 
+} typed_splay_node(correlation_id);
 
 
 
@@ -142,13 +142,13 @@ gpu_correlation_id_map_entry_alloc()
 static gpu_correlation_id_map_entry_t *
 gpu_correlation_id_map_entry_new
 (
- uint64_t gpu_correlation_id, 
+ uint64_t gpu_correlation_id,
  uint64_t host_correlation_id
 )
 {
   gpu_correlation_id_map_entry_t *e = gpu_correlation_id_map_entry_alloc();
 
-  memset(e, 0, sizeof(gpu_correlation_id_map_entry_t)); 
+  memset(e, 0, sizeof(gpu_correlation_id_map_entry_t));
 
   e->gpu_correlation_id = gpu_correlation_id;
   e->host_correlation_id = host_correlation_id;
@@ -171,7 +171,7 @@ gpu_correlation_id_map_lookup
   uint64_t correlation_id = gpu_correlation_id;
   gpu_correlation_id_map_entry_t *result = st_lookup(&map_root, correlation_id);
 
-  PRINT("correlation_id map lookup: id=0x%lx (record %p)\n", 
+  PRINT("correlation_id map lookup: id=0x%lx (record %p)\n",
        correlation_id, result);
 
   return result;
@@ -181,21 +181,21 @@ gpu_correlation_id_map_lookup
 void
 gpu_correlation_id_map_insert
 (
- uint64_t gpu_correlation_id, 
+ uint64_t gpu_correlation_id,
  uint64_t host_correlation_id
 )
 {
-  if (st_lookup(&map_root, gpu_correlation_id)) { 
+  if (st_lookup(&map_root, gpu_correlation_id)) {
     // fatal error: correlation_id already present; a
     // correlation should be inserted only once.
     assert(0);
   } else {
-    gpu_correlation_id_map_entry_t *entry = 
+    gpu_correlation_id_map_entry_t *entry =
       gpu_correlation_id_map_entry_new(gpu_correlation_id, host_correlation_id);
 
     st_insert(&map_root, entry);
 
-    PRINT("correlation_id_map insert: correlation_id=0x%lx external_id=%ld (entry=%p)\n", 
+    PRINT("correlation_id_map insert: correlation_id=0x%lx external_id=%ld (entry=%p)\n",
 	  gpu_correlation_id, host_correlation_id, entry);
   }
 }
@@ -205,7 +205,7 @@ gpu_correlation_id_map_insert
 void
 gpu_correlation_id_map_external_id_replace
 (
- uint64_t gpu_correlation_id, 
+ uint64_t gpu_correlation_id,
  uint64_t host_correlation_id
 )
 {
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
index d3c3657081..adbc21c008 100644
--- a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
@@ -121,7 +121,7 @@ convert_target
 #if 0
   printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=%lu codeptr=%p\n",
 	 target_rec.kind, target_rec.endpoint, target_rec.device_num,
-	 target_rec.task_id, target_rec.target_id, 
+	 target_rec.task_id, target_rec.target_id,
 #endif
 }
 
@@ -207,7 +207,7 @@ convert_memcpy
   TMSG(OMPT_ACTIVITY, "Memcpy copy kind %u", d->optype);
   TMSG(OMPT_ACTIVITY, "Memcpy copy bytes %lu", d->bytes);
 
-  
+
   ga->details.memcpy.context_id = r->contextId;
   ga->details.memcpy.stream_id = r->streamId;
 #endif
@@ -271,7 +271,7 @@ convert_target_data_op
 	 d->dest_addr, d->dest_device_num,
 #endif
 
-  gpu_interval_set(&ga->details.interval, r->time, d->end_time); 
+  gpu_interval_set(&ga->details.interval, r->time, d->end_time);
 }
 
 
@@ -321,7 +321,7 @@ convert_target_submit
   ga->details.kernel.blockSharedMemory = blockSharedMemory;
 #endif
 
-  gpu_interval_set(&ga->details.interval, r->time, k->end_time); 
+  gpu_interval_set(&ga->details.interval, r->time, k->end_time);
 }
 
 
@@ -352,18 +352,18 @@ ompt_activity_translate
 
     convert_target_data_op(ga,r, cid_ptr);
     break;
-      
+
   case ompt_callback_target_submit:
   case ompt_callback_target_submit_emi:
 
     convert_target_submit(ga,r, cid_ptr);
     break;
-      
+
   default:
     convert_unknown(ga, r, cid_ptr);
     break;
   }
-  
+
 
   cstack_ptr_set(&(ga->next), 0);
 }
diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index 1463b5ca37..88c764ea21 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -2,7 +2,7 @@
 
 // * BeginRiceCopyright *****************************************************
 //
-// $HeadURL$ 
+// $HeadURL$
 // $Id$
 //
 // --------------------------------------------------------------------------
@@ -101,7 +101,7 @@
   typedef return_type (*OMPT_API_FNTYPE(fn)) args
 
 #define OMPT_TARGET_API_FUNCTION(return_type, fn, args)  \
-  OMPT_API_FUNCTION(return_type, fn, args) 
+  OMPT_API_FUNCTION(return_type, fn, args)
 
 #define FOREACH_OMPT_TARGET_FN(macro) \
   macro(ompt_get_device_time) \
@@ -114,7 +114,7 @@
   macro(ompt_get_record_type) \
   macro(ompt_get_record_ompt) \
   macro(ompt_get_record_abstract) \
-  macro(ompt_advance_buffer_cursor) 
+  macro(ompt_advance_buffer_cursor)
 
 #define BUFFER_EMPTY(record, buffer, bytes) (((char *) record) >= (((char *)buffer) + bytes))
 
@@ -237,7 +237,7 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint,
 }
 
 
-void 
+void
 ompt_bind_names(ompt_function_lookup_t lookup)
 {
 #define ompt_bind_name(fn) \
@@ -252,7 +252,7 @@ ompt_bind_names(ompt_function_lookup_t lookup)
 
 #define BUFFER_SIZE (1024 * 1024 * 8)
 
-static void 
+static void
 ompt_buffer_request
 (
  int device_id,
@@ -266,7 +266,7 @@ ompt_buffer_request
 }
 
 
-static void 
+static void
 ompt_buffer_release
 (
  ompt_buffer_t *buffer
@@ -282,7 +282,7 @@ ompt_dump
  ompt_record_ompt_t *r
 )
 {
-  if (r) { 
+  if (r) {
     printf("r=%p type=%d time=%lu thread_id=%lu target_id=0x%lx\n",
 	   r, r->type, r->time, r->thread_id, r->target_id);
 
@@ -402,7 +402,7 @@ ompt_finalize_trace
 
 
 
-static void 
+static void
 ompt_buffer_complete
 (
  int device_id,
@@ -451,7 +451,7 @@ ompt_trace_configure(ompt_device_t *device)
 {
   // indicate desired monitoring
   ompt_set_trace_ompt(device, 1, 0);
-  
+
   // turn on monitoring previously indicated
   ompt_start_trace(device, ompt_buffer_request,
 		   ompt_buffer_complete);
@@ -476,14 +476,14 @@ ompt_device_initialize(int device_num,
 }
 
 
-void 
+void
 ompt_device_finalize(int device_num)
 {
   PRINT("ompt_device_finalize id=%d\n", device_num);
 }
 
 
-void 
+void
 ompt_device_load(int device_num,
                  const char *filename,
                  int64_t file_offset,
@@ -501,7 +501,7 @@ ompt_device_load(int device_num,
 }
 
 
-void 
+void
 ompt_device_unload(int device_num,
                    uint64_t module_id)
 {
@@ -509,19 +509,19 @@ ompt_device_unload(int device_num,
 }
 
 
-static int 
+static int
 get_load_module
 (
   cct_node_t *node
 )
 {
-  cct_addr_t *addr = hpcrun_cct_addr(target_node); 
+  cct_addr_t *addr = hpcrun_cct_addr(node); // use the argument, not the file-scope target_node
   ip_normalized_t ip = addr->ip_norm;
   return ip.lm_id;
 }
 
 
-void 
+void
 ompt_target_callback_emi
 (
   ompt_target_t kind,
@@ -559,19 +559,19 @@ ompt_target_callback_emi
   td->overhead++;
   // NOTE(keren): hpcrun_safe_enter prevent self interruption
   hpcrun_safe_enter();
-  
+
   int skip_this_frame = 1; // omit this procedure frame on the call path
-  target_node = 
-    hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr, 
-                           skip_this_frame, 1, NULL).sample_node; 
+  target_node =
+    hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr,
+                           skip_this_frame, 1, NULL).sample_node;
 
   // the load module for the runtime library that supports offloading
-  int lm = get_load_module(target_node); 
+  int lm = get_load_module(target_node);
 
 #if 0
-  // drop nodes on the call chain until we find one that is not in the load 
+  // drop nodes on the call chain until we find one that is not in the load
   // module for runtime library that supports offloading
-  for (;;) { 
+  for (;;) {
     target_node = hpcrun_cct_parent(target_node);
     if (get_load_module(target_node) != lm) break;
   }
@@ -585,7 +585,7 @@ ompt_target_callback_emi
   macro(op, ompt_target_data_alloc, ompt_tgt_alloc)		     \
   macro(op, ompt_target_data_delete, ompt_tgt_delete)		     \
   macro(op, ompt_target_data_transfer_to_device, ompt_tgt_copyin)    \
-  macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout) 
+  macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout)
 
 void
 ompt_data_op_callback_emi
@@ -606,16 +606,16 @@ ompt_data_op_callback_emi
   if (endpoint == ompt_scope_end) return;
 
   uint64_t target_id = target_data->value;
-  uint64_t op_id = *host_op_id = gpu_correlation_id(); 
+  uint64_t op_id = *host_op_id = gpu_correlation_id();
 
   PRINT("ompt_data_op enter->target_id 0x%lx\n", target_id);
   ompt_placeholder_t op = ompt_placeholders.ompt_tgt_none;
-  switch (optype) {                       
+  switch (optype) {
 #define ompt_op_macro(op, ompt_op_type, ompt_op_class) \
     case ompt_op_type:                                 \
       op = ompt_placeholders.ompt_op_class;                              \
       break;
-    
+
     FOREACH_OMPT_DATA_OP(ompt_op_macro);
 
 #undef ompt_op_macro
diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index dea2fb16c4..bb90085dd8 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -48,7 +48,7 @@
 //******************************************************************************
 // File: ss-list.h
 //
-// Purpose: 
+// Purpose:
 //   This file contains a list of sample sources wrapped by a call to an
 //   unspecified macro. The intended use of this file is to define the
 //   macro, include the file elsewhere one or more times to register the
@@ -61,21 +61,21 @@
 #include <include/hpctoolkit-config.h>
 
 SAMPLE_SOURCE_DECL_MACRO(ga)
-SAMPLE_SOURCE_DECL_MACRO(io)  
+SAMPLE_SOURCE_DECL_MACRO(io)
 #ifdef ENABLE_CLOCK_REALTIME
-SAMPLE_SOURCE_DECL_MACRO(itimer)  
+SAMPLE_SOURCE_DECL_MACRO(itimer)
 #endif
 
 #ifdef HPCRUN_SS_LINUX_PERF
-SAMPLE_SOURCE_DECL_MACRO(linux_perf)  
+SAMPLE_SOURCE_DECL_MACRO(linux_perf)
 #endif
 
-SAMPLE_SOURCE_DECL_MACRO(memleak)  
+SAMPLE_SOURCE_DECL_MACRO(memleak)
 
-SAMPLE_SOURCE_DECL_MACRO(none)  
+SAMPLE_SOURCE_DECL_MACRO(none)
 
 #ifdef HPCRUN_SS_PAPI
-SAMPLE_SOURCE_DECL_MACRO(papi)  
+SAMPLE_SOURCE_DECL_MACRO(papi)
 #endif
 
 SAMPLE_SOURCE_DECL_MACRO(directed_blame)

From df5fee71e218dd26399865f9891bb4aac9ee745b Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Wed, 28 Apr 2021 15:58:57 -0500
Subject: [PATCH 147/177] compilable version rocprofiler

---
 configure                                 | 139 +++++-
 configure.ac                              | 118 ++++-
 src/tool/hpcrun/Makefile.am               |   1 +
 src/tool/hpcrun/Makefile.in               |  18 +-
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 564 ++++++++++++++++++++++
 src/tool/hpcrun/gpu/amd/rocprofiler-api.h |  90 ++++
 src/tool/hpcrun/gpu/amd/roctracer-api.c   |   7 +
 src/tool/hpcrun/sample-sources/amd.c      |  36 +-
 8 files changed, 949 insertions(+), 24 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/amd/rocprofiler-api.c
 create mode 100644 src/tool/hpcrun/gpu/amd/rocprofiler-api.h

diff --git a/configure b/configure
index 61bf14b9eb..d95189361b 100755
--- a/configure
+++ b/configure
@@ -1081,6 +1081,8 @@ with_rocm
 with_rocm_hip
 with_rocm_dbgapi
 with_rocm_tracer
+with_rocm_profiler
+with_rocm_hsa
 with_level0
 enable_data_centric_tracing
 enable_devtools
@@ -1839,6 +1841,9 @@ Optional Packages:
   --with-rocm-hip=PATH    path to hip install directory
   --with-rocm-dbgapi=PATH path to rocm-dbgapi install directory
   --with-rocm-tracer=PATH path to roctracer-dev install directory
+  --with-rocm-profiler=PATH
+                          path to rocprofiler-dev install directory
+  --with-rocm-hsa=PATH    path to hsa-dev install directory
   --with-level0=PATH      use given Level Zero installation (absolute path)
                           with hpcrun (default is NO)
   --with-valgrind=PATH    path to Valgrind install directory
@@ -24383,6 +24388,8 @@ ROCM=
 ROCM_HIP=
 ROCM_DBGAPI=
 ROCM_TRACER=
+ROCM_PROFILER=
+ROCM_HSA=
 
 
 # Check whether --with-rocm was given.
@@ -24412,17 +24419,39 @@ if test "${with_rocm_tracer+set}" = set; then :
 fi
 
 
+
+# Check whether --with-rocm-profiler was given.
+if test "${with_rocm_profiler+set}" = set; then :
+  withval=$with_rocm_profiler; ROCM_PROFILER="$withval"
+fi
+
+
+
+# Check whether --with-rocm-hsa was given.
+if test "${with_rocm_hsa+set}" = set; then :
+  withval=$with_rocm_hsa; ROCM_HSA="$withval"
+fi
+
+
+
+
 ROCM_HIP_IFLAGS=
 ROCM_DBGAPI_IFLAGS=
 ROCM_TRACER_IFLAGS=
+ROCM_PROFILER_IFLAGS=
+ROCM_HSA_IFLAGS=
 
 ROCM_HIP_LD_DIR=
 ROCM_DBGAPI_LD_DIR=
 ROCM_TRACER_LD_DIR=
+ROCM_PROFILER_LD_DIR=
+ROCM_HSA_LD_DIR=
 
 ROCM_HIP_MESG=
 ROCM_DBGAPI_MESG=
 ROCM_TRACER_MESG=
+ROCM_PROFILER_MESG=
+ROCM_HSA_MESG=
 
 require_rocm=no
 
@@ -24485,6 +24514,38 @@ $as_echo "$as_me: found $ROCM/roctracer/lib/libroctracer64.so" >&6;}
       found=yes
     fi
 
+    # ROCPROFILER
+    if test -f "$ROCM/rocprofiler/include/rocprofiler.h" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/rocprofiler/include/rocprofiler.h" >&5
+$as_echo "$as_me: found $ROCM/rocprofiler/include/rocprofiler.h" >&6;}
+      ROCM_PROFILER_IFLAGS="-I$ROCM/rocprofiler/include"
+      ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+      found=yes
+    fi
+    if test -f "$ROCM/rocprofiler/lib/librocprofiler64.so" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/rocprofiler/lib/librocprofiler64.so" >&5
+$as_echo "$as_me: found $ROCM/rocprofiler/lib/librocprofiler64.so" >&6;}
+      ROCM_PROFILER_LD_DIR="$ROCM/rocprofiler/lib"
+      ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+      found=yes
+    fi
+
+    # HSA
+    if test -f "$ROCM/hsa/include/hsa/hsa.h" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/hsa/include/hsa/hsa.h" >&5
+$as_echo "$as_me: found $ROCM/hsa/include/hsa/hsa.h" >&6;}
+      ROCM_HSA_IFLAGS="-I$ROCM/hsa/include/hsa"
+      ROCM_HSA_MESG="$ROCM/hsa"
+      found=yes
+    fi
+    if test -f "$ROCM/hsa/lib/libhsa-runtime64.so" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/hsa/lib/libhsa-runtime64.so" >&5
+$as_echo "$as_me: found $ROCM/hsa/lib/libhsa-runtime64.so" >&6;}
+      ROCM_HSA_LD_DIR="$ROCM/hsa/lib"
+      ROCM_HSA_MESG="$ROCM/hsa"
+      found=yes
+    fi
+
     # warn if given dir has nothing useful
     if test "$found" = no ; then
       { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM" >&5
@@ -24586,6 +24647,64 @@ $as_echo "$as_me: WARNING: found nothing useful in $ROCM_TRACER" >&2;}
     ;;
 esac
 
+case "$ROCM_PROFILER" in
+  /* )
+    require_rocm=yes
+    found=no
+
+    if test -f "$ROCM_PROFILER/rocprofiler/include/rocprofiler.h" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h" >&5
+$as_echo "$as_me: found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h" >&6;}
+      ROCM_PROFILER_IFLAGS="-I$ROCM_PROFILER/rocprofiler/include"
+      ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+      found=yes
+    fi
+    if test -f "$ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" >&5
+$as_echo "$as_me: found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" >&6;}
+      ROCM_PROFILER_LD_DIR="$ROCM_PROFILER/rocprofiler/lib"
+      ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+      found=yes
+    fi
+    if test "$found" = no ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM_PROFILER" >&5
+$as_echo "$as_me: WARNING: found nothing useful in $ROCM_PROFILER" >&2;}
+    fi
+    ;;
+  * )
+    ROCM_PROFILER=no
+    ;;
+esac
+
+case "$ROCM_HSA" in
+  /* )
+    require_rocm=yes
+    found=no
+
+    if test -f "$ROCM_HSA/include/hsa/hsa.h" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_HSA/include/hsa/hsa.h" >&5
+$as_echo "$as_me: found $ROCM_HSA/include/hsa/hsa.h" >&6;}
+      ROCM_HSA_IFLAGS="-I$ROCM_HSA/include/hsa"
+      ROCM_HSA_MESG="$ROCM_HSA"
+      found=yes
+    fi
+    if test -f "$ROCM_HSA/lib/libhsa-runtime64.so" ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_HSA/lib/libhsa-runtime64.so" >&5
+$as_echo "$as_me: found $ROCM_HSA/lib/libhsa-runtime64.so" >&6;}
+      ROCM_HSA_LD_DIR="$ROCM_HSA/lib"
+      ROCM_HSA_MESG="$ROCM_HSA"
+      found=yes
+    fi
+    if test "$found" = no ; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM_HSA" >&5
+$as_echo "$as_me: WARNING: found nothing useful in $ROCM_HSA" >&2;}
+    fi
+    ;;
+  * )
+    ROCM_HSA=no
+    ;;
+esac
+
 #
 # Check that we found all the pieces.
 #
@@ -24602,6 +24721,12 @@ then
   if test "x$ROCM_TRACER_IFLAGS" = x ; then
     as_fn_error $? "unable to find roctracer_hip.h" "$LINENO" 5
   fi
+  if test "x$ROCM_PROFILER_IFLAGS" = x ; then
+    as_fn_error $? "unable to find rocprofiler.h" "$LINENO" 5
+  fi
+  if test "x$ROCM_HSA_IFLAGS" = x ; then
+    as_fn_error $? "unable to find hsa.h" "$LINENO" 5
+  fi
 
   if test "x$ROCM_HIP_LD_DIR" = x ; then
     as_fn_error $? "unable to find libamdhip64.so" "$LINENO" 5
@@ -24612,10 +24737,16 @@ then
   if test "x$ROCM_TRACER_LD_DIR" = x ; then
     as_fn_error $? "unable to find libroctracer64.so" "$LINENO" 5
   fi
+  if test "x$ROCM_PROFILER_LD_DIR" = x ; then
+    as_fn_error $? "unable to find librocprofiler64.so" "$LINENO" 5
+  fi
+  if test "x$ROCM_HSA_LD_DIR" = x ; then
+    as_fn_error $? "unable to find libhsa-runtime64.so" "$LINENO" 5
+  fi
 
   OPT_HAVE_ROCM=yes
-  OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS  $ROCM_DBGAPI_IFLAGS  $ROCM_TRACER_IFLAGS"
-  OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}"
+  OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS  $ROCM_DBGAPI_IFLAGS  $ROCM_TRACER_IFLAGS $ROCM_PROFILER_IFLAGS $ROCM_HSA_IFLAGS"
+  OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}:${ROCM_PROFILER_LD_DIR}:${ROCM_HSA_LD_DIR}"
 fi
 
 #
@@ -28008,6 +28139,10 @@ $as_echo "$as_me:   rocm hip:     $ROCM_HIP_MESG" >&6;}
 $as_echo "$as_me:   rocm dbgapi:  $ROCM_DBGAPI_MESG" >&6;}
   { $as_echo "$as_me:${as_lineno-$LINENO}:   rocm tracer:  $ROCM_TRACER_MESG" >&5
 $as_echo "$as_me:   rocm tracer:  $ROCM_TRACER_MESG" >&6;}
+  { $as_echo "$as_me:${as_lineno-$LINENO}:   rocm profiler:$ROCM_PROFILER_MESG" >&5
+$as_echo "$as_me:   rocm profiler:$ROCM_PROFILER_MESG" >&6;}
+  { $as_echo "$as_me:${as_lineno-$LINENO}:   rocm hsa:     $ROCM_HSA_MESG" >&5
+$as_echo "$as_me:   rocm hsa:     $ROCM_HSA_MESG" >&6;}
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}:   valgrind:     ${VALGRIND}" >&5
 $as_echo "$as_me:   valgrind:     ${VALGRIND}" >&6;}
diff --git a/configure.ac b/configure.ac
index ad15c447bf..40edba48b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5148,6 +5148,8 @@ ROCM=
 ROCM_HIP=
 ROCM_DBGAPI=
 ROCM_TRACER=
+ROCM_PROFILER=
+ROCM_HSA=
 
 AC_ARG_WITH([rocm],
   AS_HELP_STRING([--with-rocm=PATH],
@@ -5169,17 +5171,35 @@ AC_ARG_WITH([rocm-tracer],
       [path to roctracer-dev install directory]),
   [ROCM_TRACER="$withval"], [])
 
+AC_ARG_WITH([rocm-profiler],
+  AS_HELP_STRING([--with-rocm-profiler=PATH],
+      [path to rocprofiler-dev install directory]),
+  [ROCM_PROFILER="$withval"], [])
+
+AC_ARG_WITH([rocm-hsa],
+  AS_HELP_STRING([--with-rocm-hsa=PATH],
+      [path to hsa-dev install directory]),
+  [ROCM_HSA="$withval"], [])
+
+
+
 ROCM_HIP_IFLAGS=
 ROCM_DBGAPI_IFLAGS=
 ROCM_TRACER_IFLAGS=
+ROCM_PROFILER_IFLAGS=
+ROCM_HSA_IFLAGS=
 
 ROCM_HIP_LD_DIR=
 ROCM_DBGAPI_LD_DIR=
 ROCM_TRACER_LD_DIR=
+ROCM_PROFILER_LD_DIR=
+ROCM_HSA_LD_DIR=
 
 ROCM_HIP_MESG=
 ROCM_DBGAPI_MESG=
 ROCM_TRACER_MESG=
+ROCM_PROFILER_MESG=
+ROCM_HSA_MESG=
 
 require_rocm=no
 
@@ -5235,6 +5255,34 @@ case "$ROCM" in
       found=yes
     fi
 
+    # ROCPROFILER
+    if test -f "$ROCM/rocprofiler/include/rocprofiler.h" ; then
+      AC_MSG_NOTICE([found $ROCM/rocprofiler/include/rocprofiler.h])
+      ROCM_PROFILER_IFLAGS="-I$ROCM/rocprofiler/include"
+      ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+      found=yes
+    fi
+    if test -f "$ROCM/rocprofiler/lib/librocprofiler64.so" ; then
+      AC_MSG_NOTICE([found $ROCM/rocprofiler/lib/librocprofiler64.so])
+      ROCM_PROFILER_LD_DIR="$ROCM/rocprofiler/lib"
+      ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+      found=yes
+    fi
+
+    # HSA
+    if test -f "$ROCM/hsa/include/hsa/hsa.h" ; then
+      AC_MSG_NOTICE([found $ROCM/hsa/include/hsa/hsa.h])
+      ROCM_HSA_IFLAGS="-I$ROCM/hsa/include/hsa"
+      ROCM_HSA_MESG="$ROCM/hsa"
+      found=yes
+    fi
+    if test -f "$ROCM/hsa/lib/libhsa-runtime64.so" ; then
+      AC_MSG_NOTICE([found $ROCM/hsa/lib/libhsa-runtime64.so])
+      ROCM_HSA_LD_DIR="$ROCM/hsa/lib"
+      ROCM_HSA_MESG="$ROCM/hsa"
+      found=yes
+    fi
+
     # warn if given dir has nothing useful
     if test "$found" = no ; then
       AC_MSG_WARN([found nothing useful in $ROCM])
@@ -5326,6 +5374,58 @@ case "$ROCM_TRACER" in
     ;;
 esac
 
+case "$ROCM_PROFILER" in
+  /* )
+    require_rocm=yes
+    found=no
+
+    if test -f "$ROCM_PROFILER/rocprofiler/include/rocprofiler.h" ; then
+      AC_MSG_NOTICE([found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h])
+      ROCM_PROFILER_IFLAGS="-I$ROCM_PROFILER/rocprofiler/include"
+      ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+      found=yes
+    fi
+    if test -f "$ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" ; then
+      AC_MSG_NOTICE([found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so])
+      ROCM_PROFILER_LD_DIR="$ROCM_PROFILER/rocprofiler/lib"
+      ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+      found=yes
+    fi
+    if test "$found" = no ; then
+      AC_MSG_WARN([found nothing useful in $ROCM_PROFILER])
+    fi
+    ;;
+  * )
+    ROCM_PROFILER=no
+    ;;
+esac
+
+case "$ROCM_HSA" in
+  /* )
+    require_rocm=yes
+    found=no
+
+    if test -f "$ROCM_HSA/include/hsa/hsa.h" ; then
+      AC_MSG_NOTICE([found $ROCM_HSA/include/hsa/hsa.h])
+      ROCM_HSA_IFLAGS="-I$ROCM_HSA/include/hsa"
+      ROCM_HSA_MESG="$ROCM_HSA"
+      found=yes
+    fi
+    if test -f "$ROCM_HSA/lib/libhsa-runtime64.so" ; then
+      AC_MSG_NOTICE([found $ROCM_HSA/lib/libhsa-runtime64.so])
+      ROCM_HSA_LD_DIR="$ROCM_HSA/lib"
+      ROCM_HSA_MESG="$ROCM_HSA"
+      found=yes
+    fi
+    if test "$found" = no ; then
+      AC_MSG_WARN([found nothing useful in $ROCM_HSA])
+    fi
+    ;;
+  * )
+    ROCM_HSA=no
+    ;;
+esac
+
 #
 # Check that we found all the pieces.
 #
@@ -5342,6 +5442,12 @@ then
   if test "x$ROCM_TRACER_IFLAGS" = x ; then
     AC_MSG_ERROR([unable to find roctracer_hip.h])
   fi
+  if test "x$ROCM_PROFILER_IFLAGS" = x ; then
+    AC_MSG_ERROR([unable to find rocprofiler.h])
+  fi
+  if test "x$ROCM_HSA_IFLAGS" = x ; then
+    AC_MSG_ERROR([unable to find hsa.h])
+  fi
 
   if test "x$ROCM_HIP_LD_DIR" = x ; then
     AC_MSG_ERROR([unable to find libamdhip64.so])
@@ -5352,10 +5458,16 @@ then
   if test "x$ROCM_TRACER_LD_DIR" = x ; then
     AC_MSG_ERROR([unable to find libroctracer64.so])
   fi
+  if test "x$ROCM_PROFILER_LD_DIR" = x ; then
+    AC_MSG_ERROR([unable to find librocprofiler64.so])
+  fi
+  if test "x$ROCM_HSA_LD_DIR" = x ; then
+    AC_MSG_ERROR([unable to find libhsa-runtime64.so])
+  fi
 
   OPT_HAVE_ROCM=yes
-  OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS  $ROCM_DBGAPI_IFLAGS  $ROCM_TRACER_IFLAGS"
-  OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}"
+  OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS  $ROCM_DBGAPI_IFLAGS  $ROCM_TRACER_IFLAGS $ROCM_PROFILER_IFLAGS $ROCM_HSA_IFLAGS"
+  OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}:${ROCM_PROFILER_LD_DIR}:${ROCM_HSA_LD_DIR}"
 fi
 
 #
@@ -5800,6 +5912,8 @@ if test "$OPT_HAVE_ROCM" = yes ; then
   AC_MSG_NOTICE([  rocm hip:     $ROCM_HIP_MESG])
   AC_MSG_NOTICE([  rocm dbgapi:  $ROCM_DBGAPI_MESG])
   AC_MSG_NOTICE([  rocm tracer:  $ROCM_TRACER_MESG])
+  AC_MSG_NOTICE([  rocm profiler:$ROCM_PROFILER_MESG])
+  AC_MSG_NOTICE([  rocm hsa:     $ROCM_HSA_MESG])
 fi
 AC_MSG_NOTICE([  valgrind:     ${VALGRIND}])
 AC_MSG_NOTICE([  valgrind:     annotated: ${OPT_ENABLE_VG_ANNOTATIONS}])
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index a30590f819..94c52bd066 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -544,6 +544,7 @@ MY_ROCM_FILES =\
 	sample-sources/amd.c \
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c \
+	gpu/amd/rocprofiler-api.c \
 	gpu/amd/rocm-debug-api.c \
 	gpu/amd/rocm-binary-processing.c
 endif
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 13aaf4d908..42f42ec545 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -539,9 +539,9 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
 	sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \
-	gpu/amd/roctracer-api.c gpu/amd/rocm-debug-api.c \
-	gpu/amd/rocm-binary-processing.c sample-sources/level0.c \
-	gpu/level0/level0-api.c \
+	gpu/amd/roctracer-api.c gpu/amd/rocprofiler-api.c \
+	gpu/amd/rocm-debug-api.c gpu/amd/rocm-binary-processing.c \
+	sample-sources/level0.c gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-context-map.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -748,6 +748,7 @@ am__objects_35 =
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocprofiler-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-debug-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-binary-processing.lo
 @OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
@@ -2001,6 +2002,7 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c \
+@OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocprofiler-api.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-debug-api.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-binary-processing.c
 
@@ -3007,6 +3009,8 @@ gpu/amd/libhpcrun_la-roctracer-activity-translate.lo:  \
 	gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/$(am__dirstamp) \
 	gpu/amd/$(DEPDIR)/$(am__dirstamp)
+gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/$(am__dirstamp) \
+	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/$(am__dirstamp) \
 	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-rocm-binary-processing.lo:  \
@@ -4003,6 +4007,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo@am__quote@
@@ -5660,6 +5665,13 @@ gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/roctracer-api.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-roctracer-api.lo `test -f 'gpu/amd/roctracer-api.c' || echo '$(srcdir)/'`gpu/amd/roctracer-api.c
 
+gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/rocprofiler-api.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocprofiler-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Tpo -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/amd/rocprofiler-api.c' object='gpu/amd/libhpcrun_la-rocprofiler-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c
+
 gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/rocm-debug-api.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-debug-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
new file mode 100644
index 0000000000..c03307e7c2
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -0,0 +1,564 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "rocprofiler-api.h"
+// #include "rocm-debug-api.h"
+#include "rocm-binary-processing.h"
+
+#include <roctracer_hip.h>
+#include <rocprofiler.h>
+
+#include <hpcrun/gpu/gpu-activity-channel.h>
+#include <hpcrun/gpu/gpu-activity-process.h>
+#include <hpcrun/gpu/gpu-correlation-channel.h>
+#include <hpcrun/gpu/gpu-correlation-id-map.h>
+#include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-monitoring-thread-api.h>
+#include <hpcrun/gpu/gpu-application-thread-api.h>
+#include <hpcrun/gpu/gpu-op-placeholders.h>
+
+#include <hpcrun/safe-sampling.h>
+#include <hpcrun/sample-sources/libdl.h>
+
+#include <hpcrun/utilities/hpcrun-nanotime.h>
+
+// #include <lib/prof-lean/stdatomic.h>
+#include <pthread.h>
+
+#define DEBUG 0
+
+#include "hpcrun/gpu/gpu-print.h"
+//******************************************************************************
+// macros
+//******************************************************************************
+
+
+#define PUBLIC_API __attribute__((visibility("default")))
+
+#if 0
+Returned API status:
+- hsa_status_t - HSA status codes are used from hsa.h header
+
+Loading and Configuring, loadable plugin on-load/unload methods:
+- rocprofiler_settings_t – global properties
+- OnLoadTool 
+- OnLoadToolProp
+- OnUnloadTool
+
+Info API:
+- rocprofiler_info_kind_t - profiling info kind
+- rocprofiler_info_query_t - profiling info query
+- rocprofiler_info_data_t - profiling info data
+- rocprofiler_get_info - return the info for a given info kind
+- rocprofiler_iterate_info - iterate over the info for a given info kind
+- rocprofiler_query_info - iterate over the info for a given info query
+
+Context API:
+- rocprofiler_t - profiling context handle
+- rocprofiler_feature_kind_t - profiling feature kind
+- rocprofiler_feature_parameter_t - profiling feature parameter
+- rocprofiler_data_kind_t - profiling data kind
+- rocprofiler_data_t - profiling data
+- rocprofiler_feature_t - profiling feature
+- rocprofiler_mode_t - profiling modes
+- rocprofiler_properties_t - profiler properties
+- rocprofiler_open - open new profiling context
+- rocprofiler_close - close profiling context and release all allocated resources
+- rocprofiler_group_count - return profiling groups count
+- rocprofiler_get_group - return profiling group for a given index
+- rocprofiler_get_metrics - method for calculating the metrics data
+- rocprofiler_iterate_trace_data - method for iterating output trace data instances
+- rocprofiler_time_id_t - supported time value ID enumeration
+- rocprofiler_get_time – return time for a given time ID and profiling timestamp value
+
+Sampling API:
+- rocprofiler_start - start profiling
+- rocprofiler_stop - stop profiling
+- rocprofiler_read - read profiling data to the profiling features objects
+- rocprofiler_get_data - wait for profiling data
+  Group versions of start/stop/read/get_data methods:
+  o rocprofiler_group_start
+  o rocprofiler_group_stop
+  o rocprofiler_group_read
+  o rocprofiler_group_get_data
+
+Intercepting API:
+- rocprofiler_callback_t - profiling callback type
+- rocprofiler_callback_data_t - profiling callback data type
+- rocprofiler_dispatch_record_t – dispatch record
+- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy
+- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks
+- rocprofiler_remove_queue_callbacks - remove queue callbacks
+
+Context pool API:
+- rocprofiler_pool_t – context pool handle
+- rocprofiler_pool_entry_t – context pool entry
+- rocprofiler_pool_properties_t – context pool properties
+- rocprofiler_pool_handler_t – context pool completion handler
+- rocprofiler_pool_open - context pool open
+- rocprofiler_pool_close - context pool close
+- rocprofiler_pool_fetch – fetch an empty context entry from the pool
+- rocprofiler_pool_release – release a context entry
+- rocprofiler_pool_iterate – iterate over fetched context entries
+- rocprofiler_pool_flush – flush completed context entries
+#endif
+
+
+#define FORALL_ROCPROFILER_ROUTINES(macro)			\
+  macro(rocprofiler_open)   \
+  macro(rocprofiler_close)   \
+  macro(rocprofiler_get_metrics) \
+  macro(rocprofiler_set_queue_callbacks) \
+  macro(rocprofiler_start_queue_callbacks) \
+  macro(rocprofiler_stop_queue_callbacks) \
+  macro(rocprofiler_remove_queue_callbacks) 
+  
+
+
+#define ROCPROFILER_FN_NAME(f) DYN_FN_NAME(f)
+
+#define ROCPROFILER_FN(fn, args) \
+  static hsa_status_t (*ROCPROFILER_FN_NAME(fn)) args
+
+// Call late-bound rocprofiler routine `fn` with `args`; on failure print rocprofiler's error text and abort.
+#define HPCRUN_ROCPROFILER_CALL(fn, args) do { \
+  hsa_status_t status = ROCPROFILER_FN_NAME(fn) args; \
+  if (status != HSA_STATUS_SUCCESS) { \
+    const char* error_string = NULL; \
+    rocprofiler_error_string(&error_string); \
+    fprintf(stderr, "ERROR: %s\n", error_string ? error_string : "(no error string)"); \
+    abort(); \
+  } \
+} while (0)
+
+
+typedef const char* (*hip_kernel_name_fnt)(const hipFunction_t f);
+typedef const char* (*hip_kernel_name_ref_fnt)(const void* hostFunction, hipStream_t stream);
+
+
+// Per-dispatch record filled by dispatch_callback and later printed by dump_context_entry
+typedef struct {
+  bool valid;                        // set to true once every other field has been filled in
+  hsa_agent_t agent;                 // agent copied from the callback data
+  rocprofiler_group_t group;         // copy of the profiling group obtained for the dispatch
+  rocprofiler_callback_data_t data;  // copy of the callback data; kernel_name is a strdup'ed copy
+}context_entry_t;
+
+// Context callback arg (not referenced by the code in this file)
+typedef struct {
+  rocprofiler_pool_t** pools;
+}callbacks_arg_t;
+
+// Handler callback arg: a feature array together with its element count
+typedef struct {
+  rocprofiler_feature_t* features;
+  unsigned feature_count;
+}handler_arg_t;
+
+
+
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+static hip_kernel_name_fnt hip_kernel_name_fn;
+static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn;
+pthread_mutex_t mutex_context;
+
+//----------------------------------------------------------
+// rocprofiler function pointers for late binding
+//----------------------------------------------------------
+
+ROCPROFILER_FN
+(
+ rocprofiler_open,
+ (
+    hsa_agent_t agent,			// GPU handle
+    rocprofiler_feature_t* features,	// [in/out] profiling feature array
+    uint32_t feature_count,			// profiling feature count
+    rocprofiler_t** context,		// [out] profiling context handle
+    uint32_t mode,				// profiling mode mask
+    rocprofiler_properties_t* properties	// profiler properties
+ )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_close,
+  (
+	  rocprofiler_t* context		// [in] profiling context  
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_get_metrics,
+	(
+    rocprofiler_t* context		// [in/out] profiling context
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_set_queue_callbacks,
+  (
+    rocprofiler_queue_callbacks_t callbacks,           // callbacks
+    void* data
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_start_queue_callbacks,
+  (
+    void
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_stop_queue_callbacks,
+  (
+    void
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_remove_queue_callbacks,
+  (
+    void
+  )
+);
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static const char *
+rocprofiler_path
+(
+ void
+)
+{
+  // Bare library name with no directory part; rocprofiler_bind hands it to CHK_DLOPEN
+  const char *path = "librocprofiler64.so";
+  return path;
+}
+
+// Build the metric list to collect: stores a heap-allocated array in *ret (the caller receives ownership) and returns its length.
+unsigned metrics_input(rocprofiler_feature_t** ret) {
+  // Profiling feature objects, zero-filled by calloc
+  const unsigned feature_count = 6;
+  rocprofiler_feature_t* features = (rocprofiler_feature_t*) calloc(feature_count, sizeof(rocprofiler_feature_t));
+  if (features == NULL) { perror("calloc"); abort(); }  // do not write through NULL when out of memory
+
+  // PMC events
+  features[0].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+  features[0].name = "GRBM_COUNT";
+  features[1].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+  features[1].name = "GRBM_GUI_ACTIVE";
+  features[2].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+  features[2].name = "GPUBusy";
+  features[3].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+  features[3].name = "SQ_WAVES";
+  features[4].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+  features[4].name = "SQ_INSTS_VALU";
+  features[5].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+  features[5].name = "VALUInsts";
+//  features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+//  features[6].name = "TCC_HIT_sum";
+//  features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+//  features[7].name = "TCC_MISS_sum";
+//  features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+//  features[8].name = "WRITE_SIZE";
+
+  *ret = features;
+  return feature_count;
+}
+
+
+// Print one stored context entry to stdout: kernel identity, dispatch timestamps, then each metric value
+static void dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, unsigned feature_count) {
+  volatile bool* valid = &entry->valid;  // poll the live flag; a one-time copy would never change
+  while (*valid == false) sched_yield();
+
+  const char *kernel_name = entry->data.kernel_name;
+  const rocprofiler_dispatch_record_t* record = entry->data.record;
+
+  fflush(stdout);
+  fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%u) queue-id(%u)) ", // gpu-id(%u) ",
+    entry->data.kernel_object,
+    kernel_name,
+    entry->data.thread_id,
+    entry->data.queue_id);
+    // HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index);
+  if (record) fprintf(stdout, "time(%lu,%lu,%lu,%lu)",
+    record->dispatch,
+    record->begin,
+    record->end,
+    record->complete);
+  fprintf(stdout, "\n");
+  fflush(stdout);
+
+  rocprofiler_group_t *group = &entry->group;
+  if (group->context == NULL) {
+    EMSG("error: AMD group->context = NULL");
+  }
+  if (feature_count > 0) {
+    HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (group));
+    HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (group->context));
+  }
+
+  for (unsigned i = 0; i < feature_count; ++i) {
+    const rocprofiler_feature_t* p = &features[i];
+    fprintf(stdout, ">  %s ", p->name);
+    switch (p->data.kind) {
+      // Output metrics results
+      case ROCPROFILER_DATA_KIND_INT64:
+        fprintf(stdout, "= (%lu)\n", p->data.result_int64);
+        break;
+      default:
+        fprintf(stderr, "Undefined data kind(%u)\n", p->data.kind);
+        abort();
+    }
+  }
+}
+
+
+// Profiling completion handler; arg is the context_entry_t registered as handler_arg
+// Dumps the entry while holding mutex_context; the entry is not freed here
+// Always returns false — NOTE(review): confirm how rocprofiler interprets the return value
+static bool context_handler1(rocprofiler_group_t group, void* arg) {
+  context_entry_t* ctx_entry = (context_entry_t*)arg;
+
+  if (pthread_mutex_lock(&mutex_context) != 0) {
+    perror("pthread_mutex_lock");
+    abort();
+  }
+
+  rocprofiler_feature_t* features = ctx_entry->group.features[0];
+  unsigned feature_count = ctx_entry->group.feature_count;
+  dump_context_entry(ctx_entry, features, feature_count);
+
+  if (pthread_mutex_unlock(&mutex_context) != 0) {
+    perror("pthread_mutex_unlock");
+    abort();
+  }
+
+  return false;
+}
+
+
+// Kernel dispatch callback: opens a profiling context for the dispatched kernel and records it in a
+// heap-allocated context_entry_t, which is registered as the argument of context_handler1.
+hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg,
+                               rocprofiler_group_t* group) {
+  printf("Rocprofiler dispatch_callback\n\n");
+  // Passed tool data
+  hsa_agent_t agent = callback_data->agent;
+
+  // Allocate the entry first; report failure instead of dereferencing NULL
+  context_entry_t* entry = (context_entry_t*) malloc(sizeof(context_entry_t));
+  if (entry == NULL) return HSA_STATUS_ERROR;
+  // malloc leaves the flag indeterminate: clear it so the entry only reads as valid after the fill below
+  entry->valid = false;
+
+  // Open profiling context with the entry as handler argument
+  rocprofiler_t* context = NULL;
+  rocprofiler_properties_t properties = {};
+  properties.handler = context_handler1;
+  properties.handler_arg = (void*)entry;
+
+  rocprofiler_feature_t *features;
+  unsigned feature_count = metrics_input(&features);
+
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_open, (agent, features, feature_count,
+                            &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties));
+
+  // Get group[0]
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group));
+
+  // Fill profiling context entry
+  entry->agent = agent;
+  entry->group = *group;
+  entry->data = *callback_data;
+  entry->data.kernel_name = strdup(callback_data->kernel_name);
+  entry->valid = true;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+
+// One-time setup, called from OnLoadToolProp: create the recursive mutex_context,
+// then register dispatch_callback as rocprofiler's queue dispatch callback.
+static void initialize() {
+  // Initialize recursive mutex_context before registering anything:
+  // context_handler1 locks it, so it must exist before a callback can fire
+  pthread_mutexattr_t Attr;
+  pthread_mutexattr_init(&Attr);
+  pthread_mutexattr_settype(&Attr, PTHREAD_MUTEX_RECURSIVE);
+  pthread_mutex_init(&mutex_context, &Attr);
+  // The attribute object is not needed once the mutex has been created
+  pthread_mutexattr_destroy(&Attr);
+
+  // No feature list or handler argument is built here: dispatch_callback
+  // creates its own feature list for every dispatch, and the callback
+  // user-data pointer passed below is NULL
+  rocprofiler_queue_callbacks_t callbacks_ptrs = {};
+  callbacks_ptrs.dispatch = dispatch_callback;
+  // NOTE(review): the returned hsa_status_t is ignored
+  rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
+
+}
+
+// Exported (default visibility) entry point, necessary for rocprofiler callbacks to work: prints a trace line and runs initialize(); settings is unused
+extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
+  printf("Rocprofiler OnLoadToolProp______________________\n");  
+  initialize();
+}
+
+
+static void cleanup() {
+  // Unregister the dispatch callback installed by initialize(); the returned status is ignored
+  rocprofiler_remove_queue_callbacks();
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Calls the late-bound rocprofiler_start_queue_callbacks; HPCRUN_ROCPROFILER_CALL aborts the process if it fails
+void rocprofiler_start_kernel(){
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_start_queue_callbacks, ());
+}
+
+// Calls the late-bound rocprofiler_stop_queue_callbacks; HPCRUN_ROCPROFILER_CALL aborts the process if it fails
+void rocprofiler_stop_kernel(){
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_stop_queue_callbacks, ());
+}
+
+
+void
+rocprofiler_init
+(
+ void
+)
+{
+  printf("Rocprofiler INIT\n");  
+  // initialize() is not called here; in this file it runs from OnLoadToolProp
+  return; 
+}
+
+// Finalization hook: prints a trace line and unregisters the queue callbacks via cleanup(); args and how are unused
+void
+rocprofiler_fini
+(
+ void *args,
+ int how
+)
+{
+  printf("Rocprofiler FINI\n");  
+  cleanup();
+  return;
+}
+
+// Load librocprofiler64.so, libkfdwrapper64.so and libamdhip64.so, then bind every routine in FORALL_ROCPROFILER_ROUTINES and two HIP kernel-name lookups.
+// Returns DYNAMIC_BINDING_STATUS_OK on success; DYNAMIC_BINDING_STATUS_ERROR if a HIP symbol is missing or when built with HPCRUN_STATIC_LINK (CHK_DLOPEN/CHK_DLSYM failure handling is defined by those macros, not visible here).
+int
+rocprofiler_bind
+(
+ void
+)
+{
+//   // This is a workaround for roctracer to not hang when taking timer interrupts
+//   // More details: https://github.com/ROCm-Developer-Tools/roctracer/issues/22
+//   setenv("HSA_ENABLE_INTERRUPT", "0", 1);
+
+  // if (rocm_debug_api_bind() != DYNAMIC_BINDING_STATUS_OK) {
+  //   return DYNAMIC_BINDING_STATUS_ERROR;
+  // }
+
+#ifndef HPCRUN_STATIC_LINK
+  // dynamic libraries only available in non-static case
+  hpcrun_force_dlopen(true);
+  CHK_DLOPEN(rocprofiler, rocprofiler_path(), RTLD_NOW | RTLD_GLOBAL);
+  // Somehow roctracer needs libkfdwrapper64.so, but does not really load it.
+  // So, we load it before using any function in roctracer.
+  CHK_DLOPEN(kfd, "libkfdwrapper64.so", RTLD_NOW | RTLD_GLOBAL);
+
+  CHK_DLOPEN(hip, "libamdhip64.so", RTLD_NOW | RTLD_GLOBAL);
+  hpcrun_force_dlopen(false);
+
+#define ROCPROFILER_BIND(fn) \
+  CHK_DLSYM(rocprofiler, fn);
+
+  FORALL_ROCPROFILER_ROUTINES(ROCPROFILER_BIND);
+
+#undef ROCPROFILER_BIND
+
+  dlerror();  // clear any stale error state before the lookup
+  hip_kernel_name_fn = (hip_kernel_name_fnt) dlsym(hip, "hipKernelNameRef");
+  if (hip_kernel_name_fn == 0) {
+    return DYNAMIC_BINDING_STATUS_ERROR;
+  }
+
+  dlerror();  // clear any stale error state before the lookup
+  hip_kernel_name_ref_fn = (hip_kernel_name_ref_fnt) dlsym(hip, "hipKernelNameRefByPtr");
+  if (hip_kernel_name_ref_fn == 0) {
+    return DYNAMIC_BINDING_STATUS_ERROR;
+  }
+
+  return DYNAMIC_BINDING_STATUS_OK;
+#else
+  return DYNAMIC_BINDING_STATUS_ERROR;
+#endif // ! HPCRUN_STATIC_LINK
+}
+
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
new file mode 100644
index 0000000000..ad785b037f
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -0,0 +1,90 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef rocprofiler_api_h
+#define rocprofiler_api_h
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+void 
+rocprofiler_start_kernel
+(
+  void
+);
+
+
+void 
+rocprofiler_stop_kernel
+(
+	void
+);
+
+
+void
+rocprofiler_init
+(
+ void
+);
+
+
+void
+rocprofiler_fini
+(
+ void *args,
+ int how
+);
+
+// Bind rocprofiler and HIP symbols; returns a DYNAMIC_BINDING_STATUS_* code.
+int
+rocprofiler_bind
+(
+  void
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index eaef31a6a6..cd8b6357ae 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -66,6 +66,9 @@
 
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 
+
+#include "rocprofiler-api.h"
+
 //******************************************************************************
 // macros
 //******************************************************************************
@@ -433,6 +436,10 @@ roctracer_subscriber_callback
     // Generate notification entry
     uint64_t cpu_submit_time = hpcrun_nanotime();
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+
+    rocprofiler_start_kernel();
+  }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
+    rocprofiler_end_kernel();
   }
 }
 
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index df906d73a6..963c8f4df5 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -43,6 +43,7 @@
 #include <hpcrun/control-knob.h>
 #include <hpcrun/device-finalizers.h>
 #include <hpcrun/gpu/amd/roctracer-api.h>
+#include <hpcrun/gpu/amd/rocprofiler-api.h>
 #include <hpcrun/gpu/gpu-activity.h>
 #include <hpcrun/gpu/gpu-metrics.h>
 #include <hpcrun/gpu/gpu-trace.h>
@@ -72,7 +73,8 @@
 
 #define AMD_ROCM "gpu=amd"
 
-static device_finalizer_fn_entry_t device_finalizer_flush;
+static device_finalizer_fn_entry_t device_finalizer_roctracer_shutdown;
+static device_finalizer_fn_entry_t device_finalizer_rocprofiler_shutdown;
 static device_finalizer_fn_entry_t device_trace_finalizer_shutdown;
 
 
@@ -169,22 +171,22 @@ METHOD_FN(finalize_event_list)
   char* evlist = METHOD_CALL(self, get_event_str);
   char* event = start_tok(evlist);
 #endif
-  roctracer_init();
-
-  // Register flush function to turn off roctracer and flush traces 
-  // NOTE: this is a registered as a flush callback because is MUST precede 
-  //       GPU trace finalization, which is registered as a shutdown callback
-  device_finalizer_flush.fn = roctracer_fini;
-  device_finalizer_register(device_finalizer_type_flush, 
-                            &device_finalizer_flush);
-
-  // initialize gpu tracing 
-  gpu_trace_init();
-
-  // Register shutdown function to finalize gpu tracing and write trace files
-  device_trace_finalizer_shutdown.fn = gpu_trace_fini;
-  device_finalizer_register(device_finalizer_type_shutdown, 
-                            &device_trace_finalizer_shutdown);
+    roctracer_init();
+    rocprofiler_init();
+
+    // Init records
+    gpu_trace_init();
+
+    device_finalizer_roctracer_shutdown.fn = roctracer_fini;
+    device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_roctracer_shutdown);
+
+    device_finalizer_rocprofiler_shutdown.fn = rocprofiler_fini;
+    device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_rocprofiler_shutdown);
+
+
+    // Register shutdown functions to write trace files
+    device_trace_finalizer_shutdown.fn = gpu_trace_fini;
+    device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
 }
 
 

From 46517d10066a5ee6cb95ff3032a44aa06840ed98 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Tue, 4 May 2021 14:54:57 -0500
Subject: [PATCH 148/177] OnLoadToolProp added to HPCRUN_NAMES

---
 src/tool/hpcrun/Makefile.am               | 2 +-
 src/tool/hpcrun/Makefile.in               | 2 +-
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 8 ++++----
 src/tool/hpcrun/gpu/amd/roctracer-api.c   | 4 ++--
 src/tool/hpcrun/sample-sources/amd.c      | 4 ++++
 5 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 94c52bd066..8da0e62326 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -1178,7 +1178,7 @@ endif
 # Don't use LDFLAGS for static case.
 
 MONITOR_NAMES = -G 'monitor_*'
-HPCRUN_NAMES =  -G 'hpcrun_*' -G 'hpctoolkit_*'
+HPCRUN_NAMES =  -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp'
 MISC_NAMES =    -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool
 OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*'
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 42f42ec545..3328ad2ade 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -2336,7 +2336,7 @@ MY_AGENT_TBB_CFLAGS = \
 # and hidden into libhpcrun.o.  Other dependencies go into hpclink.
 # Don't use LDFLAGS for static case.
 MONITOR_NAMES = -G 'monitor_*'
-HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*'
+HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp'
 MISC_NAMES = -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool
 OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*'
 DYN_SYSCALL_LIST = poll ppoll pselect select __sysv_signal
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index c03307e7c2..eedc061fb7 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -435,7 +435,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
 }
 
 
-static void initialize() {
+static void rocp_inicialize() {
   // Getting profiling features
   rocprofiler_feature_t* features = NULL;
   unsigned feature_count = metrics_input(&features);
@@ -449,7 +449,7 @@ static void initialize() {
   callbacks_ptrs.dispatch = dispatch_callback;
   rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
 
-  // Initialize recursive mutex_context
+  // rocp_inicialize recursive mutex_context
   pthread_mutexattr_t Attr;
   pthread_mutexattr_init(&Attr);
   pthread_mutexattr_settype(&Attr, PTHREAD_MUTEX_RECURSIVE);
@@ -460,7 +460,7 @@ static void initialize() {
 // This is necessary for rocprofiler callback to work
 extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
   printf("Rocprofiler OnLoadToolProp______________________\n");  
-  initialize();
+  rocp_inicialize();
 }
 
 
@@ -493,7 +493,7 @@ rocprofiler_init
 )
 {
   printf("Rocprofiler INIT\n");  
-  // initialize();
+  // rocp_inicialize();
   return; 
 }
 
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index cd8b6357ae..2b2de014c0 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -82,7 +82,7 @@
   macro(roctracer_enable_domain_activity_expl) \
   macro(roctracer_disable_domain_callback) \
   macro(roctracer_disable_domain_activity) \
-  macro(roctracer_set_properties)
+  macro(roctracer_set_properties) 
 
 
 #define ROCTRACER_FN_NAME(f) DYN_FN_NAME(f)
@@ -439,7 +439,7 @@ roctracer_subscriber_callback
 
     rocprofiler_start_kernel();
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
-    rocprofiler_end_kernel();
+    rocprofiler_stop_kernel();
   }
 }
 
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index 963c8f4df5..3c9d0af6e9 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -163,6 +163,10 @@ METHOD_FN(finalize_event_list)
     EEMSG("hpcrun: unable to bind to AMD roctracer library %s\n", dlerror());
     monitor_real_exit(-1);
   }
+  if (rocprofiler_bind() != DYNAMIC_BINDING_STATUS_OK) {
+    EEMSG("hpcrun: unable to bind to AMD rocprofiler library %s\n", dlerror());
+    monitor_real_exit(-1);
+  }
 #endif
 
 #if 0

From 32713b09b7b1b0e9fff4f3d0cc17326ce4c3abd7 Mon Sep 17 00:00:00 2001
From: dejangrubisic <grubisic.dejan@yahoo.com>
Date: Thu, 27 May 2021 13:32:52 -0500
Subject: [PATCH 149/177] Added -G 'OnLoad' -G 'OnUnloadTool'

---
 src/tool/hpcrun/Makefile.am               | 2 +-
 src/tool/hpcrun/Makefile.in               | 2 +-
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 8 ++++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 8da0e62326..21fc762469 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -1178,7 +1178,7 @@ endif
 # Don't use LDFLAGS for static case.
 
 MONITOR_NAMES = -G 'monitor_*'
-HPCRUN_NAMES =  -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp'
+HPCRUN_NAMES =  -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp' -G 'OnLoad' -G 'OnUnloadTool'
 MISC_NAMES =    -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool
 OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*'
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 3328ad2ade..5ec7507f33 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -2336,7 +2336,7 @@ MY_AGENT_TBB_CFLAGS = \
 # and hidden into libhpcrun.o.  Other dependencies go into hpclink.
 # Don't use LDFLAGS for static case.
 MONITOR_NAMES = -G 'monitor_*'
-HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp'
+HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp' -G 'OnLoad' -G 'OnUnloadTool'
 MISC_NAMES = -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool
 OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*'
 DYN_SYSCALL_LIST = poll ppoll pselect select __sysv_signal
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index eedc061fb7..221e87e1d2 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -463,6 +463,14 @@ extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
   rocp_inicialize();
 }
 
+extern PUBLIC_API void OnUnloadTool(){
+  printf("Rocprofiler OnUnloadTool______________________\n");  
+  // rocp_inicialize();
+}
+
+extern PUBLIC_API void OnLoad(){
+  printf("Rocprofiler OnLoad______________________\n");  
+}
 
 static void cleanup() {
   // Unregister dispatch callback

From e587f2941d23e4f39093b65b035cadafa7087920 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Sat, 29 May 2021 09:43:35 -0500
Subject: [PATCH 150/177] Prototype support for rocprofiler

---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c  | 227 +++++++++------------
 src/tool/hpcrun/gpu/amd/rocprofiler-api.h  |  11 +-
 src/tool/hpcrun/gpu/amd/roctracer-api.c    |  30 ++-
 src/tool/hpcrun/gpu/gpu-activity-process.c |  37 ++++
 src/tool/hpcrun/gpu/gpu-activity.h         |  10 +-
 src/tool/hpcrun/gpu/gpu-metrics.c          |  43 +++-
 src/tool/hpcrun/gpu/gpu-metrics.h          |   8 +
 src/tool/hpcrun/sample-sources/amd.c       |   1 +
 8 files changed, 224 insertions(+), 143 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 221e87e1d2..8846f479ff 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -85,7 +85,7 @@ Returned API status:
 
 Loading and Configuring, loadable plugin on-load/unload methods:
 - rocprofiler_settings_t – global properties
-- OnLoadTool 
+- OnLoadTool
 - OnLoadToolProp
 - OnUnloadTool
 
@@ -94,7 +94,7 @@ Info API:
 - rocprofiler_info_query_t - profiling info query
 - rocprofiler_info_data_t - profiling info data
 - rocprofiler_get_info - return the info for a given info kind
-- rocprofiler_iterote_inf_ - iterate over the info for a given info kind 
+- rocprofiler_iterote_inf_ - iterate over the info for a given info kind
 - rocprofiler_query_info - iterate over the info for a given info query
 
 Context API:
@@ -155,8 +155,8 @@ Context pool API:
   macro(rocprofiler_set_queue_callbacks) \
   macro(rocprofiler_start_queue_callbacks) \
   macro(rocprofiler_stop_queue_callbacks) \
-  macro(rocprofiler_remove_queue_callbacks) 
-  
+  macro(rocprofiler_remove_queue_callbacks)
+
 
 
 #define ROCPROFILER_FN_NAME(f) DYN_FN_NAME(f)
@@ -167,11 +167,11 @@ Context pool API:
 #define HPCRUN_ROCPROFILER_CALL(fn, args) \
 {      \
   hsa_status_t status = ROCPROFILER_FN_NAME(fn) args;	\
-  if (status != HSA_STATUS_SUCCESS) {		\    
+  if (status != HSA_STATUS_SUCCESS) {		\
     const char* error_string = NULL; \
     rocprofiler_error_string(&error_string); \
     fprintf(stderr, "ERROR: %s\n", error_string); \
-    abort(); \    
+    abort(); \
   }						\
 }
 
@@ -179,35 +179,25 @@ Context pool API:
 typedef const char* (*hip_kernel_name_fnt)(const hipFunction_t f);
 typedef const char* (*hip_kernel_name_ref_fnt)(const void* hostFunction, hipStream_t stream);
 
-
-// Context stored entry type
 typedef struct {
   bool valid;
   hsa_agent_t agent;
   rocprofiler_group_t group;
   rocprofiler_callback_data_t data;
-}context_entry_t;
-
-// Context callback arg
-typedef struct {
-  rocprofiler_pool_t** pools;
-}callbacks_arg_t;
-
-// Handler callback arg
-typedef struct {
-  rocprofiler_feature_t* features;
-  unsigned feature_count;
-}handler_arg_t;
-
-
+} hpcrun_amd_counter_data_t;
 
 //******************************************************************************
 // local variables
 //******************************************************************************
 
+// Currently we serialize kernel execution when collecting counters
+static hpcrun_amd_counter_data_t counter_data;
+static uint64_t rocprofiler_correlation_id;
+static volatile int context_callback_finish;
+
+
 static hip_kernel_name_fnt hip_kernel_name_fn;
 static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn;
-pthread_mutex_t mutex_context;
 
 //----------------------------------------------------------
 // rocprofiler function pointers for late binding
@@ -230,7 +220,7 @@ ROCPROFILER_FN
 (
   rocprofiler_close,
   (
-	  rocprofiler_t* context		// [in] profiling context  
+	  rocprofiler_t* context		// [in] profiling context
   )
 );
 
@@ -294,7 +284,7 @@ rocprofiler_path
 
 unsigned metrics_input(rocprofiler_feature_t** ret) {
   // Profiling feature objects
-  const unsigned feature_count = 6;
+  const unsigned feature_count = 4;
   rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(sizeof(rocprofiler_feature_t) * feature_count);
   memset(features, 0, feature_count * sizeof(rocprofiler_feature_t));
 
@@ -304,172 +294,126 @@ unsigned metrics_input(rocprofiler_feature_t** ret) {
   features[1].kind = ROCPROFILER_FEATURE_KIND_METRIC;
   features[1].name = "GRBM_GUI_ACTIVE";
   features[2].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[2].name = "GPUBusy";
+  features[2].name = "TCC_HIT_sum";
   features[3].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[3].name = "SQ_WAVES";
-  features[4].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[4].name = "SQ_INSTS_VALU";
-  features[5].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[5].name = "VALUInsts";
-//  features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-//  features[6].name = "TCC_HIT_sum";
-//  features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-//  features[7].name = "TCC_MISS_sum";
-//  features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-//  features[8].name = "WRITE_SIZE";
+  features[3].name = "TCC_MISS_sum";
 
   *ret = features;
   return feature_count;
 }
 
+// Profiling completion handler
+// Dump and delete the context entry
+// Return true if the context was dumped successfully
+static bool context_handler1(rocprofiler_group_t group, void* arg) {
+
+  volatile bool valid = counter_data.valid;
+  while (!valid) {
+    sched_yield();
+    valid = counter_data.valid;
+  }
+
+  rocprofiler_feature_t** features = counter_data.group.features;
+  unsigned feature_count = counter_data.group.feature_count;
+
 
-// Dump stored context entry
-static void dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, unsigned feature_count) {
-  volatile bool valid = entry->valid;
-  while (valid == false) sched_yield();
-
-  const char *kernel_name = entry->data.kernel_name;
-  const rocprofiler_dispatch_record_t* record = entry->data.record;
-
-  fflush(stdout);
-  fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%u) queue-id(%u)) ", // gpu-id(%u) ",
-    entry->data.kernel_object,
-    kernel_name,
-    entry->data.thread_id,
-    entry->data.queue_id);
-    // HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index);
-  if (record) fprintf(stdout, "time(%lu,%lu,%lu,%lu)",
-    record->dispatch,
-    record->begin,
-    record->end,
-    record->complete);
-  fprintf(stdout, "\n");
-  fflush(stdout);
-
-  rocprofiler_group_t *group = &entry->group;
-  if (group->context == NULL) {
-    EMSG("error: AMD group->context = NULL");    
+  if (counter_data.group.context == NULL) {
+    EMSG("error: AMD group->context = NULL");
   }
   if (feature_count > 0) {
-    HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (group));  
-    HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (group->context));    
+    //HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (group));
+    rocprofiler_group_get_data(&counter_data.group);
+    HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (counter_data.group.context));
   }
 
-  for (unsigned i = 0; i < feature_count; ++i) {
-    const rocprofiler_feature_t* p = &features[i];
-    fprintf(stdout, ">  %s ", p->name);
-    switch (p->data.kind) {
-      // Output metrics results
-      case ROCPROFILER_DATA_KIND_INT64:
-        fprintf(stdout, "= (%lu)\n", p->data.result_int64);
-        break;
-      default:
-        fprintf(stderr, "Undefined data kind(%u)\n", p->data.kind);
-        abort();
-    }
-  }
-}
+  gpu_monitoring_thread_activities_ready();
 
+  gpu_activity_t ga;
+  memset(&ga, 0, sizeof(gpu_activity_t));
+  cstack_ptr_set(&(ga.next), 0);
 
-// Profiling completion handler
-// Dump and delete the context entry
-// Return true if the context was dumped successfully
-static bool context_handler1(rocprofiler_group_t group, void* arg) {
-  context_entry_t* ctx_entry = (context_entry_t*)arg;
+  ga.kind = GPU_ACTIVITY_COUNTER;
+  ga.details.counters.correlation_id = rocprofiler_correlation_id;
 
-  if (pthread_mutex_lock(&mutex_context) != 0) {
-    perror("pthread_mutex_lock");
-    abort();
+  for (unsigned i = 0; i < feature_count; ++i) {
+    const rocprofiler_feature_t* p = features[i];
+    if (strcmp(p->name, "GRBM_COUNT") == 0) {
+      ga.details.counters.cycles = p->data.result_int64;
+    } else if (strstr(p->name, "TCC_HIT") != NULL) {
+      ga.details.counters.l2_cache_hit += p->data.result_int64;
+    } else if (strstr(p->name, "TCC_MISS") != NULL) {
+      ga.details.counters.l2_cache_miss += p->data.result_int64;
+    }
   }
 
-  rocprofiler_feature_t* features = ctx_entry->group.features[0];
-  unsigned feature_count = ctx_entry->group.feature_count;
-  dump_context_entry(ctx_entry, features, feature_count);
-
-  if (pthread_mutex_unlock(&mutex_context) != 0) {
-    perror("pthread_mutex_unlock");
-    abort();
+  if (gpu_correlation_id_map_lookup(rocprofiler_correlation_id) == NULL) {
+    gpu_correlation_id_map_insert(rocprofiler_correlation_id, rocprofiler_correlation_id);
   }
+  gpu_activity_process(&ga);
 
+  context_callback_finish = 1;
   return false;
 }
 
-
-// Kernel disoatch callback
-hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg,
-                               rocprofiler_group_t* group) {
-
+static hsa_status_t
+dispatch_callback
+(
+  const rocprofiler_callback_data_t* callback_data,
+  void* arg,
+  rocprofiler_group_t* group
+) {
   printf("Rocprofiler dispatch_callback\n\n");
   // Passed tool data
   hsa_agent_t agent = callback_data->agent;
   // HSA status
   hsa_status_t status = HSA_STATUS_ERROR;
 
-  // Open profiling context
-  // context properties
-  context_entry_t* entry = (context_entry_t*) malloc(sizeof(context_entry_t));
   rocprofiler_t* context = NULL;
   rocprofiler_properties_t properties = {};
   properties.handler = context_handler1;
-  properties.handler_arg = (void*)entry;
+  properties.handler_arg = NULL;
 
   rocprofiler_feature_t *features;
   unsigned feature_count = metrics_input(&features);
 
+  counter_data.valid = false;
   HPCRUN_ROCPROFILER_CALL(rocprofiler_open, (agent, features, feature_count,
                             &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties));
-  
+
 
   // Get group[0]
-  HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group));
-  
+  //HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group));
+  rocprofiler_get_group(context, 0, group);
 
   // Fill profiling context entry
-  entry->agent = agent;
-  entry->group = *group;
-  entry->data = *callback_data;
-  entry->data.kernel_name = strdup(callback_data->kernel_name);
-  entry->valid = true;
+  counter_data.agent = agent;
+  counter_data.group = *group;
+  counter_data.data = *callback_data;
+  counter_data.valid = true;
 
   return HSA_STATUS_SUCCESS;
 }
 
 
 static void rocp_inicialize() {
-  // Getting profiling features
-  rocprofiler_feature_t* features = NULL;
-  unsigned feature_count = metrics_input(&features);
-
-  // Handler arg
-  handler_arg_t* handler_arg = (handler_arg_t*) malloc(sizeof(handler_arg_t));
-  handler_arg->features = features;
-  handler_arg->feature_count = feature_count;
-
   rocprofiler_queue_callbacks_t callbacks_ptrs = {};
   callbacks_ptrs.dispatch = dispatch_callback;
   rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
-
-  // rocp_inicialize recursive mutex_context
-  pthread_mutexattr_t Attr;
-  pthread_mutexattr_init(&Attr);
-  pthread_mutexattr_settype(&Attr, PTHREAD_MUTEX_RECURSIVE);
-  pthread_mutex_init(&mutex_context, &Attr);
-
 }
 
 // This is necessary for rocprofiler callback to work
 extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
-  printf("Rocprofiler OnLoadToolProp______________________\n");  
+  printf("Rocprofiler OnLoadToolProp______________________\n");
   rocp_inicialize();
 }
 
 extern PUBLIC_API void OnUnloadTool(){
-  printf("Rocprofiler OnUnloadTool______________________\n");  
+  printf("Rocprofiler OnUnloadTool______________________\n");
   // rocp_inicialize();
 }
 
 extern PUBLIC_API void OnLoad(){
-  printf("Rocprofiler OnLoad______________________\n");  
+  printf("Rocprofiler OnLoad______________________\n");
 }
 
 static void cleanup() {
@@ -484,7 +428,14 @@ static void cleanup() {
 //******************************************************************************
 
 
-void rocprofiler_start_kernel(){
+void
+rocprofiler_start_kernel
+(
+  uint64_t cor
+)
+{
+  rocprofiler_correlation_id = cor;
+  context_callback_finish = 0;
   HPCRUN_ROCPROFILER_CALL(rocprofiler_start_queue_callbacks, ());
 }
 
@@ -500,9 +451,9 @@ rocprofiler_init
  void
 )
 {
-  printf("Rocprofiler INIT\n");  
+  printf("Rocprofiler INIT\n");
   // rocp_inicialize();
-  return; 
+  return;
 }
 
 
@@ -513,7 +464,7 @@ rocprofiler_fini
  int how
 )
 {
-  printf("Rocprofiler FINI\n");  
+  printf("Rocprofiler FINI\n");
   cleanup();
   return;
 }
@@ -570,3 +521,11 @@ rocprofiler_bind
 #endif // ! HPCRUN_STATIC_LINK
 }
 
+void
+rocprofiler_wait_context_callback
+(
+  void
+)
+{
+  while (context_callback_finish == 0) sched_yield(); // flag is set by context_handler1
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
index ad785b037f..426f1fb28a 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -50,14 +50,14 @@
 // interface operations
 //******************************************************************************
 
-void 
+void
 rocprofiler_start_kernel
 (
-  void
+  uint64_t
 );
 
 
-void 
+void
 rocprofiler_stop_kernel
 (
 	void
@@ -85,6 +85,11 @@ rocprofile_bind
   void
 );
 
+void
+rocprofiler_wait_context_callback
+(
+  void
+);
 
 
 #endif
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 2b2de014c0..a09561dd52 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -112,6 +112,11 @@ typedef const char* (*hip_kernel_name_ref_fnt)(const void* hostFunction, hipStre
 static hip_kernel_name_fnt hip_kernel_name_fn;
 static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn;
 
+// If we collect counters for GPU kernels,
+// we will serilize kernel executions.
+// Hopefully, AMD tool support will improve this the future
+static bool collect_counter = true;
+
 //----------------------------------------------------------
 // roctracer function pointers for late binding
 //----------------------------------------------------------
@@ -305,7 +310,8 @@ roctracer_subscriber_callback
   bool is_valid_op = false;
   bool is_kernel_op = false;
   const hip_api_data_t* data = (const hip_api_data_t*)(callback_data);
-  const char* kernel_name = NULL;  
+  const char* kernel_name = NULL;
+  hipStream_t kernel_stream = 0;
 
   switch (callback_id) {
   case HIP_API_ID_hipMemcpy:
@@ -380,6 +386,9 @@ roctracer_subscriber_callback
     is_valid_op = true;
     is_kernel_op = true;
     kernel_name = hip_kernel_name_fn(data->args.hipModuleLaunchKernel.f);
+    if (collect_counter) {
+      kernel_stream = data->args.hipModuleLaunchKernel.stream;
+    }
     break;
   }
   case HIP_API_ID_hipLaunchKernel: {
@@ -391,6 +400,9 @@ roctracer_subscriber_callback
     is_kernel_op = true;
     kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address, 
       data->args.hipLaunchKernel.stream);
+    if (collect_counter) {
+      kernel_stream = data->args.hipLaunchKernel.stream;
+    }
     break;
   }
   case HIP_API_ID_hipCtxSynchronize:
@@ -426,6 +438,10 @@ roctracer_subscriber_callback
 
       cct_node_t *trace_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_trace);
       ensure_kernel_ip_present(trace_ph, kernel_ip);
+
+      if (collect_counter) {
+        rocprofiler_start_kernel(correlation_id);
+      }
     }
 
     hpcrun_safe_exit();
@@ -436,10 +452,13 @@ roctracer_subscriber_callback
     // Generate notification entry
     uint64_t cpu_submit_time = hpcrun_nanotime();
     gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
-
-    rocprofiler_start_kernel();
+    
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
-    rocprofiler_stop_kernel();
+    if (is_kernel_op && collect_counter) {
+      hipStreamSynchronize(kernel_stream);
+      rocprofiler_wait_context_callback();
+      rocprofiler_stop_kernel();
+    }
   }
 }
 
@@ -479,6 +498,9 @@ roctracer_buffer_completion_callback
 )
 {
   hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
+  if (collect_counter) {
+    return;
+  }
   roctracer_buffer_completion_notify();
   roctracer_record_t* record = (roctracer_record_t*)(begin);
   roctracer_record_t* end_record = (roctracer_record_t*)(end);
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 47a6dc4288..d749ea4562 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -636,6 +636,39 @@ gpu_instruction_process
   PRINT("Instruction correlation_id %u\n", correlation_id);
 }
 
+// Attribute a GPU_ACTIVITY_COUNTER record to the kernel placeholder of its host op.
+static void
+gpu_counter_process
+(
+ gpu_activity_t *activity
+)
+{
+  uint32_t correlation_id = activity->details.counters.correlation_id;
+  gpu_correlation_id_map_entry_t *cid_map_entry =
+    gpu_correlation_id_map_lookup(correlation_id);
+  if (cid_map_entry != NULL) {
+    uint64_t external_id =
+      gpu_correlation_id_map_entry_external_id_get(cid_map_entry);
+    gpu_host_correlation_map_entry_t *host_op_entry =
+      gpu_host_correlation_map_lookup(external_id);
+    if (host_op_entry != NULL) {
+      gpu_placeholder_type_t ph = gpu_placeholder_type_kernel;
+      cct_node_t *host_op_node =
+        gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph);
+      assert(host_op_node != NULL);
+      // Attribute the counter values to the kernel placeholder's CCT node
+      attribute_activity(host_op_entry, activity, host_op_node);
+    }
+    gpu_correlation_id_map_delete(correlation_id);
+  } else {
+    PRINT("Counter correlation_id %u cannot be found\n", correlation_id);
+  }
+  PRINT("Counter CorrelationId %u\n", correlation_id);
+  PRINT("Counter cycles %lu\n", activity->details.counters.cycles);
+  PRINT("Counter l2 cache hit %lu\n", activity->details.counters.l2_cache_hit);
+  PRINT("Counter l2 cache miss %lu\n", activity->details.counters.l2_cache_miss);
+}
+
 
 //******************************************************************************
 // interface operations
@@ -713,6 +746,10 @@ gpu_activity_process
     gpu_event_process(ga);
     break;
 
+  case GPU_ACTIVITY_COUNTER:
+    gpu_counter_process(ga);
+    break;
+
   case GPU_ACTIVITY_MEMCPY2:
   default:
     gpu_unknown_process(ga);
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 786c4da451..7534325aff 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -95,7 +95,8 @@ typedef enum {
   GPU_ACTIVITY_EXTERNAL_CORRELATION    = 14,
   GPU_ACTIVITY_EVENT                   = 15,
   GPU_ACTIVITY_FUNCTION                = 16,
-  GPU_ACTIVITY_FLUSH                   = 17
+  GPU_ACTIVITY_FLUSH                   = 17,
+  GPU_ACTIVITY_COUNTER                 = 18
 } gpu_activity_kind_t;
 
 
@@ -346,6 +347,12 @@ typedef struct gpu_host_correlation_t {
   uint64_t host_correlation_id;
 } gpu_host_correlation_t;
 
+typedef struct gpu_counter_t {
+  uint32_t correlation_id;
+  uint64_t cycles;
+  uint64_t l2_cache_hit;
+  uint64_t l2_cache_miss;
+} gpu_counter_t;
 
 // a type that can be used to access start and end times
 // for a subset of activity kinds including kernel execution,
@@ -383,6 +390,7 @@ typedef struct gpu_activity_details_t {
     gpu_synchronization_t synchronization;
     gpu_host_correlation_t correlation;
     gpu_flush_t flush;
+    gpu_counter_t counters;
 
     /* Access short cut for activitiy fields shared by multiple kinds */
 
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index 84e81657fa..a2e98c996a 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -84,7 +84,8 @@
   macro(GPU_INST, 9)  \
   macro(GTIMES, 10)  \
   macro(KINFO, 12)  \
-  macro(GSAMP, 13)
+  macro(GSAMP, 13) \
+  macro(CTR, 3)
 
 
 #define FORALL_METRIC_KINDS(macro)  \
@@ -592,6 +593,27 @@ gpu_metrics_attribute_branch
            b->executed);
 }
 
+// Attribute the hardware counter values carried by a counter activity
+// (cycles, L2 cache hits, L2 cache misses) to the activity's CCT node.
+static void
+gpu_metrics_attribute_counter
+(
+  gpu_activity_t *activity
+)
+{
+  gpu_counter_t * c = &(activity->details.counters);
+  cct_node_t *cct_node = activity->cct_node;
+  // NOTE(review): presumably yields the node's storage for CTR metrics - confirm
+  metric_data_list_t *metrics =
+    hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_CTR_CYCLES));
+
+  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_CTR_CYCLES),
+           c->cycles);
+  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_CTR_L2_CACHE_HIT),
+           c->l2_cache_hit);
+  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_CTR_L2_CACHE_MISS),
+           c->l2_cache_miss);
+}
 
 //******************************************************************************
 // interface operations
@@ -652,6 +674,9 @@ gpu_metrics_attribute
     gpu_metrics_attribute_branch(activity);
     break;
 
+  case GPU_ACTIVITY_COUNTER:
+    gpu_metrics_attribute_counter(activity);
+    break;
   default:
     break;
   }
@@ -898,3 +923,19 @@ gpu_metrics_GPU_INST_STALL_enable
 
   FINALIZE_METRIC_KIND();
 }
+
+void
+gpu_metrics_GPU_CTR_enable
+(
+ void
+)
+{
+#undef CURRENT_METRIC
+#define CURRENT_METRIC CTR
+
+  INITIALIZE_METRIC_KIND();
+
+  FORALL_CTR(INITIALIZE_SCALAR_METRIC_INT);
+
+  FINALIZE_METRIC_KIND();
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 6e05548fb1..a26650641f 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -330,6 +330,14 @@ typedef enum {
   macro("GKER:OCC_THR",               GPU_KINFO_OCCUPANCY_THR,		\
 	"GPU kernel: theoretical occupancy (FGP_ACT / FGP_MAX)")          \
   
+// gpu kernel hardware counter metrics
+#define FORALL_CTR(macro) \
+  macro("GCTR:CYCLES",         GPU_CTR_CYCLES, \
+	"GPU counter : cycles")	\
+  macro("GCTR:L2_CACHE_HIT",         GPU_CTR_L2_CACHE_HIT, \
+	"GPU counter : L2 cache hit")	\
+  macro("GCTR:L2_CACHE_MISS",         GPU_CTR_L2_CACHE_MISS, \
+	"GPU counter : L2 cache miss")	\
 
 // gpu implicit copy
 #define FORALL_GICOPY(macro)					\
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index 3c9d0af6e9..bbf3725750 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -152,6 +152,7 @@ METHOD_FN(process_event_list, int lush_metrics)
     int nevents = (self->evl).nevents;
     gpu_metrics_default_enable();
     hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG);
+    gpu_metrics_GPU_CTR_enable();
     TMSG(CUDA,"nevents = %d", nevents);
 }
 

From 65d0e8ed4bb2c4963146df037cc2a18e3c734664 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Mon, 6 Dec 2021 21:38:24 -0600
Subject: [PATCH 151/177] Enable both roctracer and rocprofiler.

Need to initialize hpcrun memory pool for both roctracer and rocprofiler callbacks
---
 src/tool/hpcrun/fnbounds/fnbounds_dynamic.c | 1 +
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c   | 2 +-
 src/tool/hpcrun/gpu/amd/roctracer-api.c     | 3 ---
 src/tool/hpcrun/loadmap.c                   | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c b/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c
index f863fdaa52..7be609f3f7 100644
--- a/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c
+++ b/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c
@@ -220,6 +220,7 @@ fnbounds_enclosing_addr(void* ip, void** start, void** end, load_module_t** lm)
 load_module_t*
 fnbounds_map_dso(const char *module_name, void *start, void *end, struct dl_phdr_info* info)
 {
+  hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
   dso_info_t *dso = fnbounds_compute(module_name, start, end);
   if (dso) {
     load_module_t* lm = hpcrun_loadmap_map(dso);
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 8846f479ff..b72a2d5243 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -306,7 +306,7 @@ unsigned metrics_input(rocprofiler_feature_t** ret) {
 // Dump and delete the context entry
 // Return true if the context was dumped successfully
 static bool context_handler1(rocprofiler_group_t group, void* arg) {
-
+  hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
   volatile bool valid = counter_data.valid;
   while (!valid) {
     sched_yield();
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index a09561dd52..6a17a57a3a 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -498,9 +498,6 @@ roctracer_buffer_completion_callback
 )
 {
   hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
-  if (collect_counter) {
-    return;
-  }
   roctracer_buffer_completion_notify();
   roctracer_record_t* record = (roctracer_record_t*)(begin);
   roctracer_record_t* end_record = (roctracer_record_t*)(end);
diff --git a/src/tool/hpcrun/loadmap.c b/src/tool/hpcrun/loadmap.c
index 6491d70029..a3b2d463f6 100644
--- a/src/tool/hpcrun/loadmap.c
+++ b/src/tool/hpcrun/loadmap.c
@@ -145,7 +145,7 @@ hpcrun_dso_make(const char* name, void** table,
   TMSG(DSO," hpcrun_dso_make for module %s", name);
 
   int namelen = strlen(name) + 1;
-  x->name = (char*) hpcrun_malloc(namelen);
+  x->name = (char*) malloc(namelen);
   strcpy(x->name, name);
 
   x->table = table;

From d344958e80b8c477d136c38bda541fa3f70f026e Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 17 Dec 2021 21:30:36 -0600
Subject: [PATCH 152/177] Start to integrate rocprofiler 1. add amd-rocprofiler
 sample source to match and display rocprofiler events 2. improve
 initialization sequences, but still more work to be done. Two main   
 initialization places are (1) command line argument parsing and (2)
 OnLoadToolProp    called from librocprofiler64.so 3. Right now, we need to
 provide "-e gpu=amd" for gpu counters as we call    rocprofiler functions
 inside roctracer subscriber callbacks

---
 src/tool/hpcrun/Makefile.am                   |   1 +
 src/tool/hpcrun/Makefile.in                   |  22 +-
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c     | 271 +++++++++++++-----
 src/tool/hpcrun/gpu/amd/rocprofiler-api.h     |  30 ++
 src/tool/hpcrun/gpu/amd/roctracer-api.c       |  10 +-
 src/tool/hpcrun/gpu/amd/roctracer-api.h       |   5 +
 src/tool/hpcrun/gpu/gpu-metrics.h             |  12 +
 .../hpcrun/sample-sources/amd-rocprofiler.c   | 211 ++++++++++++++
 src/tool/hpcrun/sample-sources/amd.c          |  10 -
 src/tool/hpcrun/sample-sources/ss-list.h      |   4 +
 10 files changed, 491 insertions(+), 85 deletions(-)
 create mode 100644 src/tool/hpcrun/sample-sources/amd-rocprofiler.c

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 21fc762469..1c55094e1f 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -542,6 +542,7 @@ endif
 if OPT_ENABLE_ROCM
 MY_ROCM_FILES =\
 	sample-sources/amd.c \
+	sample-sources/amd-rocprofiler.c \
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c \
 	gpu/amd/rocprofiler-api.c \
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 5ec7507f33..cb0e97591b 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -538,10 +538,11 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/nvidia/cupti-activity-translate.c \
 	gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
-	sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \
-	gpu/amd/roctracer-api.c gpu/amd/rocprofiler-api.c \
-	gpu/amd/rocm-debug-api.c gpu/amd/rocm-binary-processing.c \
-	sample-sources/level0.c gpu/level0/level0-api.c \
+	sample-sources/amd.c sample-sources/amd-rocprofiler.c \
+	gpu/amd/roctracer-activity-translate.c gpu/amd/roctracer-api.c \
+	gpu/amd/rocprofiler-api.c gpu/amd/rocm-debug-api.c \
+	gpu/amd/rocm-binary-processing.c sample-sources/level0.c \
+	gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-context-map.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -746,6 +747,7 @@ am__objects_33 = sample-sources/libhpcrun_la-upc.lo
 am__objects_35 =
 @OPT_ENABLE_ROCM_TRUE@am__objects_36 =  \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd.lo \
+@OPT_ENABLE_ROCM_TRUE@	sample-sources/libhpcrun_la-amd-rocprofiler.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocprofiler-api.lo \
@@ -2000,6 +2002,7 @@ MY_AARCH64_FILES = \
 
 @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	sample-sources/amd.c \
+@OPT_ENABLE_ROCM_TRUE@	sample-sources/amd-rocprofiler.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocprofiler-api.c \
@@ -2999,6 +3002,9 @@ sample-sources/libhpcrun_la-upc.lo: sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
 sample-sources/libhpcrun_la-amd.lo: sample-sources/$(am__dirstamp) \
 	sample-sources/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-amd-rocprofiler.lo:  \
+	sample-sources/$(am__dirstamp) \
+	sample-sources/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/$(am__dirstamp):
 	@$(MKDIR_P) gpu/amd
 	@: > gpu/amd/$(am__dirstamp)
@@ -4103,6 +4109,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_ga_wrap_a-ga-overrides.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_io_la-io-over.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_io_wrap_a-io-over.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-amd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-common.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-display.Plo@am__quote@
@@ -5651,6 +5658,13 @@ sample-sources/libhpcrun_la-amd.lo: sample-sources/amd.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd.lo `test -f 'sample-sources/amd.c' || echo '$(srcdir)/'`sample-sources/amd.c
 
+sample-sources/libhpcrun_la-amd-rocprofiler.lo: sample-sources/amd-rocprofiler.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-amd-rocprofiler.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Tpo -c -o sample-sources/libhpcrun_la-amd-rocprofiler.lo `test -f 'sample-sources/amd-rocprofiler.c' || echo '$(srcdir)/'`sample-sources/amd-rocprofiler.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='sample-sources/amd-rocprofiler.c' object='sample-sources/libhpcrun_la-amd-rocprofiler.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd-rocprofiler.lo `test -f 'sample-sources/amd-rocprofiler.c' || echo '$(srcdir)/'`sample-sources/amd-rocprofiler.c
+
 gpu/amd/libhpcrun_la-roctracer-activity-translate.lo: gpu/amd/roctracer-activity-translate.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-roctracer-activity-translate.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo -c -o gpu/amd/libhpcrun_la-roctracer-activity-translate.lo `test -f 'gpu/amd/roctracer-activity-translate.c' || echo '$(srcdir)/'`gpu/amd/roctracer-activity-translate.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index b72a2d5243..6e54893da5 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -155,7 +155,8 @@ Context pool API:
   macro(rocprofiler_set_queue_callbacks) \
   macro(rocprofiler_start_queue_callbacks) \
   macro(rocprofiler_stop_queue_callbacks) \
-  macro(rocprofiler_remove_queue_callbacks)
+  macro(rocprofiler_remove_queue_callbacks) \
+  macro(rocprofiler_iterate_info)
 
 
 
@@ -175,10 +176,6 @@ Context pool API:
   }						\
 }
 
-
-typedef const char* (*hip_kernel_name_fnt)(const hipFunction_t f);
-typedef const char* (*hip_kernel_name_ref_fnt)(const void* hostFunction, hipStream_t stream);
-
 typedef struct {
   bool valid;
   hsa_agent_t agent;
@@ -195,9 +192,19 @@ static hpcrun_amd_counter_data_t counter_data;
 static uint64_t rocprofiler_correlation_id;
 static volatile int context_callback_finish;
 
+static bool rocprofiler_initialized = false;
+
+// total number of counters supported by rocprofiler,
+// an array of their string names, and an array of their description
+static int total_counters = 0;
+static const char** counter_name = NULL;
+static const char** counter_description = NULL;
+
+// per-counter "requested by the user" flags and the feature list built from them
+static int *is_specified_by_user = NULL;
+static int total_requested = 0;
+static rocprofiler_feature_t* rocprofiler_input = NULL;
 
-static hip_kernel_name_fnt hip_kernel_name_fn;
-static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn;
 
 //----------------------------------------------------------
 // rocprofiler function pointers for late binding
@@ -265,6 +272,17 @@ ROCPROFILER_FN
   )
 );
 
+ROCPROFILER_FN
+(
+  rocprofiler_iterate_info,
+  (
+    const hsa_agent_t* agent,			// [in] GPU handle, NULL for all
+                                  // GPU agents
+	  rocprofiler_info_kind_t kind,			// kind of iterated info
+	  hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback
+	  void *data
+  )
+);
 
 //******************************************************************************
 // private operations
@@ -281,31 +299,17 @@ rocprofiler_path
   return path;
 }
 
-
-unsigned metrics_input(rocprofiler_feature_t** ret) {
-  // Profiling feature objects
-  const unsigned feature_count = 4;
-  rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(sizeof(rocprofiler_feature_t) * feature_count);
-  memset(features, 0, feature_count * sizeof(rocprofiler_feature_t));
-
-  // PMC events
-  features[0].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[0].name = "GRBM_COUNT";
-  features[1].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[1].name = "GRBM_GUI_ACTIVE";
-  features[2].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[2].name = "TCC_HIT_sum";
-  features[3].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-  features[3].name = "TCC_MISS_sum";
-
-  *ret = features;
-  return feature_count;
-}
-
 // Profiling completion handler
 // Dump and delete the context entry
 // Return true if the context was dumped successfully
-static bool context_handler1(rocprofiler_group_t group, void* arg) {
+static bool
+rocprofiler_context_handler
+(
+  rocprofiler_group_t group,
+  void* arg
+)
+{
+  printf("Enter rocprofiler_context_handler\n");
   hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
   volatile bool valid = counter_data.valid;
   while (!valid) {
@@ -356,13 +360,14 @@ static bool context_handler1(rocprofiler_group_t group, void* arg) {
 }
 
 static hsa_status_t
-dispatch_callback
+rocprofiler_dispatch_callback
 (
   const rocprofiler_callback_data_t* callback_data,
   void* arg,
   rocprofiler_group_t* group
 ) {
-  printf("Rocprofiler dispatch_callback\n\n");
+  if (total_requested == 0) return HSA_STATUS_SUCCESS;
+  printf("Enter rocprofiler_dispatch_callback\n");
   // Passed tool data
   hsa_agent_t agent = callback_data->agent;
   // HSA status
@@ -370,14 +375,11 @@ dispatch_callback
 
   rocprofiler_t* context = NULL;
   rocprofiler_properties_t properties = {};
-  properties.handler = context_handler1;
+  properties.handler = rocprofiler_context_handler;
   properties.handler_arg = NULL;
 
-  rocprofiler_feature_t *features;
-  unsigned feature_count = metrics_input(&features);
-
   counter_data.valid = false;
-  HPCRUN_ROCPROFILER_CALL(rocprofiler_open, (agent, features, feature_count,
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_open, (agent, rocprofiler_input, total_requested,
                             &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties));
 
 
@@ -395,14 +397,88 @@ dispatch_callback
 }
 
 
-static void rocp_inicialize() {
+static void
+rocp_inicialize
+(
+
+)
+{
   rocprofiler_queue_callbacks_t callbacks_ptrs = {};
-  callbacks_ptrs.dispatch = dispatch_callback;
+  callbacks_ptrs.dispatch = rocprofiler_dispatch_callback;
   rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
 }
 
+static void cleanup() {
+  // Unregister dispatch callback
+  rocprofiler_remove_queue_callbacks();
+}
+
+static hsa_status_t
+total_counter_accumulator
+(
+  const rocprofiler_info_data_t info,
+  void *data
+)
+{
+  total_counters += 1;
+  return HSA_STATUS_SUCCESS;
+}
+
+// rocprofiler_iterate_info callback, second pass: record one counter's name
+// and description. `data` points to an int holding the capacity of the
+// counter_name / counter_description arrays.
+static hsa_status_t
+counter_info_accumulator
+(
+  const rocprofiler_info_data_t info,
+  void *data
+)
+{
+  // Ignore surplus counters instead of writing past the arrays, should this
+  // pass report more entries than the counting pass did.
+  if (total_counters >= *(const int*) data) return HSA_STATUS_SUCCESS;
+  counter_name[total_counters] = strdup(info.metric.name);
+  counter_description[total_counters] = strdup(info.metric.description);
+  total_counters += 1;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Ask rocprofiler for all supported counters; fill counter_name and
+// counter_description and allocate zeroed is_specified_by_user flags.
+static void
+initialize_counter_information
+(
+
+)
+{
+  // First we iterate over all counters to get the total
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_iterate_info,
+    (NULL, ROCPROFILER_INFO_KIND_METRIC, total_counter_accumulator, NULL));
+  int capacity = total_counters;
+  total_counters = 0;
+  // Allocate the information arrays and the per-counter "requested" flags
+  counter_name = (const char**) calloc(capacity, sizeof(const char*));
+  counter_description = (const char**) calloc(capacity, sizeof(const char*));
+  is_specified_by_user = (int*) calloc(capacity, sizeof(int));
+  if (capacity > 0 && !(counter_name && counter_description && is_specified_by_user)) {
+    EEMSG("hpcrun: unable to allocate rocprofiler counter tables\n");
+    return;  // total_counters stays 0, so no counter is reported as available
+  }
+  // Fill in name and description string for each counter
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_iterate_info,
+    (NULL, ROCPROFILER_INFO_KIND_METRIC, counter_info_accumulator, &capacity));
+}
+
+//******************************************************************************
+// AMD hidden interface operations
+//******************************************************************************
+
 // This is necessary for rocprofiler callback to work
 extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
+  // Somehow needs to disable code object tracking
+  // to avoid a deadlock in rocprofiler
+  settings->code_obj_tracking = 0;
+  rocprofiler_init();
   printf("Rocprofiler OnLoadToolProp______________________\n");
   rocp_inicialize();
 }
@@ -416,13 +492,6 @@ extern PUBLIC_API void OnLoad(){
   printf("Rocprofiler OnLoad______________________\n");
 }
 
-static void cleanup() {
-  // Unregister dispatch callback
-  rocprofiler_remove_queue_callbacks();
-}
-
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
@@ -451,8 +520,21 @@ rocprofiler_init
  void
 )
 {
+  if (rocprofiler_initialized) {
+    return;
+  }
+  rocprofiler_initialized = true;
   printf("Rocprofiler INIT\n");
-  // rocp_inicialize();
+
+#ifndef HPCRUN_STATIC_LINK
+  // We usually bind GPU vendor library in finalize_event_list.
+  // But here we must do early binding to query supported list of counters
+  if (rocprofiler_bind() != DYNAMIC_BINDING_STATUS_OK) {
+    EEMSG("hpcrun: unable to bind to AMD rocprofiler library %s\n", dlerror());
+    monitor_real_exit(-1);
+  }
+#endif
+  initialize_counter_information();
   return;
 }
 
@@ -477,23 +559,16 @@ rocprofiler_bind
  void
 )
 {
-//   // This is a workaround for roctracer to not hang when taking timer interrupts
-//   // More details: https://github.com/ROCm-Developer-Tools/roctracer/issues/22
-//   setenv("HSA_ENABLE_INTERRUPT", "0", 1);
-
-  // if (rocm_debug_api_bind() != DYNAMIC_BINDING_STATUS_OK) {
-  //   return DYNAMIC_BINDING_STATUS_ERROR;
-  // }
-
 #ifndef HPCRUN_STATIC_LINK
   // dynamic libraries only availabile in non-static case
   hpcrun_force_dlopen(true);
   CHK_DLOPEN(rocprofiler, rocprofiler_path(), RTLD_NOW | RTLD_GLOBAL);
-  // Somehow roctracter needs libkfdwrapper64.so, but does not really load it.
-  // So, we load it before using any function in roctracter.
-  CHK_DLOPEN(kfd, "libkfdwrapper64.so", RTLD_NOW | RTLD_GLOBAL);
 
-  CHK_DLOPEN(hip, "libamdhip64.so", RTLD_NOW | RTLD_GLOBAL);
+  if (getenv("HPCRUN_LIST_EVENT")) {
+    CHK_DLOPEN(hsa, "libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
+    hsa_init();
+  }
+
   hpcrun_force_dlopen(false);
 
 #define ROCPROFILER_BIND(fn) \
@@ -503,18 +578,6 @@ rocprofiler_bind
 
 #undef ROCPROFILER_BIND
 
-  dlerror();
-  hip_kernel_name_fn = (hip_kernel_name_fnt) dlsym(hip, "hipKernelNameRef");
-  if (hip_kernel_name_fn == 0) {
-    return DYNAMIC_BINDING_STATUS_ERROR;
-  }
-
-  dlerror();
-  hip_kernel_name_ref_fn = (hip_kernel_name_ref_fnt) dlsym(hip, "hipKernelNameRefByPtr");
-  if (hip_kernel_name_ref_fn == 0) {
-    return DYNAMIC_BINDING_STATUS_ERROR;
-  }
-
   return DYNAMIC_BINDING_STATUS_OK;
 #else
   return DYNAMIC_BINDING_STATUS_ERROR;
@@ -529,3 +592,71 @@ rocprofiler_wait_context_callback
 {
   while (context_callback_finish == 0);
 }
+
+int
+rocprofiler_total_counters
+(
+  void
+)
+{
+  return total_counters;
+}
+
+const char*
+rocprofiler_counter_name
+(
+  int idx
+)
+{
+  if (idx < 0 || idx >= total_counters || counter_name == NULL) return NULL;
+  return counter_name[idx];
+}
+
+const char*
+rocprofiler_counter_description
+(
+  int idx
+)
+{
+  if (idx < 0 || idx >= total_counters || counter_description == NULL) return NULL;
+  return counter_description[idx];
+}
+
+int
+rocprofiler_match_event
+(
+  const char* ev_str
+)
+{
+  for (int i = 0; i < total_counters; i++) {
+    if (strcmp(ev_str, counter_name[i]) == 0) {
+      is_specified_by_user[i] = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Build rocprofiler_input, the feature array given to rocprofiler_open, from
+// the counters that rocprofiler_match_event flagged as requested.
+void
+rocprofiler_finalize_event_list
+(
+)
+{
+  int requested = 0;
+  for (int i = 0; i < total_counters; i++) requested += (is_specified_by_user[i] == 1);
+  total_requested = 0;  // stays 0 (dispatch callback returns early) unless the list is built
+  if (requested == 0) return;
+  rocprofiler_input = (rocprofiler_feature_t*) calloc(requested, sizeof(rocprofiler_feature_t));
+  if (rocprofiler_input == NULL) {
+    EEMSG("hpcrun: unable to allocate rocprofiler counter list\n");
+    return;
+  }
+  for (int i = 0; i < total_counters; i++) {
+    if (is_specified_by_user[i] != 1) continue;
+    rocprofiler_input[total_requested].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+    rocprofiler_input[total_requested].name = counter_name[i];
+    total_requested += 1;
+  }
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
index 426f1fb28a..42b50ee2b0 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -91,5 +91,35 @@ rocprofiler_wait_context_callback
   void
 );
 
+int
+rocprofiler_total_counters
+(
+  void
+);
+
+const char*
+rocprofiler_counter_name
+(
+  int
+);
+
+const char*
+rocprofiler_counter_description
+(
+  int
+);
+
+int
+rocprofiler_match_event
+(
+  const char*
+);
+
+void
+rocprofiler_finalize_event_list
+(
+
+);
+
 
 #endif
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 6a17a57a3a..7368f21ee1 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -115,7 +115,7 @@ static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn;
 // If we collect counters for GPU kernels,
 // we will serilize kernel executions.
 // Hopefully, AMD tool support will improve this the future
-static bool collect_counter = true;
+static bool collect_counter = false;
 
 //----------------------------------------------------------
 // roctracer function pointers for late binding
@@ -621,3 +621,11 @@ roctracer_fini
   gpu_application_thread_process_activities();
 }
 
+void
+roctracer_enable_counter_collection
+(
+  void
+)
+{
+  collect_counter = true;
+}
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.h b/src/tool/hpcrun/gpu/amd/roctracer-api.h
index b30d21a8b6..68d1aa618b 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.h
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.h
@@ -71,6 +71,11 @@ roctracer_bind
   void
 );
 
+void
+roctracer_enable_counter_collection
+(
+  void
+);
 
 
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index a26650641f..01952f01a6 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -494,6 +494,18 @@ gpu_metrics_GBR_enable
 );
 
 
+//--------------------------------------------------
+// record GPU hardware counters
+//--------------------------------------------------
+
+void
+gpu_metrics_GPU_CTR_enable
+(
+  void
+);
+
+
+
 //--------------------------------------------------
 // attribute GPU measurements to an application 
 // thread's calling context tree
diff --git a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
new file mode 100644
index 0000000000..6158ddd2fe
--- /dev/null
+++ b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
@@ -0,0 +1,211 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include <alloca.h>
+#include <assert.h>
+#include <ctype.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ucontext.h>
+#include <stdbool.h>
+
+#include <pthread.h>
+
+#ifndef HPCRUN_STATIC_LINK
+#include <dlfcn.h>
+#endif
+
+
+
+//******************************************************************************
+// libmonitor
+//******************************************************************************
+
+#include <monitor.h>
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "amd.h"
+
+#include "libdl.h"
+
+#include "simple_oo.h"
+#include "sample_source_obj.h"
+#include "common.h"
+
+#include <hpcrun/control-knob.h>
+#include <hpcrun/device-finalizers.h>
+#include <hpcrun/gpu/amd/roctracer-api.h>
+#include <hpcrun/gpu/amd/rocprofiler-api.h>
+#include <hpcrun/gpu/gpu-activity.h>
+#include <hpcrun/gpu/gpu-metrics.h>
+#include <hpcrun/gpu/gpu-trace.h>
+#include <hpcrun/hpcrun_options.h>
+#include <hpcrun/hpcrun_stats.h>
+#include <hpcrun/metrics.h>
+#include <hpcrun/module-ignore-map.h>
+#include <hpcrun/ompt/ompt-interface.h>
+#include <hpcrun/safe-sampling.h>
+#include <hpcrun/sample_sources_registered.h>
+#include <hpcrun/sample_event.h>
+#include <hpcrun/thread_data.h>
+#include <hpcrun/trace.h>
+
+#include <utilities/tokenize.h>
+#include <messages/messages.h>
+#include <lush/lush-backtrace.h>
+#include <lib/prof-lean/hpcrun-fmt.h>
+
+#include <roctracer_hip.h>
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define AMD_ROCPROFILER_PREFIX "rocprof"
+
+static device_finalizer_fn_entry_t device_finalizer_rocprofiler_shutdown;
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+static void
+METHOD_FN(init)
+{
+    self->state = INIT;
+}
+
+
+static void
+METHOD_FN(thread_init)
+{
+    TMSG(CUDA, "thread_init");
+}
+
+
+static void
+METHOD_FN(thread_init_action)
+{
+    TMSG(CUDA, "thread_init_action");
+}
+
+
+static void
+METHOD_FN(start)
+{
+    TMSG(CUDA, "start");
+    TD_GET(ss_state)[self->sel_idx] = START;
+}
+
+
+static void
+METHOD_FN(thread_fini_action)
+{
+    TMSG(CUDA, "thread_fini_action");
+}
+
+
+static void
+METHOD_FN(stop)
+{
+  hpcrun_get_thread_data();
+  TD_GET(ss_state)[self->sel_idx] = STOP;
+}
+
+
+static void
+METHOD_FN(shutdown)
+{
+  self->state = UNINIT;
+}
+
+
+// Return true iff ev_str is the rocprofiler prefix, a separator, and the
+// name of a counter rocprofiler supports (which is then marked as requested).
+static bool
+METHOD_FN(supports_event, const char *ev_str)
+{
+#ifndef HPCRUN_STATIC_LINK
+  if (!hpcrun_ev_is(ev_str, AMD_ROCPROFILER_PREFIX)) return false;
+  // sizeof counts the NUL: shorter strings cannot hold a counter name
+  if (strlen(ev_str) < sizeof(AMD_ROCPROFILER_PREFIX)) return false;
+  rocprofiler_init();  // only for our own events: a failed bind exits the process
+  const char* roc_str = ev_str + sizeof(AMD_ROCPROFILER_PREFIX);
+  while (*roc_str == ':') roc_str++;
+  if (*roc_str == 0) return false;
+  return rocprofiler_match_event(roc_str) != 0;
+#else
+  return false;
+#endif
+}
+
+static void
+METHOD_FN(process_event_list, int lush_metrics)
+{
+  int nevents = (self->evl).nevents;
+  gpu_metrics_GPU_CTR_enable();
+  TMSG(CUDA,"nevents = %d", nevents);
+}
+
+static void
+METHOD_FN(finalize_event_list)
+{
+  // After going through all command line arguments,
+  // we call this function to generate a list of counters
+  // in rocprofiler's format
+  rocprofiler_finalize_event_list();
+
+  device_finalizer_rocprofiler_shutdown.fn = rocprofiler_fini;
+  device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_rocprofiler_shutdown);
+
+  // Inform roctracer component that we will collect hardware counters,
+  // which will serialize kernel launches
+  roctracer_enable_counter_collection();
+}
+
+
+static void
+METHOD_FN(gen_event_set,int lush_metrics)
+{
+
+}
+
+
+// Print each supported counter as <prefix>::<name>, a form supports_event parses.
+static void
+METHOD_FN(display_events)
+{
+  // We need to query rocprofiler to get a list of supported rocprofiler counters
+  rocprofiler_init();
+  int total_counters = rocprofiler_total_counters();
+  printf("===========================================================================\n");
+  printf("Available AMD GPU hardware counter events\n");
+  printf("===========================================================================\n");
+  printf("Name\t\tDescription\n");
+  printf("---------------------------------------------------------------------------\n");
+  for (int i = 0; i < total_counters; ++i) {
+    printf("%s::%s\t\t%s\n", AMD_ROCPROFILER_PREFIX, rocprofiler_counter_name(i), rocprofiler_counter_description(i));
+  }
+  printf("\n");
+}
+
+
+
+//**************************************************************************
+// object
+//**************************************************************************
+
+#define ss_name amd_rocprof
+#define ss_cls SS_HARDWARE
+
+#include "ss_obj.h"
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index bbf3725750..a6273d3fa3 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -152,7 +152,6 @@ METHOD_FN(process_event_list, int lush_metrics)
     int nevents = (self->evl).nevents;
     gpu_metrics_default_enable();
     hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG);
-    gpu_metrics_GPU_CTR_enable();
     TMSG(CUDA,"nevents = %d", nevents);
 }
 
@@ -164,10 +163,6 @@ METHOD_FN(finalize_event_list)
     EEMSG("hpcrun: unable to bind to AMD roctracer library %s\n", dlerror());
     monitor_real_exit(-1);
   }
-  if (rocprofiler_bind() != DYNAMIC_BINDING_STATUS_OK) {
-    EEMSG("hpcrun: unable to bind to AMD rocprofiler library %s\n", dlerror());
-    monitor_real_exit(-1);
-  }
 #endif
 
 #if 0
@@ -177,7 +172,6 @@ METHOD_FN(finalize_event_list)
   char* event = start_tok(evlist);
 #endif
     roctracer_init();
-    rocprofiler_init();
 
     // Init records
     gpu_trace_init();
@@ -185,10 +179,6 @@ METHOD_FN(finalize_event_list)
     device_finalizer_roctracer_shutdown.fn = roctracer_fini;
     device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_roctracer_shutdown);
 
-    device_finalizer_rocprofiler_shutdown.fn = rocprofiler_fini;
-    device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_rocprofiler_shutdown);
-
-
     // Register shutdown functions to write trace files
     device_trace_finalizer_shutdown.fn = gpu_trace_fini;
     device_finalizer_register(device_finalizer_type_shutdown, &device_trace_finalizer_shutdown);
diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index 50f6724000..3a329c1719 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -96,6 +96,10 @@ SAMPLE_SOURCE_DECL_MACRO(nvidia_gpu)
 SAMPLE_SOURCE_DECL_MACRO(amd_gpu)
 #endif
 
+#ifdef HPCRUN_SS_AMD
+SAMPLE_SOURCE_DECL_MACRO(amd_rocprof)
+#endif
+
 #ifdef HPCRUN_SS_LEVEL0
 SAMPLE_SOURCE_DECL_MACRO(level0)
 #endif

From e6857db1b76d29bf134cc1b8fa10845c6459ef6f Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 17 Dec 2021 21:53:49 -0600
Subject: [PATCH 153/177] Set environment variables needed for rocprofiler in
 hpcrun script

---
 Makefile.in                                      | 1 +
 configure                                        | 2 ++
 configure.ac                                     | 1 +
 doc/Makefile.in                                  | 1 +
 doc/man/Makefile.in                              | 1 +
 doc/manual/Makefile.in                           | 1 +
 doc/www/Makefile.in                              | 1 +
 lib/Makefile.in                                  | 1 +
 src/Makefile.in                                  | 1 +
 src/extern/Makefile.in                           | 1 +
 src/extern/libunwind/Makefile.in                 | 1 +
 src/extern/lzma/Makefile.in                      | 1 +
 src/lib/Makefile.in                              | 1 +
 src/lib/analysis/Makefile.in                     | 1 +
 src/lib/banal/Makefile.in                        | 1 +
 src/lib/binutils/Makefile.in                     | 1 +
 src/lib/isa/Makefile.in                          | 1 +
 src/lib/prof-lean/Makefile.in                    | 1 +
 src/lib/prof/Makefile.in                         | 1 +
 src/lib/profile/Makefile.in                      | 1 +
 src/lib/profxml/Makefile.in                      | 1 +
 src/lib/stubs-gcc_s/Makefile.in                  | 1 +
 src/lib/support-lean/Makefile.in                 | 1 +
 src/lib/support/Makefile.in                      | 1 +
 src/lib/xml/Makefile.in                          | 1 +
 src/tool/Makefile.in                             | 1 +
 src/tool/hpcfnbounds/Makefile.in                 | 1 +
 src/tool/hpcfnbounds2/Makefile.in                | 1 +
 src/tool/hpclump/Makefile.in                     | 1 +
 src/tool/hpcprof-flat/Makefile.in                | 1 +
 src/tool/hpcprof-mpi/Makefile.in                 | 1 +
 src/tool/hpcprof/Makefile.in                     | 1 +
 src/tool/hpcprof2-mpi/Makefile.in                | 1 +
 src/tool/hpcprof2/Makefile.in                    | 1 +
 src/tool/hpcproftt/Makefile.in                   | 1 +
 src/tool/hpcrun-flat/Makefile.in                 | 1 +
 src/tool/hpcrun/Makefile.in                      | 1 +
 src/tool/hpcrun/sample-sources/amd-rocprofiler.c | 2 +-
 src/tool/hpcrun/scripts/hpcrun.in                | 6 ++++++
 src/tool/hpcrun/utilities/bgq-cnk/Makefile.in    | 1 +
 src/tool/hpcserver/Makefile.in                   | 1 +
 src/tool/hpcserver/mpi/Makefile.in               | 1 +
 src/tool/hpcstruct/Makefile.in                   | 1 +
 src/tool/hpctracedump/Makefile.in                | 1 +
 src/tool/misc/Makefile.in                        | 1 +
 src/tool/xprof/Makefile.in                       | 1 +
 46 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/Makefile.in b/Makefile.in
index 9166db4530..da05a502d1 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -379,6 +379,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/configure b/configure
index d95189361b..c27fb61548 100755
--- a/configure
+++ b/configure
@@ -651,6 +651,7 @@ OPT_LEVEL0_IFLAGS
 OPT_LEVEL0
 OPT_ENABLE_LEVEL0_FALSE
 OPT_ENABLE_LEVEL0_TRUE
+ROCM_PROFILER_LD_DIR
 OPT_ROCM_LD_LIB_PATH
 OPT_ROCM_IFLAGS
 OPT_ENABLE_ROCM_FALSE
@@ -24800,6 +24801,7 @@ fi
 
 
 
+
 #-------------------------------------------------
 # Option: --with-level0=PATH
 #-------------------------------------------------
diff --git a/configure.ac b/configure.ac
index 40edba48b9..36ad2cf0b4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5508,6 +5508,7 @@ AM_CONDITIONAL([OPT_ENABLE_ROCM], [test "$OPT_HAVE_ROCM" = yes])
 
 AC_SUBST([OPT_ROCM_IFLAGS])
 AC_SUBST([OPT_ROCM_LD_LIB_PATH])
+AC_SUBST([ROCM_PROFILER_LD_DIR])
 
 
 #-------------------------------------------------
diff --git a/doc/Makefile.in b/doc/Makefile.in
index c8196892ef..25c469b6a1 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -391,6 +391,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in
index 67cb362dc1..0f8c905292 100644
--- a/doc/man/Makefile.in
+++ b/doc/man/Makefile.in
@@ -334,6 +334,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/doc/manual/Makefile.in b/doc/manual/Makefile.in
index a804f017b1..45cc21b02b 100644
--- a/doc/manual/Makefile.in
+++ b/doc/manual/Makefile.in
@@ -331,6 +331,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/doc/www/Makefile.in b/doc/www/Makefile.in
index 6a3e8c3784..8bd3d67822 100644
--- a/doc/www/Makefile.in
+++ b/doc/www/Makefile.in
@@ -331,6 +331,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 92e797fb7c..889321db78 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -330,6 +330,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/Makefile.in b/src/Makefile.in
index 168fc19cc0..da16ec4ea6 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -360,6 +360,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/extern/Makefile.in b/src/extern/Makefile.in
index 6487c6533c..09226a39c3 100644
--- a/src/extern/Makefile.in
+++ b/src/extern/Makefile.in
@@ -369,6 +369,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/extern/libunwind/Makefile.in b/src/extern/libunwind/Makefile.in
index 07e5213148..38c2943f21 100644
--- a/src/extern/libunwind/Makefile.in
+++ b/src/extern/libunwind/Makefile.in
@@ -304,6 +304,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/extern/lzma/Makefile.in b/src/extern/lzma/Makefile.in
index c8649688a4..608d832360 100644
--- a/src/extern/lzma/Makefile.in
+++ b/src/extern/lzma/Makefile.in
@@ -304,6 +304,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/Makefile.in b/src/lib/Makefile.in
index e3627010c1..27e6214845 100644
--- a/src/lib/Makefile.in
+++ b/src/lib/Makefile.in
@@ -373,6 +373,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/analysis/Makefile.in b/src/lib/analysis/Makefile.in
index 56c6d6f61f..1e1bfa1e32 100644
--- a/src/lib/analysis/Makefile.in
+++ b/src/lib/analysis/Makefile.in
@@ -408,6 +408,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index 6398fc3e67..69481d5394 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -404,6 +404,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index ea27efd04c..09ede4c05a 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -424,6 +424,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/isa/Makefile.in b/src/lib/isa/Makefile.in
index 3faa223f42..3ea046f353 100644
--- a/src/lib/isa/Makefile.in
+++ b/src/lib/isa/Makefile.in
@@ -401,6 +401,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/prof-lean/Makefile.in b/src/lib/prof-lean/Makefile.in
index 72deb89952..087e84cf49 100644
--- a/src/lib/prof-lean/Makefile.in
+++ b/src/lib/prof-lean/Makefile.in
@@ -406,6 +406,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/prof/Makefile.in b/src/lib/prof/Makefile.in
index 16af4067ad..d7d4edf428 100644
--- a/src/lib/prof/Makefile.in
+++ b/src/lib/prof/Makefile.in
@@ -412,6 +412,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/profile/Makefile.in b/src/lib/profile/Makefile.in
index a1f68e6195..5d6e878e70 100644
--- a/src/lib/profile/Makefile.in
+++ b/src/lib/profile/Makefile.in
@@ -443,6 +443,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/profxml/Makefile.in b/src/lib/profxml/Makefile.in
index d7285ecb6c..90915b63e8 100644
--- a/src/lib/profxml/Makefile.in
+++ b/src/lib/profxml/Makefile.in
@@ -406,6 +406,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/stubs-gcc_s/Makefile.in b/src/lib/stubs-gcc_s/Makefile.in
index 9c780b3dca..99d6a5e683 100644
--- a/src/lib/stubs-gcc_s/Makefile.in
+++ b/src/lib/stubs-gcc_s/Makefile.in
@@ -384,6 +384,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/support-lean/Makefile.in b/src/lib/support-lean/Makefile.in
index 35ffa8c0a6..dae900c413 100644
--- a/src/lib/support-lean/Makefile.in
+++ b/src/lib/support-lean/Makefile.in
@@ -390,6 +390,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/support/Makefile.in b/src/lib/support/Makefile.in
index d4f1edf599..706a9b6352 100644
--- a/src/lib/support/Makefile.in
+++ b/src/lib/support/Makefile.in
@@ -418,6 +418,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/lib/xml/Makefile.in b/src/lib/xml/Makefile.in
index 50c80235a4..db7cca1826 100644
--- a/src/lib/xml/Makefile.in
+++ b/src/lib/xml/Makefile.in
@@ -403,6 +403,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/Makefile.in b/src/tool/Makefile.in
index ae0baf8570..457c790acd 100644
--- a/src/tool/Makefile.in
+++ b/src/tool/Makefile.in
@@ -378,6 +378,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcfnbounds/Makefile.in b/src/tool/hpcfnbounds/Makefile.in
index 04543be278..74e1247c53 100644
--- a/src/tool/hpcfnbounds/Makefile.in
+++ b/src/tool/hpcfnbounds/Makefile.in
@@ -487,6 +487,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcfnbounds2/Makefile.in b/src/tool/hpcfnbounds2/Makefile.in
index 3b727fb413..06a67e3a01 100644
--- a/src/tool/hpcfnbounds2/Makefile.in
+++ b/src/tool/hpcfnbounds2/Makefile.in
@@ -385,6 +385,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpclump/Makefile.in b/src/tool/hpclump/Makefile.in
index 8c2f7b688b..3987ae7cc2 100644
--- a/src/tool/hpclump/Makefile.in
+++ b/src/tool/hpclump/Makefile.in
@@ -419,6 +419,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcprof-flat/Makefile.in b/src/tool/hpcprof-flat/Makefile.in
index fbc6ce1017..1ad084ea82 100644
--- a/src/tool/hpcprof-flat/Makefile.in
+++ b/src/tool/hpcprof-flat/Makefile.in
@@ -453,6 +453,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcprof-mpi/Makefile.in b/src/tool/hpcprof-mpi/Makefile.in
index a9d6ecaa97..b301f19ce2 100644
--- a/src/tool/hpcprof-mpi/Makefile.in
+++ b/src/tool/hpcprof-mpi/Makefile.in
@@ -453,6 +453,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcprof/Makefile.in b/src/tool/hpcprof/Makefile.in
index f675a9e44f..2722593378 100644
--- a/src/tool/hpcprof/Makefile.in
+++ b/src/tool/hpcprof/Makefile.in
@@ -451,6 +451,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcprof2-mpi/Makefile.in b/src/tool/hpcprof2-mpi/Makefile.in
index 96836cd011..d984122c9d 100644
--- a/src/tool/hpcprof2-mpi/Makefile.in
+++ b/src/tool/hpcprof2-mpi/Makefile.in
@@ -396,6 +396,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcprof2/Makefile.in b/src/tool/hpcprof2/Makefile.in
index b40dfeef89..43265c0839 100644
--- a/src/tool/hpcprof2/Makefile.in
+++ b/src/tool/hpcprof2/Makefile.in
@@ -388,6 +388,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcproftt/Makefile.in b/src/tool/hpcproftt/Makefile.in
index 4c305011e7..7ba4e9d71d 100644
--- a/src/tool/hpcproftt/Makefile.in
+++ b/src/tool/hpcproftt/Makefile.in
@@ -454,6 +454,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcrun-flat/Makefile.in b/src/tool/hpcrun-flat/Makefile.in
index 10f0c788b8..456921baa3 100644
--- a/src/tool/hpcrun-flat/Makefile.in
+++ b/src/tool/hpcrun-flat/Makefile.in
@@ -448,6 +448,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index cb0e97591b..25274b6fe2 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -1620,6 +1620,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
index 6158ddd2fe..ffde2fe06a 100644
--- a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
+++ b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
@@ -194,7 +194,7 @@ METHOD_FN(display_events)
   printf("Name\t\tDescription\n");
   printf("---------------------------------------------------------------------------\n");
   for (int i = 0; i < total_counters; ++i) {
-    printf("%s%s\t\t%s\n", AMD_ROCPROFILER_PREFIX, rocprofiler_counter_name(i), rocprofiler_counter_description(i));
+    printf("%s::%s\t\t%s\n", AMD_ROCPROFILER_PREFIX, rocprofiler_counter_name(i), rocprofiler_counter_description(i));
   }
   printf("\n");
 }
diff --git a/src/tool/hpcrun/scripts/hpcrun.in b/src/tool/hpcrun/scripts/hpcrun.in
index 491e2b20ab..b09ba5d497 100644
--- a/src/tool/hpcrun/scripts/hpcrun.in
+++ b/src/tool/hpcrun/scripts/hpcrun.in
@@ -375,6 +375,9 @@ do
 		CPU_GPU_IDLE* ) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_gpu.so" ;;
 		MPI* )     preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_mpi.so" ;;
 		gpu=amd) roctracer_libdir="${roctracer_lib_path}"
+             export HSA_TOOLS_LIB=librocprofiler64.so.1
+             export ROCP_TOOL_LIB=libhpcrun.so
+             export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml
 			 export HIP_ENABLE_DEFERRED_LOADING=0;;
 		gpu=opencl)	 preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_opencl.so" ;;
 
@@ -395,6 +398,9 @@ do
 	    ;;
 
 	-L | -l | --list-events )
+        export HSA_TOOLS_LIB=librocprofiler64.so.1
+        export ROCP_TOOL_LIB=libhpcrun.so
+        export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml
 	    export HPCRUN_EVENT_LIST=LIST
 		export HPCRUN_LIST_EVENT=1
 	    ;;
diff --git a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
index 7b714659b0..7278f63517 100644
--- a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
+++ b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
@@ -349,6 +349,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcserver/Makefile.in b/src/tool/hpcserver/Makefile.in
index ea164abf7f..96ae329a18 100644
--- a/src/tool/hpcserver/Makefile.in
+++ b/src/tool/hpcserver/Makefile.in
@@ -403,6 +403,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcserver/mpi/Makefile.in b/src/tool/hpcserver/mpi/Makefile.in
index d4fc024c72..a7671b4d95 100644
--- a/src/tool/hpcserver/mpi/Makefile.in
+++ b/src/tool/hpcserver/mpi/Makefile.in
@@ -411,6 +411,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpcstruct/Makefile.in b/src/tool/hpcstruct/Makefile.in
index 49ad90c9ad..742adf6423 100644
--- a/src/tool/hpcstruct/Makefile.in
+++ b/src/tool/hpcstruct/Makefile.in
@@ -439,6 +439,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/hpctracedump/Makefile.in b/src/tool/hpctracedump/Makefile.in
index 46cf02cc36..39cce4cf73 100644
--- a/src/tool/hpctracedump/Makefile.in
+++ b/src/tool/hpctracedump/Makefile.in
@@ -389,6 +389,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/misc/Makefile.in b/src/tool/misc/Makefile.in
index acb75a41da..643dd4f8d2 100644
--- a/src/tool/misc/Makefile.in
+++ b/src/tool/misc/Makefile.in
@@ -344,6 +344,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
diff --git a/src/tool/xprof/Makefile.in b/src/tool/xprof/Makefile.in
index 6e11068ad1..9f481f4b15 100644
--- a/src/tool/xprof/Makefile.in
+++ b/src/tool/xprof/Makefile.in
@@ -410,6 +410,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@

From e7024b7e4b477599933fe9cf74cb6586edbeaea1 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Sat, 18 Dec 2021 10:07:47 -0600
Subject: [PATCH 154/177] Some code cleanup and add environment variable
 HPCRUN_PRINT_ROCPROFILER_COUNTER_DETAILS to print rocprofiler counter details

---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 147 ++++++++++++----------
 1 file changed, 80 insertions(+), 67 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 6e54893da5..cd45613922 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -156,7 +156,9 @@ Context pool API:
   macro(rocprofiler_start_queue_callbacks) \
   macro(rocprofiler_stop_queue_callbacks) \
   macro(rocprofiler_remove_queue_callbacks) \
-  macro(rocprofiler_iterate_info)
+  macro(rocprofiler_iterate_info) \
+  macro(rocprofiler_group_get_data) \
+  macro(rocprofiler_get_group)
 
 
 
@@ -187,7 +189,9 @@ typedef struct {
 // local variables
 //******************************************************************************
 
-// Currently we serialize kernel execution when collecting counters
+// Currently we serialize kernel execution when collecting counters.
+// So we have one global correlation id, counter data storage,
+// and one variable indicating whether the processing is finished or not
 static hpcrun_amd_counter_data_t counter_data;
 static uint64_t rocprofiler_correlation_id;
 static volatile int context_callback_finish;
@@ -227,14 +231,14 @@ ROCPROFILER_FN
 (
   rocprofiler_close,
   (
-	  rocprofiler_t* context		// [in] profiling context
+    rocprofiler_t* context		// [in] profiling context
   )
 );
 
 ROCPROFILER_FN
 (
   rocprofiler_get_metrics,
-	(
+  (
     rocprofiler_t* context		// [in/out] profiling context
   )
 );
@@ -278,9 +282,29 @@ ROCPROFILER_FN
   (
     const hsa_agent_t* agent,			// [in] GPU handle, NULL for all
                                   // GPU agents
-	  rocprofiler_info_kind_t kind,			// kind of iterated info
-	  hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback
-	  void *data
+    rocprofiler_info_kind_t kind,			// kind of iterated info
+    hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback
+    void *data
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_group_get_data,
+  (
+    rocprofiler_group_t* group // [in/out] profiling group
+  )
+);
+
+ROCPROFILER_FN
+(
+  rocprofiler_get_group,
+  (
+    rocprofiler_t* context,			  // [in/out] profiling context,
+                                  //  will be returned as
+                                  //  a part of the group structure
+    uint32_t index,				        // [in] group index
+    rocprofiler_group_t* group		// [out] profiling group
   )
 );
 
@@ -299,6 +323,32 @@ rocprofiler_path
   return path;
 }
 
+static void
+translate_rocprofiler_output
+(
+  gpu_activity_t* ga
+)
+{
+  // Translate counter results stored in rocprofiler_feature_t
+  // to hpcrun's gpu_activity_t data structure
+  rocprofiler_feature_t** features = counter_data.group.features;
+  unsigned feature_count = counter_data.group.feature_count;
+
+  ga->kind = GPU_ACTIVITY_COUNTER;
+  ga->details.counters.correlation_id = rocprofiler_correlation_id;
+
+  for (unsigned i = 0; i < feature_count; ++i) {
+    const rocprofiler_feature_t* p = features[i];
+    if (strcmp(p->name, "GRBM_COUNT") == 0) {
+      ga->details.counters.cycles = p->data.result_int64;
+    } else if (strstr(p->name, "TCC_HIT") != NULL) {
+      ga->details.counters.l2_cache_hit += p->data.result_int64;
+    } else if (strstr(p->name, "TCC_MISS") != NULL) {
+      ga->details.counters.l2_cache_miss += p->data.result_int64;
+    }
+  }
+}
+
 // Profiling completion handler
 // Dump and delete the context entry
 // Return true if the context was dumped successfully
@@ -309,7 +359,6 @@ rocprofiler_context_handler
   void* arg
 )
 {
-  printf("Enter rocprofiler_context_handler\n");
   hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
   volatile bool valid = counter_data.valid;
   while (!valid) {
@@ -317,16 +366,11 @@ rocprofiler_context_handler
     valid = counter_data.valid;
   }
 
-  rocprofiler_feature_t** features = counter_data.group.features;
-  unsigned feature_count = counter_data.group.feature_count;
-
-
   if (counter_data.group.context == NULL) {
     EMSG("error: AMD group->context = NULL");
   }
-  if (feature_count > 0) {
-    //HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (group));
-    rocprofiler_group_get_data(&counter_data.group);
+  if (counter_data.group.feature_count > 0) {
+    HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (&counter_data.group));
     HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (counter_data.group.context));
   }
 
@@ -336,19 +380,7 @@ rocprofiler_context_handler
   memset(&ga, 0, sizeof(gpu_activity_t));
   cstack_ptr_set(&(ga.next), 0);
 
-  ga.kind = GPU_ACTIVITY_COUNTER;
-  ga.details.counters.correlation_id = rocprofiler_correlation_id;
-
-  for (unsigned i = 0; i < feature_count; ++i) {
-    const rocprofiler_feature_t* p = features[i];
-    if (strcmp(p->name, "GRBM_COUNT") == 0) {
-      ga.details.counters.cycles = p->data.result_int64;
-    } else if (strstr(p->name, "TCC_HIT") != NULL) {
-      ga.details.counters.l2_cache_hit += p->data.result_int64;
-    } else if (strstr(p->name, "TCC_MISS") != NULL) {
-      ga.details.counters.l2_cache_miss += p->data.result_int64;
-    }
-  }
+  translate_rocprofiler_output(&ga);
 
   if (gpu_correlation_id_map_lookup(rocprofiler_correlation_id) == NULL) {
     gpu_correlation_id_map_insert(rocprofiler_correlation_id, rocprofiler_correlation_id);
@@ -367,7 +399,7 @@ rocprofiler_dispatch_callback
   rocprofiler_group_t* group
 ) {
   if (total_requested == 0) return HSA_STATUS_SUCCESS;
-  printf("Enter rocprofiler_dispatch_callback\n");
+
   // Passed tool data
   hsa_agent_t agent = callback_data->agent;
   // HSA status
@@ -384,8 +416,7 @@ rocprofiler_dispatch_callback
 
 
   // Get group[0]
-  //HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group));
-  rocprofiler_get_group(context, 0, group);
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group));
 
   // Fill profiling context entry
   counter_data.agent = agent;
@@ -396,23 +427,6 @@ rocprofiler_dispatch_callback
   return HSA_STATUS_SUCCESS;
 }
 
-
-static void
-rocp_inicialize
-(
-
-)
-{
-  rocprofiler_queue_callbacks_t callbacks_ptrs = {};
-  callbacks_ptrs.dispatch = rocprofiler_dispatch_callback;
-  rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
-}
-
-static void cleanup() {
-  // Unregister dispatch callback
-  rocprofiler_remove_queue_callbacks();
-}
-
 static hsa_status_t
 total_counter_accumulator
 (
@@ -431,14 +445,14 @@ counter_info_accumulator
   void *data
 )
 {
-  /*
-  fprintf(stderr, "Enter counter_info_accumulator\n");
-  fprintf(stderr, "\tname %s\n", info.metric.name);
-  fprintf(stderr, "\tinstances %d\n", info.metric.instances);
-  fprintf(stderr, "\texpr %s\n", info.metric.expr);
-  fprintf(stderr, "\tblock name %s\n", info.metric.block_name);
-  fprintf(stderr, "\tblock_counters %d\n", info.metric.block_counters);
-  */
+  if (getenv("HPCRUN_PRINT_ROCPROFILER_COUNTER_DETAILS")) {
+    printf("Enter counter_info_accumulator\n");
+    printf("\tname %s\n", info.metric.name);
+    printf("\tinstances %d\n", info.metric.instances);
+    printf("\texpr %s\n", info.metric.expr);
+    printf("\tblock name %s\n", info.metric.block_name);
+    printf("\tblock_counters %d\n", info.metric.block_counters);
+  }
   counter_name[total_counters] = strdup(info.metric.name);
   counter_description[total_counters] = strdup(info.metric.description);
   total_counters += 1;
@@ -478,18 +492,17 @@ extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
   // Somehow needs to disable code object tracking
   // to avoid a deadlock in rocprofiler
   settings->code_obj_tracking = 0;
+
   rocprofiler_init();
-  printf("Rocprofiler OnLoadToolProp______________________\n");
-  rocp_inicialize();
-}
 
-extern PUBLIC_API void OnUnloadTool(){
-  printf("Rocprofiler OnUnloadTool______________________\n");
-  // rocp_inicialize();
+  rocprofiler_queue_callbacks_t callbacks_ptrs = {};
+  callbacks_ptrs.dispatch = rocprofiler_dispatch_callback;
+  rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
 }
 
-extern PUBLIC_API void OnLoad(){
-  printf("Rocprofiler OnLoad______________________\n");
+extern PUBLIC_API void OnUnloadTool() {
+  // Must be provided. Otherwise rocprofiler
+  // will refuse to work
 }
 
 //******************************************************************************
@@ -523,8 +536,9 @@ rocprofiler_init
   if (rocprofiler_initialized) {
     return;
   }
+  // Ensure librocprofiler64.so is loaded
+  // and initialize all rocprofiler API function pointers
   rocprofiler_initialized = true;
-  printf("Rocprofiler INIT\n");
 
 #ifndef HPCRUN_STATIC_LINK
   // We usually bind GPU vendor library in finalize_event_list.
@@ -546,8 +560,7 @@ rocprofiler_fini
  int how
 )
 {
-  printf("Rocprofiler FINI\n");
-  cleanup();
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_remove_queue_callbacks, ());
   return;
 }
 

From 30031ce59dc7775255d567a5376db56b2282a745 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Wed, 22 Dec 2021 10:40:05 -0600
Subject: [PATCH 155/177] Refactoring code for roctracer and rocprofiler
 regarding thread-safety:

1. Every application thread will have two pairs of correlation and activity
   channels: one pair for roctracer and the other for rocprofiler.
2. The host correlation id map is changed from global to thread-local, as the
   roctracer thread and the rocprofiler thread should handle correlation independently.
---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c     | 20 ++++----
 src/tool/hpcrun/gpu/amd/rocprofiler-api.h     |  5 ++
 src/tool/hpcrun/gpu/amd/roctracer-api.c       | 24 ++++++----
 src/tool/hpcrun/gpu/gpu-activity-channel.c    | 30 ++++++++++--
 src/tool/hpcrun/gpu/gpu-activity-channel.h    | 15 ++++++
 src/tool/hpcrun/gpu/gpu-channel-common.h      | 24 ++++++++++
 .../hpcrun/gpu/gpu-correlation-channel-set.c  | 19 ++++----
 .../hpcrun/gpu/gpu-correlation-channel-set.h  |  7 +--
 src/tool/hpcrun/gpu/gpu-correlation-channel.c | 48 +++++++++++--------
 src/tool/hpcrun/gpu/gpu-correlation-channel.h | 15 ++++--
 src/tool/hpcrun/gpu/gpu-correlation-id-map.c  |  4 +-
 .../hpcrun/gpu/gpu-host-correlation-map.c     |  6 +--
 .../hpcrun/gpu/gpu-monitoring-thread-api.c    | 11 ++++-
 .../hpcrun/gpu/gpu-monitoring-thread-api.h    |  7 +++
 14 files changed, 173 insertions(+), 62 deletions(-)
 create mode 100644 src/tool/hpcrun/gpu/gpu-channel-common.h

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index cd45613922..5c7990c714 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -366,6 +366,9 @@ rocprofiler_context_handler
     valid = counter_data.valid;
   }
 
+  // Consume the correlation channel for rocprofiler
+  gpu_monitoring_thread_activities_ready_with_idx(ROCPROFILER_CHANNEL_IDX);
+
   if (counter_data.group.context == NULL) {
     EMSG("error: AMD group->context = NULL");
   }
@@ -374,8 +377,6 @@ rocprofiler_context_handler
     HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (counter_data.group.context));
   }
 
-  gpu_monitoring_thread_activities_ready();
-
   gpu_activity_t ga;
   memset(&ga, 0, sizeof(gpu_activity_t));
   cstack_ptr_set(&(ga.next), 0);
@@ -497,7 +498,7 @@ extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
 
   rocprofiler_queue_callbacks_t callbacks_ptrs = {};
   callbacks_ptrs.dispatch = rocprofiler_dispatch_callback;
-  rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL);
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_set_queue_callbacks, (callbacks_ptrs, NULL));
 }
 
 extern PUBLIC_API void OnUnloadTool() {
@@ -576,12 +577,6 @@ rocprofiler_bind
   // dynamic libraries only availabile in non-static case
   hpcrun_force_dlopen(true);
   CHK_DLOPEN(rocprofiler, rocprofiler_path(), RTLD_NOW | RTLD_GLOBAL);
-
-  if (getenv("HPCRUN_LIST_EVENT")) {
-    CHK_DLOPEN(hsa, "libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
-    hsa_init();
-  }
-
   hpcrun_force_dlopen(false);
 
 #define ROCPROFILER_BIND(fn) \
@@ -591,6 +586,13 @@ rocprofiler_bind
 
 #undef ROCPROFILER_BIND
 
+  hpcrun_force_dlopen(true);
+  if (getenv("HPCRUN_LIST_EVENT")) {
+    CHK_DLOPEN(hsa, "libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
+    hsa_init();
+  }
+  hpcrun_force_dlopen(false);
+
   return DYNAMIC_BINDING_STATUS_OK;
 #else
   return DYNAMIC_BINDING_STATUS_ERROR;
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
index 42b50ee2b0..c1a612e2ea 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -44,7 +44,12 @@
 #ifndef rocprofiler_api_h
 #define rocprofiler_api_h
 
+//******************************************************************************
+// macro definitions
+//******************************************************************************
 
+#define ROCTRACER_CHANNEL_IDX 0
+#define ROCPROFILER_CHANNEL_IDX 1
 
 //******************************************************************************
 // interface operations
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 7368f21ee1..9b74a6fa67 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -82,8 +82,7 @@
   macro(roctracer_enable_domain_activity_expl) \
   macro(roctracer_disable_domain_callback) \
   macro(roctracer_disable_domain_activity) \
-  macro(roctracer_set_properties) 
-
+  macro(roctracer_set_properties)
 
 #define ROCTRACER_FN_NAME(f) DYN_FN_NAME(f)
 
@@ -398,7 +397,7 @@ roctracer_subscriber_callback
 				 gpu_placeholder_type_trace);
     is_valid_op = true;
     is_kernel_op = true;
-    kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address, 
+    kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address,
       data->args.hipLaunchKernel.stream);
     if (collect_counter) {
       kernel_stream = data->args.hipLaunchKernel.stream;
@@ -423,6 +422,7 @@ roctracer_subscriber_callback
 
   if (data->phase == ACTIVITY_API_PHASE_ENTER) {
     uint64_t correlation_id = data->correlation_id;
+    uint64_t rocprofiler_correlation_id = 0;
     cct_node_t *api_node =
       gpu_application_thread_correlation_callback(correlation_id);
 
@@ -440,19 +440,25 @@ roctracer_subscriber_callback
       ensure_kernel_ip_present(trace_ph, kernel_ip);
 
       if (collect_counter) {
-        rocprofiler_start_kernel(correlation_id);
+        rocprofiler_correlation_id = correlation_id; // | 0x800000000000LL;
+        rocprofiler_start_kernel(rocprofiler_correlation_id);
       }
     }
 
     hpcrun_safe_exit();
 
-
-    gpu_activity_channel_consume(gpu_metrics_attribute);
+    gpu_activity_channel_consume_with_idx(ROCTRACER_CHANNEL_IDX, gpu_metrics_attribute);
+    if (collect_counter) {
+      gpu_activity_channel_consume_with_idx(ROCPROFILER_CHANNEL_IDX, gpu_metrics_attribute);
+    }
 
     // Generate notification entry
     uint64_t cpu_submit_time = hpcrun_nanotime();
-    gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
-    
+    gpu_correlation_channel_produce_with_idx(ROCTRACER_CHANNEL_IDX, correlation_id, &gpu_op_ccts, cpu_submit_time);
+    if (collect_counter && is_kernel_op && kernel_name != NULL) {
+      gpu_correlation_channel_produce_with_idx(ROCPROFILER_CHANNEL_IDX, rocprofiler_correlation_id, &gpu_op_ccts, cpu_submit_time);
+    }
+
   }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
     if (is_kernel_op && collect_counter) {
       hipStreamSynchronize(kernel_stream);
@@ -469,7 +475,7 @@ roctracer_buffer_completion_notify
   void
 )
 {
-  gpu_monitoring_thread_activities_ready();
+  gpu_monitoring_thread_activities_ready_with_idx(ROCTRACER_CHANNEL_IDX);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity-channel.c b/src/tool/hpcrun/gpu/gpu-activity-channel.c
index 12d1e0a755..b1386b6b01 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-channel.c
@@ -50,6 +50,7 @@
 #include "gpu-activity.h"
 #include "gpu-activity-channel.h"
 #include "gpu-channel-item-allocator.h"
+#include "gpu-channel-common.h"
 
 
 //******************************************************************************
@@ -96,7 +97,7 @@ typedef struct gpu_activity_channel_t {
 // local data
 //******************************************************************************
 
-static __thread gpu_activity_channel_t *gpu_activity_channel = NULL;
+static __thread gpu_activity_channel_t *gpu_activity_channels[GPU_CHANNEL_TOTAL];
 
 
 
@@ -134,11 +135,20 @@ gpu_activity_channel_get
  void
 )
 {
-  if (gpu_activity_channel == NULL) {
-    gpu_activity_channel = gpu_activity_channel_alloc();
+  return gpu_activity_channel_get_with_idx(0);
+}
+
+gpu_activity_channel_t *
+gpu_activity_channel_get_with_idx
+(
+ int idx
+)
+{
+  if (gpu_activity_channels[idx] == NULL) {
+    gpu_activity_channels[idx] = gpu_activity_channel_alloc();
   }
 
-  return gpu_activity_channel;
+  return gpu_activity_channels[idx];
 }
 
 
@@ -164,7 +174,17 @@ gpu_activity_channel_consume
  gpu_activity_attribute_fn_t aa_fn
 )
 {
-  gpu_activity_channel_t *channel = gpu_activity_channel_get();
+  return gpu_activity_channel_consume_with_idx(0, aa_fn);
+}
+
+void
+gpu_activity_channel_consume_with_idx
+(
+ int idx,
+ gpu_activity_attribute_fn_t aa_fn
+)
+{
+  gpu_activity_channel_t *channel = gpu_activity_channel_get_with_idx(idx);
 
   // steal elements previously enqueued by the producer
   channel_steal(channel, bichannel_direction_forward);
diff --git a/src/tool/hpcrun/gpu/gpu-activity-channel.h b/src/tool/hpcrun/gpu/gpu-activity-channel.h
index 4565b797a6..e9a994c0e6 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-activity-channel.h
@@ -51,6 +51,7 @@
 #include <lib/prof-lean/bichannel.h>
 
 #include "gpu-activity.h"
+#include "gpu-channel-common.h"
 
 
 //******************************************************************************
@@ -74,6 +75,13 @@ gpu_activity_channel_get
 );
 
 
+gpu_activity_channel_t *
+gpu_activity_channel_get_with_idx
+(
+ int
+);
+
+
 void
 gpu_activity_channel_produce
 (
@@ -89,5 +97,12 @@ gpu_activity_channel_consume
 );
 
 
+void
+gpu_activity_channel_consume_with_idx
+(
+ int idx,
+ gpu_activity_attribute_fn_t aa_fn
+);
+
 
 #endif
diff --git a/src/tool/hpcrun/gpu/gpu-channel-common.h b/src/tool/hpcrun/gpu/gpu-channel-common.h
new file mode 100644
index 0000000000..24869acc92
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-channel-common.h
@@ -0,0 +1,24 @@
+#ifndef GPU_CHANNEL_COMMON_H
+#define GPU_CHANNEL_COMMON_H
+
+// GPU_CHANNEL_TOTAL specifies the total number
+// of correlation and activity channels an application
+// thread will create.
+// This is created for supporting AMD GPUs,
+// where roctracer and rocprofiler will each create
+// one monitoring thread.
+// As the implementation of the channel is one-proceduer-one-consumer,
+// we need an array of correlation and 
+// activity channel for each application thread.
+// For platforms where there is just one monitoring
+// thread, such as NVIDIA, the implementation maintains
+// backward compatibility, where we will just use
+// the first channel pair.
+// Implementation wise, channel operations without _with_idx suffix
+// represent old operations and will use channel 0
+// Channel operations with _with_idx suffix requires a channel
+// index to specify which channel to operate with
+
+#define GPU_CHANNEL_TOTAL 2
+
+#endif
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c
index f7f2d95a93..5557818cac 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c
@@ -51,6 +51,7 @@
 
 #include "gpu-correlation-channel.h"
 #include "gpu-correlation-channel-set.h"
+#include "gpu-channel-common.h"
 
 
 
@@ -99,7 +100,7 @@ typed_stack_declare_type(gpu_correlation_channel_ptr_t);
 
 static 
 typed_stack_elem_ptr(gpu_correlation_channel_ptr_t) 
-gpu_correlation_channel_stack;
+gpu_correlation_channel_stacks[GPU_CHANNEL_TOTAL];
 
 
 
@@ -128,12 +129,13 @@ channel_forone
 
 
 static void
-gpu_correlation_channel_set_forall
+gpu_correlation_channel_set_forall_with_idx
 (
+ int idx,
  gpu_correlation_channel_fn_t channel_fn
 )
 {
-  channel_stack_forall(&gpu_correlation_channel_stack, channel_forone, 
+  channel_stack_forall(&gpu_correlation_channel_stacks[idx], channel_forone,
 		       channel_fn);
 }
 
@@ -143,8 +145,9 @@ gpu_correlation_channel_set_forall
 //******************************************************************************
 
 void
-gpu_correlation_channel_set_insert
+gpu_correlation_channel_set_insert_with_idx
 (
+ int idx,
  gpu_correlation_channel_t *channel
 )
 {
@@ -157,15 +160,15 @@ gpu_correlation_channel_set_insert
   channel_stack_elem_ptr_set(e, 0); // clear the entry's next ptr
 
   // add the entry to the channel stack
-  channel_stack_push(&gpu_correlation_channel_stack, e);
+  channel_stack_push(&gpu_correlation_channel_stacks[idx], e);
 }
 
 
 void
-gpu_correlation_channel_set_consume
+gpu_correlation_channel_set_consume_with_idx
 (
- void
+ int idx
 )
 {
-  gpu_correlation_channel_set_forall(gpu_correlation_channel_consume);
+  gpu_correlation_channel_set_forall_with_idx(idx, gpu_correlation_channel_consume);
 }
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h
index 5eac5a7d5a..091ba7394c 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h
@@ -70,16 +70,17 @@ typedef void (*gpu_correlation_channel_fn_t)
 //******************************************************************************
 
 void
-gpu_correlation_channel_set_insert
+gpu_correlation_channel_set_insert_with_idx
 (
+ int idx,
  gpu_correlation_channel_t *channel
 );
 
 
 void
-gpu_correlation_channel_set_consume
+gpu_correlation_channel_set_consume_with_idx
 (
- void
+ int idx
 );
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel.c b/src/tool/hpcrun/gpu/gpu-correlation-channel.c
index 47a8345554..cf855b1c54 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel.c
@@ -71,7 +71,7 @@
 #define typed_bichannel(x) gpu_correlation_channel_t
 #define typed_stack_elem(x) gpu_correlation_t
 
-// define macros that simplify use of correlation channel API 
+// define macros that simplify use of correlation channel API
 #define channel_init  \
   typed_bichannel_init(gpu_correlation_t)
 
@@ -100,7 +100,7 @@ typedef struct gpu_correlation_channel_t {
 // local data
 //******************************************************************************
 
-static __thread gpu_correlation_channel_t *gpu_correlation_channel = NULL;
+static __thread gpu_correlation_channel_t *gpu_correlation_channels[GPU_CHANNEL_TOTAL];
 
 
 
@@ -113,37 +113,35 @@ typed_bichannel_impl(gpu_correlation_t)
 
 
 static gpu_correlation_channel_t *
-gpu_correlation_channel_alloc
+gpu_correlation_channel_alloc_with_idx
 (
- void
+ int idx
 )
 {
-  gpu_correlation_channel_t *c = 
+  gpu_correlation_channel_t *c =
     hpcrun_malloc_safe(sizeof(gpu_correlation_channel_t));
 
   channel_init(c);
 
-  gpu_correlation_channel_set_insert(c);
+  gpu_correlation_channel_set_insert_with_idx(idx, c);
 
   return c;
 }
 
 
 static gpu_correlation_channel_t *
-gpu_correlation_channel_get
+gpu_correlation_channel_get_with_idx
 (
- void
+ int idx
 )
 {
-  if (gpu_correlation_channel == NULL) {
-    gpu_correlation_channel = gpu_correlation_channel_alloc();
+  if (gpu_correlation_channels[idx] == NULL) {
+    gpu_correlation_channels[idx] = gpu_correlation_channel_alloc_with_idx(idx);
   }
 
-  return gpu_correlation_channel;
+  return gpu_correlation_channels[idx];
 }
 
-
-
 //******************************************************************************
 // interface functions
 //******************************************************************************
@@ -156,8 +154,21 @@ gpu_correlation_channel_produce
  uint64_t cpu_submit_time
 )
 {
-  gpu_correlation_channel_t *corr_channel = gpu_correlation_channel_get();
-  gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+  // Relaying parameters with index 0
+  gpu_correlation_channel_produce_with_idx(0, host_correlation_id, gpu_op_ccts, cpu_submit_time);
+}
+
+void
+gpu_correlation_channel_produce_with_idx
+(
+ int idx,
+ uint64_t host_correlation_id,
+ gpu_op_ccts_t *gpu_op_ccts,
+ uint64_t cpu_submit_time
+)
+{
+  gpu_correlation_channel_t *corr_channel = gpu_correlation_channel_get_with_idx(idx);
+  gpu_activity_channel_t *activity_channel = gpu_activity_channel_get_with_idx(idx);
 
   gpu_correlation_t *c = gpu_correlation_alloc(corr_channel);
 
@@ -167,7 +178,6 @@ gpu_correlation_channel_produce
   channel_push(corr_channel, bichannel_direction_forward, c);
 }
 
-
 void
 gpu_correlation_channel_consume
 (
@@ -204,7 +214,7 @@ gpu_correlation_channel_consume
 void *hpcrun_malloc_safe
 (
  size_t s
-) 
+)
 {
   return malloc(s);
 }
@@ -214,7 +224,7 @@ gpu_activity_channel_t *
 gpu_activity_channel_get
 (
  void
-) 
+)
 {
   return (gpu_activity_channel_t *) 0x5000;
 }
@@ -223,7 +233,7 @@ gpu_activity_channel_get
 int
 main
 (
- int argc, 
+ int argc,
  char **argv
 )
 {
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel.h b/src/tool/hpcrun/gpu/gpu-correlation-channel.h
index 33fcc0185e..5e321d6730 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel.h
@@ -50,8 +50,7 @@
 //******************************************************************************
 
 #include "gpu-correlation.h"
-
-
+#include "gpu-channel-common.h"
 
 //******************************************************************************
 // type declarations
@@ -67,7 +66,7 @@ typedef struct gpu_op_ccts_t gpu_op_ccts_t;
 // interface operations 
 //******************************************************************************
 
-// produce into a channel that my thread created
+// produce into the first channel that my thread created
 void
 gpu_correlation_channel_produce
 (
@@ -76,6 +75,16 @@ gpu_correlation_channel_produce
  uint64_t cpu_submit_time
 );
 
+// produce into a specified channel (with idx) that my thread created
+// when idx == 0, this function is equivalent to gpu_correlation_channel_produce
+void
+gpu_correlation_channel_produce_with_idx
+(
+ int idx,
+ uint64_t host_correlation_id,
+ gpu_op_ccts_t *gpu_ccts,
+ uint64_t cpu_submit_time
+);
 
 // consume from a channel that another thread created
 void
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
index 90ba4a0470..e840fcab14 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
@@ -119,9 +119,9 @@ typedef struct typed_splay_node(correlation_id) {
 // local data
 //******************************************************************************
 
-static gpu_correlation_id_map_entry_t *map_root = NULL;
+static __thread gpu_correlation_id_map_entry_t *map_root = NULL;
 
-static gpu_correlation_id_map_entry_t *free_list = NULL;
+static __thread gpu_correlation_id_map_entry_t *free_list = NULL;
 
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
index fdd8edb583..bb5d5518d1 100644
--- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
+++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
@@ -125,11 +125,11 @@ typedef struct typed_splay_node(host_correlation) {
 // local data
 //******************************************************************************
 
-static gpu_host_correlation_map_entry_t *map_root = NULL;
+static __thread gpu_host_correlation_map_entry_t *map_root = NULL;
 
-static gpu_host_correlation_map_entry_t *free_list = NULL;
+static __thread gpu_host_correlation_map_entry_t *free_list = NULL;
 
-static bool allow_replace = false;
+static __thread bool allow_replace = false;
 
 //******************************************************************************
 // private operations
diff --git a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c
index 1c4a937374..361262069b 100644
--- a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c
+++ b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c
@@ -60,6 +60,15 @@ gpu_monitoring_thread_activities_ready
  void
 )
 {
-  gpu_correlation_channel_set_consume();
+  gpu_correlation_channel_set_consume_with_idx(0);
+}
+
+void
+gpu_monitoring_thread_activities_ready_with_idx
+(
+ int idx
+)
+{
+  gpu_correlation_channel_set_consume_with_idx(idx);
 }
 
diff --git a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h
index 881667601e..c3d02d4c82 100644
--- a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h
+++ b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h
@@ -57,5 +57,12 @@ gpu_monitoring_thread_activities_ready
 );
 
 
+void
+gpu_monitoring_thread_activities_ready_with_idx
+(
+ int idx
+);
+
+
 
 #endif

From 00725498ac44170a25180f7d501d90aeff5476f4 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Wed, 22 Dec 2021 11:15:36 -0600
Subject: [PATCH 156/177] Ensure rocprofiler support serializes GPU kernel
 launches

---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 89 +++++------------------
 1 file changed, 18 insertions(+), 71 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 5c7990c714..5035853e0b 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -66,7 +66,7 @@
 
 #include <hpcrun/utilities/hpcrun-nanotime.h>
 
-// #include <lib/prof-lean/stdatomic.h>
+#include <lib/prof-lean/spinlock.h>
 #include <pthread.h>
 
 #define DEBUG 0
@@ -79,75 +79,6 @@
 
 #define PUBLIC_API __attribute__((visibility("default")))
 
-#if 0
-Returned API status:
-- hsa_status_t - HSA status codes are used from hsa.h header
-
-Loading and Configuring, loadable plugin on-load/unload methods:
-- rocprofiler_settings_t – global properties
-- OnLoadTool
-- OnLoadToolProp
-- OnUnloadTool
-
-Info API:
-- rocprofiler_info_kind_t - profiling info kind
-- rocprofiler_info_query_t - profiling info query
-- rocprofiler_info_data_t - profiling info data
-- rocprofiler_get_info - return the info for a given info kind
-- rocprofiler_iterote_inf_ - iterate over the info for a given info kind
-- rocprofiler_query_info - iterate over the info for a given info query
-
-Context API:
-- rocprofiler_t - profiling context handle
-- rocprofiler_feature_kind_t - profiling feature kind
-- rocprofiler_feature_parameter_t - profiling feature parameter
-- rocprofiler_data_kind_t - profiling data kind
-- rocprofiler_data_t - profiling data
-- rocprofiler_feature_t - profiling feature
-- rocprofiler_mode_t - profiling modes
-- rocprofiler_properties_t - profiler properties
-- rocprofiler_open - open new profiling context
-- rocprofiler_close - close profiling context and release all allocated resources
-- rocprofiler_group_count - return profiling groups count
-- rocprofiler_get_group - return profiling group for a given index
-- rocprofiler_get_metrics - method for calculating the metrics data
-- rocprofiler_iterate_trace_data - method for iterating output trace data instances
-- rocprofiler_time_id_t - supported time value ID enumeration
-- rocprofiler_get_time – return time for a given time ID and profiling timestamp value
-
-Sampling API:
-- rocprofiler_start - start profiling
-- rocprofiler_stop - stop profiling
-- rocprofiler_read - read profiling data to the profiling features objects
-- rocprofiler_get_data - wait for profiling data
-  Group versions of start/stop/read/get_data methods:
-  o rocprofiler_group_start
-  o rocprofiler_group_stop
-  o rocprofiler_group_read
-  o rocprofiler_group_get_data
-
-Intercepting API:
-- rocprofiler_callback_t - profiling callback type
-- rocprofiler_callback_data_t - profiling callback data type
-- rocprofiler_dispatch_record_t – dispatch record
-- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy
-- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks
-- rocprofiler_remove_queue_callbacks - remove queue callbacks
-
-Context pool API:
-- rocprofiler_pool_t – context pool handle
-- rocprofiler_pool_entry_t – context pool entry
-- rocprofiler_pool_properties_t – context pool properties
-- rocprofiler_pool_handler_t – context pool completion handler
-- rocprofiler_pool_open - context pool open
-- rocprofiler_pool_close - context pool close
-- rocprofiler_pool_fetch – fetch and empty context entry to pool
-- rocprofiler_pool_release – release a context entry
-- rocprofiler_pool_iterate – iterated fetched context entries
-- rocprofiler_pool_flush – flush completed context entries
-#endif
-
-
 #define FORALL_ROCPROFILER_ROUTINES(macro)			\
   macro(rocprofiler_open)   \
   macro(rocprofiler_close)   \
@@ -204,11 +135,13 @@ static int total_counters = 0;
 static const char** counter_name = NULL;
 static const char** counter_description = NULL;
 
-//
+// the list of counters specified at the command line
 static int *is_specified_by_user = NULL;
 static int total_requested = 0;
 static rocprofiler_feature_t* rocprofiler_input = NULL;
 
+// A spin lock to serialize GPU kernels
+static spinlock_t kernel_lock;
 
 //----------------------------------------------------------
 // rocprofiler function pointers for late binding
@@ -360,6 +293,10 @@ rocprofiler_context_handler
 )
 {
   hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
+
+  // This wait-loop is taken from rocprofiler example.
+  // It is strange that the rocprofiler thread will have to
+  // wait for subscriber callback to finish.
   volatile bool valid = counter_data.valid;
   while (!valid) {
     sched_yield();
@@ -517,7 +454,10 @@ rocprofiler_start_kernel
   uint64_t cor
 )
 {
+  spinlock_lock(&kernel_lock);
   rocprofiler_correlation_id = cor;
+  // We will only allow the critical section
+  // to finish after we get rocprofiler results
   context_callback_finish = 0;
   HPCRUN_ROCPROFILER_CALL(rocprofiler_start_queue_callbacks, ());
 }
@@ -525,6 +465,7 @@ rocprofiler_start_kernel
 
 void rocprofiler_stop_kernel(){
   HPCRUN_ROCPROFILER_CALL(rocprofiler_stop_queue_callbacks, ());
+  spinlock_unlock(&kernel_lock);
 }
 
 
@@ -550,6 +491,9 @@ rocprofiler_init
   }
 #endif
   initialize_counter_information();
+
+  // Initialize the spin lock used to serialize GPU kernel launches
+  spinlock_init(&kernel_lock);
   return;
 }
 
@@ -605,6 +549,9 @@ rocprofiler_wait_context_callback
   void
 )
 {
+  // The rocprofiler monitoring thread will set
+  // context_callback_finish to 1 after it finishes processing
+  // rocprofiler data
   while (context_callback_finish == 0);
 }
 

From d5b08b62ef9d93caadab0e2780e782b8697a81be Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Wed, 5 Jan 2022 09:47:05 -0600
Subject: [PATCH 157/177] Refactor gpu_activity_t creation and processing to
 dynamically allocate metric entries for AMD GPU hardware counters, which may
 have hundreds of different counters

---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c     | 39 ++++++++++-------
 src/tool/hpcrun/gpu/amd/roctracer-api.c       |  2 +-
 src/tool/hpcrun/gpu/gpu-activity-process.c    | 13 +++++-
 src/tool/hpcrun/gpu/gpu-activity.h            |  9 ++--
 src/tool/hpcrun/gpu/gpu-metrics.c             | 42 +++++++++++--------
 src/tool/hpcrun/gpu/gpu-metrics.h             | 17 ++++----
 .../hpcrun/sample-sources/amd-rocprofiler.c   |  4 +-
 7 files changed, 76 insertions(+), 50 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 5035853e0b..74adce8197 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -139,6 +139,8 @@ static const char** counter_description = NULL;
 static int *is_specified_by_user = NULL;
 static int total_requested = 0;
 static rocprofiler_feature_t* rocprofiler_input = NULL;
+static const char** requested_counter_name = NULL;
+static const char** requested_counter_description = NULL;
 
 // A spin lock to serialize GPU kernels
 static spinlock_t kernel_lock;
@@ -269,16 +271,19 @@ translate_rocprofiler_output
 
   ga->kind = GPU_ACTIVITY_COUNTER;
   ga->details.counters.correlation_id = rocprofiler_correlation_id;
+  ga->details.counters.total_counters = feature_count;
 
+  // This function should be called by rocprofiler thread,
+  // which is not monitored. So, this function will not be called
+  // inside a signal handler and we can call malloc.
+  // The memory is freed when we attribute this gpu_activity_t.
+  ga->details.counters.values = (uint64_t*) malloc(sizeof(uint64_t) * feature_count);
+
+  // rocprofiler should pass metric results in the same order
+  // that we pass metrics as input to rocprofiler
   for (unsigned i = 0; i < feature_count; ++i) {
     const rocprofiler_feature_t* p = features[i];
-    if (strcmp(p->name, "GRBM_COUNT") == 0) {
-      ga->details.counters.cycles = p->data.result_int64;
-    } else if (strstr(p->name, "TCC_HIT") != NULL) {
-      ga->details.counters.l2_cache_hit += p->data.result_int64;
-    } else if (strstr(p->name, "TCC_MISS") != NULL) {
-      ga->details.counters.l2_cache_miss += p->data.result_int64;
-    }
+    ga->details.counters.values[i] = p->data.result_int64;
   }
 }
 
@@ -303,9 +308,6 @@ rocprofiler_context_handler
     valid = counter_data.valid;
   }
 
-  // Consume the correlation channel for rocprofiler
-  gpu_monitoring_thread_activities_ready_with_idx(ROCPROFILER_CHANNEL_IDX);
-
   if (counter_data.group.context == NULL) {
     EMSG("error: AMD group->context = NULL");
   }
@@ -320,6 +322,8 @@ rocprofiler_context_handler
 
   translate_rocprofiler_output(&ga);
 
+  // Consume the correlation channel for rocprofiler
+  gpu_monitoring_thread_activities_ready_with_idx(ROCPROFILER_CHANNEL_IDX);
   if (gpu_correlation_id_map_lookup(rocprofiler_correlation_id) == NULL) {
     gpu_correlation_id_map_insert(rocprofiler_correlation_id, rocprofiler_correlation_id);
   }
@@ -613,12 +617,19 @@ rocprofiler_finalize_event_list
   rocprofiler_input = (rocprofiler_feature_t*) malloc(sizeof(rocprofiler_feature_t) * total_requested);
   memset(rocprofiler_input, 0, total_requested * sizeof(rocprofiler_feature_t));
 
-  total_requested = 0;
+  requested_counter_name = (const char**) malloc(sizeof(const char*) * total_requested);
+  requested_counter_description = (const char**) malloc(sizeof(const char*) * total_requested);
+
+  int cur_id = 0;
   for (int i = 0; i < total_counters; i++) {
     if (is_specified_by_user[i] == 1) {
-      rocprofiler_input[total_requested].kind = ROCPROFILER_FEATURE_KIND_METRIC;
-      rocprofiler_input[total_requested].name = counter_name[i];
-      total_requested += 1;
+      rocprofiler_input[cur_id].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+      rocprofiler_input[cur_id].name = counter_name[i];
+      requested_counter_name[cur_id] = counter_name[i];
+      requested_counter_description[cur_id] = counter_description[i];
+      cur_id += 1;
     }
   }
+
+  gpu_metrics_GPU_CTR_enable(total_requested, requested_counter_name, requested_counter_description);
 }
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 9b74a6fa67..c98074489e 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -440,7 +440,7 @@ roctracer_subscriber_callback
       ensure_kernel_ip_present(trace_ph, kernel_ip);
 
       if (collect_counter) {
-        rocprofiler_correlation_id = correlation_id; // | 0x800000000000LL;
+        rocprofiler_correlation_id = correlation_id;
         rocprofiler_start_kernel(rocprofiler_correlation_id);
       }
     }
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index d749ea4562..3d9c2819fb 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -655,9 +655,18 @@ gpu_counter_process
       cct_node_t *host_op_node =
         gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph);
       assert(host_op_node != NULL);
+
+      cct_node_t *func_node = hpcrun_cct_children(host_op_node); // only child
+      cct_node_t *kernel_node;
+      if (func_node == NULL) {
+        kernel_node = host_op_node;
+      } else {
+        cct_addr_t *addr = hpcrun_cct_addr(func_node);
+        kernel_node = hpcrun_cct_insert_ip_norm(host_op_node, addr->ip_norm);
+      }
       // Memory allocation does not always happen on the device
       // Do not send it to trace channels
-      attribute_activity(host_op_entry, activity, host_op_node);
+      attribute_activity(host_op_entry, activity, kernel_node);
     }
     gpu_correlation_id_map_delete(correlation_id);
   } else {
@@ -666,7 +675,7 @@ gpu_counter_process
   PRINT("Counter CorrelationId %u\n", correlation_id);
   PRINT("Counter cycles %lu\n", activity->details.counters.cycles);
   PRINT("Counter l2 cache hit %lu\n", activity->details.counters.l2_cache_hit);
-  PRINT("Counter l2 cache miss %lu\n", activity->details.counters.l2._cache_miss);
+  PRINT("Counter l2 cache miss %lu\n", activity->details.counters.l2_cache_miss);
 }
 
 
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 7534325aff..1eea4c9fde 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -349,9 +349,12 @@ typedef struct gpu_host_correlation_t {
 
 typedef struct gpu_counter_t {
   uint32_t correlation_id;
-  uint64_t cycles;
-  uint64_t l2_cache_hit;
-  uint64_t l2_cache_miss;
+  int total_counters;
+  // The function that creates the structure should
+  // be responsible for allocating memory.
+  // The function that attributes the structure should
+  // be responsible for deallocating the memory.
+  uint64_t* values;
 } gpu_counter_t;
 
 // a type that can be used to access start and end times
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index a2e98c996a..7b7d924da1 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -204,11 +204,14 @@ name ## _metric_kind
 // local variables 
 //*****************************************************************************
 
-FORALL_METRIC_KINDS(INITIALIZE_METRIC_KINDS)
+FORALL_METRIC_KINDS(INITIALIZE_METRIC_KINDS);
 
-FORALL_INDEXED_METRIC_KINDS(INITIALIZE_INDEXED_METRIC)
+FORALL_INDEXED_METRIC_KINDS(INITIALIZE_INDEXED_METRIC);
 
-FORALL_SCALAR_METRIC_KINDS(INITIALIZE_SCALAR_METRIC_KIND)
+FORALL_SCALAR_METRIC_KINDS(INITIALIZE_SCALAR_METRIC_KIND);
+
+static kind_info_t* GPU_COUNTER_METRIC_KIND_INFO = NULL;
+static int* gpu_counter_hpcrun_metric_id_array = NULL;
 
 static const unsigned int MAX_CHAR_FORMULA = 32;
 
@@ -603,16 +606,13 @@ gpu_metrics_attribute_counter
   cct_node_t *cct_node = activity->cct_node;
 
   metric_data_list_t *metrics =
-    hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_CTR_CYCLES));
-
-  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_CTR_CYCLES),
-           c->cycles);
+    hpcrun_reify_metric_set(cct_node,gpu_counter_hpcrun_metric_id_array[0]);
 
-  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_CTR_L2_CACHE_HIT),
-           c->l2_cache_hit);
+  for (int i = 0; i < c->total_counters; ++i) {
+    gpu_metrics_attribute_metric_int(metrics, gpu_counter_hpcrun_metric_id_array[i], c->values[i]);
+  }
 
-  gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_CTR_L2_CACHE_MISS),
-           c->l2_cache_miss);
+  free(c->values);
 }
 
 //******************************************************************************
@@ -927,15 +927,21 @@ gpu_metrics_GPU_INST_STALL_enable
 void
 gpu_metrics_GPU_CTR_enable
 (
- void
+  int total,
+  const char** counter_name,
+  const char** counter_desc
 )
 {
-#undef CURRENT_METRIC
-#define CURRENT_METRIC CTR
+  gpu_counter_hpcrun_metric_id_array = (int*) malloc(sizeof(int) * total);
 
-  INITIALIZE_METRIC_KIND();
+  GPU_COUNTER_METRIC_KIND_INFO = hpcrun_metrics_new_kind();
 
-  FORALL_CTR(INITIALIZE_SCALAR_METRIC_INT);
+  for (int i = 0; i < total; ++i) {
+    gpu_counter_hpcrun_metric_id_array[i] = hpcrun_set_new_metric_desc_and_period(
+      GPU_COUNTER_METRIC_KIND_INFO, counter_name[i], counter_desc[i],
+      MetricFlags_ValFmt_Int, 1, metric_property_none
+    );
+  }
 
-  FINALIZE_METRIC_KIND();
-}
\ No newline at end of file
+  hpcrun_close_kind(GPU_COUNTER_METRIC_KIND_INFO);
+}
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 01952f01a6..af04665f42 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -329,15 +329,6 @@ typedef enum {
 	"GPU kernel: launch count")					\
   macro("GKER:OCC_THR",               GPU_KINFO_OCCUPANCY_THR,		\
 	"GPU kernel: theoretical occupancy (FGP_ACT / FGP_MAX)")          \
-  
-// gpu kernel hardware counter metrics
-#define FORALL_CTR(macro) \
-  macro("GCTR:CYCLES",         GPU_CTR_CYCLES, \
-	"GPU counter : cycles")	\
-  macro("GCTR:L2_CACHE_HIT",         GPU_CTR_L2_CACHE_HIT, \
-	"GPU counter : L2 cache hit")	\
-  macro("GCTR:L2_CACHE_MISS",         GPU_CTR_L2_CACHE_MISS, \
-	"GPU counter : L2 cache miss")	\
 
 // gpu implicit copy
 #define FORALL_GICOPY(macro)					\
@@ -498,10 +489,16 @@ gpu_metrics_GBR_enable
 // record GPU hardware counters
 //--------------------------------------------------
 
+// Unlike other GPU metric types that may have up to a dozen metrics,
+// GPU hardware counters may have a few hundred metrics.
+// So, we should only create counter metrics for the ones that are
+// requested on the command line.
 void
 gpu_metrics_GPU_CTR_enable
 (
-  void
+  int,
+  const char**,
+  const char**
 );
 
 
diff --git a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
index ffde2fe06a..41cdaf6ca9 100644
--- a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
+++ b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
@@ -153,7 +153,6 @@ static void
 METHOD_FN(process_event_list, int lush_metrics)
 {
   int nevents = (self->evl).nevents;
-  gpu_metrics_GPU_CTR_enable();
   TMSG(CUDA,"nevents = %d", nevents);
 }
 
@@ -162,7 +161,8 @@ METHOD_FN(finalize_event_list)
 {
   // After going through all command line arguments,
   // we call this function to generate a list of counters
-  // in rocprofiler's format
+  // in rocprofiler's format and initialize corresponding
+  // hpcrun metrics
   rocprofiler_finalize_event_list();
 
   device_finalizer_rocprofiler_shutdown.fn = rocprofiler_fini;

From b88618e5f538a0d92c63450460357ba92d23ad3a Mon Sep 17 00:00:00 2001
From: John Mellor-Crummey <johnmc@rice.edu>
Date: Fri, 14 Jan 2022 17:19:09 -0600
Subject: [PATCH 158/177] update to rocm 4.5; no longer works with earlier
 rocm. (#506)

---
 src/tool/hpcrun/gpu/amd/rocm-debug-api.c | 45 ------------------------
 src/tool/hpcrun/gpu/amd/roctracer-api.c  |  3 --
 2 files changed, 48 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c b/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
index 2f7262b2bf..b20b664f9b 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
@@ -165,52 +165,10 @@ hpcrun_self_process
   return AMD_DBGAPI_STATUS_SUCCESS;
 }
 
-static amd_dbgapi_status_t
-hpcrun_enable_notify_shared_library
-(
-  amd_dbgapi_client_process_id_t client_process_id,
-  const char *shared_library_name,
-  amd_dbgapi_shared_library_id_t shared_library_id,
-  amd_dbgapi_shared_library_state_t *shared_library_state
-)
-{
-  if (strcmp(shared_library_name, "libhsa-runtime64.so.1") == 0)
-    *shared_library_state = AMD_DBGAPI_SHARED_LIBRARY_STATE_LOADED;
-  else
-    *shared_library_state = AMD_DBGAPI_SHARED_LIBRARY_STATE_UNLOADED;
-  return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static amd_dbgapi_status_t
-hpcrun_disable_notify_shared_library
-(
-  amd_dbgapi_client_process_id_t client_process_id,
-  amd_dbgapi_shared_library_id_t shared_library_id
-)
-{
-  return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static amd_dbgapi_status_t
-hpcrun_get_symbol_address
-(
-  amd_dbgapi_client_process_id_t client_process_id,
-  amd_dbgapi_shared_library_id_t shared_library_id,
-  const char *symbol_name,
-  amd_dbgapi_global_address_t *address
-)
-{
-  // It is necessary to allow rocm debug library to call dlsym through this function.
-  // We need to ensure that this code will not be called in a signal handler
-  *address = (amd_dbgapi_global_address_t) dlsym(RTLD_DEFAULT, symbol_name);
-  return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
 static amd_dbgapi_status_t
 hpcrun_insert_breakpoint
 (
   amd_dbgapi_client_process_id_t client_process_id,
-  amd_dbgapi_shared_library_id_t shared_library_id,
   amd_dbgapi_global_address_t address,
   amd_dbgapi_breakpoint_id_t breakpoint_id
 )
@@ -310,9 +268,6 @@ rocm_debug_api_init
   callbacks.allocate_memory = malloc;
   callbacks.deallocate_memory = free;
   callbacks.get_os_pid = hpcrun_self_process;
-  callbacks.enable_notify_shared_library = hpcrun_enable_notify_shared_library;
-  callbacks.disable_notify_shared_library = hpcrun_disable_notify_shared_library;
-  callbacks.get_symbol_address = hpcrun_get_symbol_address;
   callbacks.insert_breakpoint = hpcrun_insert_breakpoint;
   callbacks.remove_breakpoint = hpcrun_remove_breakpoint;
   callbacks.log_message = hpcrun_log_message;
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index c98074489e..dc5e657575 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -549,9 +549,6 @@ roctracer_bind
   // dynamic libraries only availabile in non-static case
   hpcrun_force_dlopen(true);
   CHK_DLOPEN(roctracer, roctracer_path(), RTLD_NOW | RTLD_GLOBAL);
-  // Somehow roctracter needs libkfdwrapper64.so, but does not really load it.
-  // So, we load it before using any function in roctracter.
-  CHK_DLOPEN(kfd, "libkfdwrapper64.so", RTLD_NOW | RTLD_GLOBAL);
 
   CHK_DLOPEN(hip, "libamdhip64.so", RTLD_NOW | RTLD_GLOBAL);
   hpcrun_force_dlopen(false);

From 5788aa25bc0bdf9f7efc9f0ff943b1366d071d42 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Thu, 20 Jan 2022 10:16:40 -0600
Subject: [PATCH 159/177] 1. no longer need to disable code object tracking as
 this option is ignored by rocprofiler in rocm-4.5.2 2. code cleanup

---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 4 ----
 src/tool/hpcrun/gpu/amd/rocprofiler-api.h | 4 ++--
 src/tool/hpcrun/sample-sources/ss-list.h  | 2 ++
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 74adce8197..9f2b5fe0c9 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -431,10 +431,6 @@ initialize_counter_information
 
 // This is necessary for rocprofiler callback to work
 extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
-  // Somehow needs to disable code object tracking
-  // to avoid a deadlock in rocprofiler
-  settings->code_obj_tracking = 0;
-
   rocprofiler_init();
 
   rocprofiler_queue_callbacks_t callbacks_ptrs = {};
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
index c1a612e2ea..ba624910b0 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -9,7 +9,7 @@
 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
 // --------------------------------------------------------------------------
 //
-// Copyright ((c)) 2002-2021, Rice University
+// Copyright ((c)) 2002-2022, Rice University
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -85,7 +85,7 @@ rocprofiler_fini
 
 
 int
-rocprofile_bind
+rocprofiler_bind
 (
   void
 );
diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index 3a329c1719..14887d1bc3 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -97,8 +97,10 @@ SAMPLE_SOURCE_DECL_MACRO(amd_gpu)
 #endif
 
 #ifdef HPCRUN_SS_AMD
+#ifndef HPCRUN_STATIC_LINK
 SAMPLE_SOURCE_DECL_MACRO(amd_rocprof)
 #endif
+#endif
 
 #ifdef HPCRUN_SS_LEVEL0
 SAMPLE_SOURCE_DECL_MACRO(level0)

From 6008ea38897c5909868bf476cedbedb38c962599 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 21 Jan 2022 14:54:24 -0600
Subject: [PATCH 160/177] Use roctracer/rocprofiler to get URIs for AMD GPU
 binaries

---
 .../hpcrun/gpu/amd/rocm-binary-processing.c   | 107 +++++++++---------
 .../hpcrun/gpu/amd/rocm-binary-processing.h   |  12 ++
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c     |  35 +++++-
 src/tool/hpcrun/gpu/amd/roctracer-api.c       |   3 -
 src/tool/hpcrun/gpu/amd/roctracer-api.h       |   1 -
 5 files changed, 97 insertions(+), 61 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
index 1044368644..75dcc1df77 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
+++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
@@ -58,8 +58,7 @@
 //******************************************************************************
 
 #include <include/gpu-binary.h>
-
-#include "rocm-debug-api.h"
+#include <lib/prof-lean/spinlock.h>
 #include "rocm-binary-processing.h"
 #include <hpcrun/files.h>
 #include <hpcrun/memory/hpcrun-malloc.h>
@@ -96,6 +95,12 @@ typedef struct amd_gpu_binary {
 
 amd_gpu_binary_t* binary_list = NULL;
 
+// A spin lock to serialize two AMD GPU binary operations:
+// 1. parse and add a code object to the binary list
+// 2. look up a function name from the binary list
+static spinlock_t rocm_binary_list_lock;
+
+
 //******************************************************************************
 // private operations
 //******************************************************************************
@@ -313,55 +318,33 @@ file_uri_exists
   return 0;
 }
 
-static int
+static void
 parse_amd_gpu_binary
 (
-  void
+  const char* uri
 )
 {
-  // rocm debug api library creates a new thread through std::thread.
-  // This breaks automatic thread ignoring code because we only check
-  // the caller of pthread_create. So, we manually ignore the new thread.
-  monitor_disable_new_threads();
-
-  rocm_debug_api_init();
-  size_t code_object_count;
-  rocm_debug_api_query_code_object(&code_object_count);
-
-  for (size_t i = 0; i < code_object_count; ++i) {
-    char* uri = rocm_debug_api_query_uri(i);
-    PRINT("uri %d, %s\n", i, uri);
-
-    // Handle file URIs
-    if (strncmp(uri, "file://", strlen("file://")) == 0) {
-      if (file_uri_exists(uri)) continue;
-
-      // Handle a new AMD GPU binary
-      amd_gpu_binary_t* bin = (amd_gpu_binary_t*) malloc(sizeof(amd_gpu_binary_t));
-      bin->uri = strdup(uri);
-      bin->next = binary_list;
-      binary_list = bin;
-
-      // Parse URI to extract the binary
-      parse_amd_gpu_binary_uri(uri, bin);
-
-      // Parse the ELF symbol table
-      elf_version(EV_CURRENT);
-      Elf *elf = elf_memory(bin->buf, bin->size);
-      if (elf != 0) {
-        construct_amd_gpu_symbols(elf, &(bin->function_table));
-        elf_end(elf);
-      }
+  // Handle file URIs
+  if (strncmp(uri, "file://", strlen("file://")) == 0) {
+    if (file_uri_exists(uri)) return;
+
+    // Handle a new AMD GPU binary
+    amd_gpu_binary_t* bin = (amd_gpu_binary_t*) malloc(sizeof(amd_gpu_binary_t));
+    bin->uri = strdup(uri);
+    bin->next = binary_list;
+    binary_list = bin;
+
+    // Parse URI to extract the binary
+    parse_amd_gpu_binary_uri(uri, bin);
+
+    // Parse the ELF symbol table
+    elf_version(EV_CURRENT);
+    Elf *elf = elf_memory(bin->buf, bin->size);
+    if (elf != 0) {
+      construct_amd_gpu_symbols(elf, &(bin->function_table));
+      elf_end(elf);
     }
   }
-
-  rocm_debug_api_fini();
-
-  // Now we are done with the rocm debug api.
-  // we enable tracing threads
-  monitor_enable_new_threads();
-
-  return 0;
 }
 
 // TODO:
@@ -408,19 +391,31 @@ rocm_binary_function_lookup
 )
 {
   // TODO:
-  // 1. Handle multi-threaded case. Currently, this function is called when the first
-  //    HIP kernel launch is done. So multiple threads can enter this concurrently.
-  // 2. Currently we support multiple GPU binaries, but assume that kernel is unique
+  // 1. Currently we support multiple GPU binaries, but assume that kernel is unique
   //    across GPU binaries.
-  if (binary_list == NULL) {
-    if (parse_amd_gpu_binary() < 0) {
-      // Allocate a placeholder binary
-      binary_list = (amd_gpu_binary_t*)malloc(sizeof(amd_gpu_binary_t));
-      binary_list->next = NULL;
-      binary_list->function_table.size = 0;
-    }
-  }
+  spinlock_lock(&rocm_binary_list_lock);
   ip_normalized_t nip = lookup_amd_function(kernel_name);
   PRINT("HIP launch kernel %s, lm_ip %lx\n", kernel_name, nip.lm_ip);
+  spinlock_unlock(&rocm_binary_list_lock);
   return nip;
 }
+
+void
+rocm_binary_uri_add
+(
+  const char* uri
+)
+{
+  spinlock_lock(&rocm_binary_list_lock);
+  parse_amd_gpu_binary(uri);
+  spinlock_unlock(&rocm_binary_list_lock);
+}
+
+void
+rocm_binary_uri_list_init
+(
+  void
+)
+{
+  spinlock_init(&rocm_binary_list_lock);
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h
index 9300ffa710..0fa592e823 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h
+++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h
@@ -60,4 +60,16 @@ rocm_binary_function_lookup
   const char* kernel_name
 );
 
+void
+rocm_binary_uri_add
+(
+  const char* uri
+);
+
+void
+rocm_binary_uri_list_init
+(
+  void
+);
+
 #endif
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index 9f2b5fe0c9..f20d88f6c4 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -46,11 +46,11 @@
 //******************************************************************************
 
 #include "rocprofiler-api.h"
-// #include "rocm-debug-api.h"
 #include "rocm-binary-processing.h"
 
 #include <roctracer_hip.h>
 #include <rocprofiler.h>
+#include <activity.h>
 
 #include <hpcrun/gpu/gpu-activity-channel.h>
 #include <hpcrun/gpu/gpu-activity-process.h>
@@ -425,12 +425,45 @@ initialize_counter_information
   memset(is_specified_by_user, 0, total_counters * sizeof(int));
 }
 
+// This function should be implemented in roctracer-api.c,
+// but due to C++-isms in AMD software, I can only include rocprofiler header
+// files in one .o
+static void
+roctracer_codeobj_callback
+(
+  uint32_t domain,
+  uint32_t cid,
+  const void* data,
+  void* arg
+)
+{
+  const hsa_evt_data_t* evt_data = (const hsa_evt_data_t*)(data);
+  const char* uri = evt_data->codeobj.uri;
+  rocm_binary_uri_add(uri);
+  PRINT("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n",
+    domain,
+    cid,
+    evt_data->codeobj.load_base,
+    evt_data->codeobj.load_size,
+    evt_data->codeobj.load_delta,
+    uri);
+  free((void*)uri);
+}
+
 //******************************************************************************
 // AMD hidden interface operations
 //******************************************************************************
 
 // This is necessary for rocprofiler callback to work
 extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
+  // Enable hsa interception for getting code object URIs
+  settings->hsa_intercepting = 1;
+
+  // Ask roctracer to set up code object URI callbacks
+  rocm_binary_uri_list_init();
+  roctracer_enable_op_callback(
+    ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, roctracer_codeobj_callback, NULL
+  );
   rocprofiler_init();
 
   rocprofiler_queue_callbacks_t callbacks_ptrs = {};
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index dc5e657575..d2cb8d0f05 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -47,7 +47,6 @@
 
 #include "roctracer-api.h"
 #include "roctracer-activity-translate.h"
-#include "rocm-debug-api.h"
 #include "rocm-binary-processing.h"
 
 #include <roctracer_hip.h>
@@ -525,8 +524,6 @@ roctracer_path
   return path;
 }
 
-
-
 //******************************************************************************
 // interface operations
 //******************************************************************************
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.h b/src/tool/hpcrun/gpu/amd/roctracer-api.h
index 68d1aa618b..672338754d 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.h
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.h
@@ -77,5 +77,4 @@ roctracer_enable_counter_collection
   void
 );
 
-
 #endif

From 0807053bb7168775275cf6626f6ed1211ce72302 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 21 Jan 2022 15:27:21 -0600
Subject: [PATCH 161/177] Remove rocm debug api source files and add a missing
 environment variable for rocprofiler

---
 src/tool/hpcrun/Makefile.am              |   1 -
 src/tool/hpcrun/Makefile.in              |  17 +-
 src/tool/hpcrun/gpu/amd/rocm-debug-api.c | 312 -----------------------
 src/tool/hpcrun/gpu/amd/rocm-debug-api.h |  81 ------
 src/tool/hpcrun/gpu/amd/roctracer-api.c  |   4 -
 src/tool/hpcrun/scripts/hpcrun.in        |   3 +-
 6 files changed, 4 insertions(+), 414 deletions(-)
 delete mode 100644 src/tool/hpcrun/gpu/amd/rocm-debug-api.c
 delete mode 100644 src/tool/hpcrun/gpu/amd/rocm-debug-api.h

diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index 1c55094e1f..e71a2615e2 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -546,7 +546,6 @@ MY_ROCM_FILES =\
 	gpu/amd/roctracer-activity-translate.c \
 	gpu/amd/roctracer-api.c \
 	gpu/amd/rocprofiler-api.c \
-	gpu/amd/rocm-debug-api.c \
 	gpu/amd/rocm-binary-processing.c
 endif
 
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 25274b6fe2..42fdc3b5f0 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -540,9 +540,8 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
 	gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
 	sample-sources/amd.c sample-sources/amd-rocprofiler.c \
 	gpu/amd/roctracer-activity-translate.c gpu/amd/roctracer-api.c \
-	gpu/amd/rocprofiler-api.c gpu/amd/rocm-debug-api.c \
-	gpu/amd/rocm-binary-processing.c sample-sources/level0.c \
-	gpu/level0/level0-api.c \
+	gpu/amd/rocprofiler-api.c gpu/amd/rocm-binary-processing.c \
+	sample-sources/level0.c gpu/level0/level0-api.c \
 	gpu/level0/level0-command-list-context-map.c \
 	gpu/level0/level0-command-list-map.c \
 	gpu/level0/level0-command-process.c \
@@ -751,7 +750,6 @@ am__objects_35 =
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-roctracer-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocprofiler-api.lo \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-debug-api.lo \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/libhpcrun_la-rocm-binary-processing.lo
 @OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
 @OPT_ENABLE_LEVEL0_TRUE@am__objects_38 =  \
@@ -2007,7 +2005,6 @@ MY_AARCH64_FILES = \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-activity-translate.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/roctracer-api.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocprofiler-api.c \
-@OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-debug-api.c \
 @OPT_ENABLE_ROCM_TRUE@	gpu/amd/rocm-binary-processing.c
 
 @OPT_ENABLE_LEVEL0_TRUE@MY_LEVEL0_FILES = \
@@ -3018,8 +3015,6 @@ gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/$(am__dirstamp) \
 	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/$(am__dirstamp) \
 	gpu/amd/$(DEPDIR)/$(am__dirstamp)
-gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/$(am__dirstamp) \
-	gpu/amd/$(DEPDIR)/$(am__dirstamp)
 gpu/amd/libhpcrun_la-rocm-binary-processing.lo:  \
 	gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
 sample-sources/libhpcrun_la-level0.lo: sample-sources/$(am__dirstamp) \
@@ -4013,7 +4008,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
@@ -5687,13 +5681,6 @@ gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/rocprofiler-api.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c
 
-gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/rocm-debug-api.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-debug-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gpu/amd/rocm-debug-api.c' object='gpu/amd/libhpcrun_la-rocm-debug-api.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c
-
 gpu/amd/libhpcrun_la-rocm-binary-processing.lo: gpu/amd/rocm-binary-processing.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-binary-processing.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Tpo -c -o gpu/amd/libhpcrun_la-rocm-binary-processing.lo `test -f 'gpu/amd/rocm-binary-processing.c' || echo '$(srcdir)/'`gpu/amd/rocm-binary-processing.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo
diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c b/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
deleted file mode 100644
index b20b664f9b..0000000000
--- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
+++ /dev/null
@@ -1,312 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2022, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include "amd-dbgapi.h"
-
-#include <sys/types.h>
-#include <unistd.h>
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include "rocm-debug-api.h"
-
-#include <hpcrun/safe-sampling.h>
-#include <hpcrun/sample-sources/libdl.h>
-#include <hpcrun/memory/hpcrun-malloc.h>
-
-//******************************************************************************
-// macros
-//******************************************************************************
-
-#define FORALL_ROCM_DEBUG_ROUTINES(macro)			\
-  macro(amd_dbgapi_initialize)   \
-  macro(amd_dbgapi_process_attach)   \
-  macro(amd_dbgapi_process_detach) \
-  macro(amd_dbgapi_process_code_object_list) \
-  macro(amd_dbgapi_code_object_get_info)
-
-
-#define ROCM_DEBUG_FN_NAME(f) DYN_FN_NAME(f)
-
-#define ROCM_DEBUG_FN(fn, args) \
-  static amd_dbgapi_status_t (*ROCM_DEBUG_FN_NAME(fn)) args
-
-#define HPCRUN_ROCM_DEBUG_CALL(fn, args) \
-{      \
-  amd_dbgapi_status_t ret = ROCM_DEBUG_FN_NAME(fn) args;	\
-  check_rocm_debug_status(ret, __LINE__); \
-}
-
-//******************************************************************************
-// debug print
-//******************************************************************************
-
-#define DEBUG 0
-
-#include "hpcrun/gpu/gpu-print.h"
-
-//******************************************************************************
-// local variables
-//******************************************************************************
-
-static amd_dbgapi_callbacks_t callbacks;
-static amd_dbgapi_process_id_t self;
-static amd_dbgapi_code_object_id_t *code_objects_id;
-
-//----------------------------------------------------------
-// rocm debug api function pointers for late binding
-//----------------------------------------------------------
-
-ROCM_DEBUG_FN
-(
-  amd_dbgapi_initialize,
-  (
-    amd_dbgapi_callbacks_t*
-  )
-);
-
-ROCM_DEBUG_FN
-(
-  amd_dbgapi_process_attach,
-  (
-    amd_dbgapi_client_process_id_t,
-    amd_dbgapi_process_id_t *
-  )
-);
-
-ROCM_DEBUG_FN
-(
-  amd_dbgapi_process_detach,
-  (
-    amd_dbgapi_process_id_t
-  )
-);
-
-ROCM_DEBUG_FN
-(
-  amd_dbgapi_process_code_object_list,
-  (
-    amd_dbgapi_process_id_t,
-    size_t *,
-    amd_dbgapi_code_object_id_t **,
-    amd_dbgapi_changed_t *
-  )
-);
-
-ROCM_DEBUG_FN
-(
-  amd_dbgapi_code_object_get_info,
-  (
-    amd_dbgapi_code_object_id_t,    
-    amd_dbgapi_code_object_info_t,
-    size_t,
-    void*
-  )
-);
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static amd_dbgapi_status_t
-hpcrun_self_process
-(
-  amd_dbgapi_client_process_id_t cp,
-  amd_dbgapi_os_process_id_t *os_pid
-)
-{
-  *os_pid = getpid();
-  return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static amd_dbgapi_status_t
-hpcrun_insert_breakpoint
-(
-  amd_dbgapi_client_process_id_t client_process_id,
-  amd_dbgapi_global_address_t address,
-  amd_dbgapi_breakpoint_id_t breakpoint_id
-)
-{
-  return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static amd_dbgapi_status_t
-hpcrun_remove_breakpoint
-(
-  amd_dbgapi_client_process_id_t client_process_id,
-  amd_dbgapi_breakpoint_id_t breakpoint_id
-)
-{
-  return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static void
-hpcrun_log_message
-(
-  amd_dbgapi_log_level_t level,
-  const char *message
-)
-{
-  PRINT("%s\n", message);
-}
-
-static void
-check_rocm_debug_status
-(
-  amd_dbgapi_status_t ret,
-  int lineNo
-)
-{
-  if (ret == AMD_DBGAPI_STATUS_SUCCESS) {
-    return;
-  }
-
-#define CHECK_RET(x) case x: { PRINT("%s", #x); break; }
-  switch(ret) {
-    CHECK_RET(AMD_DBGAPI_STATUS_FATAL)
-    CHECK_RET(AMD_DBGAPI_STATUS_ERROR_NOT_INITIALIZED)
-    CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_PROCESS_ID)
-    CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT)
-    CHECK_RET(AMD_DBGAPI_STATUS_ERROR_CLIENT_CALLBACK)
-    CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_CODE_OBJECT_ID)
-    CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT_COMPATIBILITY)
-    default:
-      PRINT("unknown rocm debug return value");
-      break;
-  } 
-
-#undef CHECK_RET
-
-  PRINT(" at line %d\n", lineNo);
-}
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-int
-rocm_debug_api_bind
-(
-  void
-)
-{
-  // This disable HIP's deferred code object loading.
-  // We can remove this when we start to use HSA API tracing
-  setenv("HIP_ENABLE_DEFERRED_LOADING", "0", 1);
-
-#ifndef HPCRUN_STATIC_LINK
-  // dynamic libraries only availabile in non-static case
-  hpcrun_force_dlopen(true);
-  CHK_DLOPEN(rocm_debug, "librocm-dbgapi.so", RTLD_NOW | RTLD_GLOBAL);
-  hpcrun_force_dlopen(false);
-
-#define ROCM_DEBUG_BIND(fn) \
-  CHK_DLSYM(rocm_debug, fn);
-
-  FORALL_ROCM_DEBUG_ROUTINES(ROCM_DEBUG_BIND);
-
-#undef ROCM_DEBUG_BIND
-  return DYNAMIC_BINDING_STATUS_OK;
-#else
-  return DYNAMIC_BINDING_STATUS_ERROR;
-#endif // ! HPCRUN_STATIC_LINK
-}
-
-void
-rocm_debug_api_init
-(
-  void
-)
-{
-  // Fill in call back functions for rocm debug api
-  callbacks.allocate_memory = malloc;
-  callbacks.deallocate_memory = free;
-  callbacks.get_os_pid = hpcrun_self_process;
-  callbacks.insert_breakpoint = hpcrun_insert_breakpoint;
-  callbacks.remove_breakpoint = hpcrun_remove_breakpoint;
-  callbacks.log_message = hpcrun_log_message;
-
-  HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_initialize, (&callbacks));
-  HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_attach,
-    ((amd_dbgapi_client_process_id_t)(&self), &self));
-}
-
-void
-rocm_debug_api_fini
-(
-  void
-)
-{
-  HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_detach, (self));
-}
-
-void
-rocm_debug_api_query_code_object
-(
-  size_t* code_object_count_ptr
-)
-{
-  HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_code_object_list,
-    (self, code_object_count_ptr, &code_objects_id, NULL));
-  PRINT("code object count %u\n", *code_object_count_ptr);
-}
-
-char*
-rocm_debug_api_query_uri
-(
-  size_t code_object_index
-)
-{
-  char* uri;
-  HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_code_object_get_info,
-    (code_objects_id[code_object_index],
-      AMD_DBGAPI_CODE_OBJECT_INFO_URI_NAME,
-      sizeof(char*), (void*)(&uri)));
-  return uri;
-}
diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.h b/src/tool/hpcrun/gpu/amd/rocm-debug-api.h
deleted file mode 100644
index 9ffacea2a8..0000000000
--- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.h
+++ /dev/null
@@ -1,81 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2022, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-#ifndef rocm_debug_api_h
-#define rocm_debug_api_h
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-int
-rocm_debug_api_bind
-(
-  void
-);
-
-void
-rocm_debug_api_init
-(
-  void
-);
-
-void
-rocm_debug_api_fini
-(
-  void
-);
-
-void
-rocm_debug_api_query_code_object
-(
-  size_t* code_obejct_count_ptr
-);
-
-char*
-rocm_debug_api_query_uri
-(
-  size_t code_object_index
-);
-
-#endif
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index d2cb8d0f05..adbee24dd9 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -538,10 +538,6 @@ roctracer_bind
   // More details: https://github.com/ROCm-Developer-Tools/roctracer/issues/22
   setenv("HSA_ENABLE_INTERRUPT", "0", 1);
 
-  if (rocm_debug_api_bind() != DYNAMIC_BINDING_STATUS_OK) {
-    return DYNAMIC_BINDING_STATUS_ERROR;
-  }
-
 #ifndef HPCRUN_STATIC_LINK
   // dynamic libraries only availabile in non-static case
   hpcrun_force_dlopen(true);
diff --git a/src/tool/hpcrun/scripts/hpcrun.in b/src/tool/hpcrun/scripts/hpcrun.in
index b09ba5d497..ad8a55e837 100644
--- a/src/tool/hpcrun/scripts/hpcrun.in
+++ b/src/tool/hpcrun/scripts/hpcrun.in
@@ -378,7 +378,8 @@ do
              export HSA_TOOLS_LIB=librocprofiler64.so.1
              export ROCP_TOOL_LIB=libhpcrun.so
              export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml
-			 export HIP_ENABLE_DEFERRED_LOADING=0;;
+             export HIP_ENABLE_DEFERRED_LOADING=0
+			 export ROCP_HSA_INTERCEPT=1;;
 		gpu=opencl)	 preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_opencl.so" ;;
 
 		gpu=opencl,inst) gtpin_libdir="${gtpin_lib_path}"

From 5b9dd6c92f6718beed775d3cca4b64960345ad03 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 21 Jan 2022 16:25:00 -0600
Subject: [PATCH 162/177] Fix compilation

---
 src/tool/hpcrun/gpu/amd/roctracer-api.c    | 2 +-
 src/tool/hpcrun/gpu/gpu-activity-process.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index adbee24dd9..0bdf107b76 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -290,7 +290,7 @@ ensure_kernel_ip_present
   // is already present
   if (hpcrun_cct_children(kernel_ph) == NULL) {
     cct_node_t *kernel =
-      hpcrun_cct_insert_ip_norm(kernel_ph, kernel_ip);
+      hpcrun_cct_insert_ip_norm(kernel_ph, kernel_ip, true);
     hpcrun_cct_retain(kernel);
   }
 }
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 3d9c2819fb..a65eb3d57b 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -662,7 +662,7 @@ gpu_counter_process
         kernel_node = host_op_node;
       } else {
         cct_addr_t *addr = hpcrun_cct_addr(func_node);
-        kernel_node = hpcrun_cct_insert_ip_norm(host_op_node, addr->ip_norm);
+        kernel_node = hpcrun_cct_insert_ip_norm(host_op_node, addr->ip_norm, true);
       }
       // Memory allocation does not always happen on the device
       // Do not send it to trace channels

From d999b53b510b3cd01441a17f5d106de48deef667 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 23 Jan 2022 16:57:31 -0600
Subject: [PATCH 163/177] revert whitespace changes to produce simpler merge

---
 src/tool/hpcrun/sample-sources/ss-list.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index 05f74b92aa..c9d52bd2a0 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -48,7 +48,7 @@
 //******************************************************************************
 // File: ss-list.h
 //
-// Purpose:
+// Purpose: 
 //   This file contains a list of sample sources wrapped by a call to an
 //   unspecified macro. The intended use of this file is to define the
 //   macro, include the file elsewhere one or more times to register the
@@ -61,21 +61,21 @@
 #include <include/hpctoolkit-config.h>
 
 SAMPLE_SOURCE_DECL_MACRO(ga)
-SAMPLE_SOURCE_DECL_MACRO(io)
+SAMPLE_SOURCE_DECL_MACRO(io)  
 #ifdef ENABLE_CLOCK_REALTIME
-SAMPLE_SOURCE_DECL_MACRO(itimer)
+SAMPLE_SOURCE_DECL_MACRO(itimer)  
 #endif
 
 #ifdef HPCRUN_SS_LINUX_PERF
-SAMPLE_SOURCE_DECL_MACRO(linux_perf)
+SAMPLE_SOURCE_DECL_MACRO(linux_perf)  
 #endif
 
-SAMPLE_SOURCE_DECL_MACRO(memleak)
+SAMPLE_SOURCE_DECL_MACRO(memleak)  
 
-SAMPLE_SOURCE_DECL_MACRO(none)
+SAMPLE_SOURCE_DECL_MACRO(none)  
 
 #ifdef HPCRUN_SS_PAPI
-SAMPLE_SOURCE_DECL_MACRO(papi)
+SAMPLE_SOURCE_DECL_MACRO(papi)  
 #endif
 
 SAMPLE_SOURCE_DECL_MACRO(directed_blame)

From 7be834a1a22f4202430b9a0cf54e7bb275d0b588 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 23 Jan 2022 21:13:43 -0600
Subject: [PATCH 164/177] remove dead code in ompt-activity-translate.c

---
 .../hpcrun/gpu/ompt/ompt-activity-translate.c | 64 +------------------
 1 file changed, 2 insertions(+), 62 deletions(-)

diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
index adbc21c008..4f89e2c37c 100644
--- a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
@@ -113,16 +113,10 @@ convert_target
  uint64_t *cid_ptr
 )
 {
-  ompt_record_target_t *t = &r->record.target;
+  ompt_record_target_t *t __attribute__((unused)) = &r->record.target;
 
   ga->kind = GPU_ACTIVITY_UNKNOWN;
   *cid_ptr = 0;
-
-#if 0
-  printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=%lu codeptr=%p\n",
-	 target_rec.kind, target_rec.endpoint, target_rec.device_num,
-	 target_rec.task_id, target_rec.target_id,
-#endif
 }
 
 
@@ -202,16 +196,6 @@ convert_memcpy
 {
   ompt_record_target_data_op_t *d = &r->record.target_data_op;
 
-# if 0
-  TMSG(OMPT_ACTIVITY, "Memcpy copy CorrelationId %u", r->correlationId);
-  TMSG(OMPT_ACTIVITY, "Memcpy copy kind %u", d->optype);
-  TMSG(OMPT_ACTIVITY, "Memcpy copy bytes %lu", d->bytes);
-
-
-  ga->details.memcpy.context_id = r->contextId;
-  ga->details.memcpy.stream_id = r->streamId;
-#endif
-
   ga->kind = GPU_ACTIVITY_MEMCPY;
 
   ga->details.memcpy.correlation_id = d->host_op_id;
@@ -257,19 +241,7 @@ convert_target_data_op
   default:
     convert_unknown(ga, r, cid_ptr);
     break;
- }
-
-#if 0
-  ( r->thread_id, r->target_id);
-
-
-  printf("\tTarget data op: host_op_id=%lu optype=%d src_addr=%p "
-	 "src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
-	 "end_time=%lu duration=%luus codeptr=%p\n",
-	 d->host_op_id, d->optype,
-	 d->src_addr, d->src_device_num,
-	 d->dest_addr, d->dest_device_num,
-#endif
+  }
 
   gpu_interval_set(&ga->details.interval, r->time, d->end_time);
 }
@@ -289,38 +261,6 @@ convert_target_submit
   ga->details.kernel.correlation_id = k->host_op_id;
   *cid_ptr = k->host_op_id;
 
-#if 0
-  ( r->thread_id, r->target_id);
-  printf("\tTarget kernel: host_op_id=%lu requested_num_teams=%u "
-	 "granted_num_teams=%u end_time=%lu duration=%luus\n",
-	 target_kernel_rec.host_op_id,
-	 target_kernel_rec.requested_num_teams,
-	 target_kernel_rec.granted_num_teams,
-
-  ga->details.kernel.dynamicSharedMemory = r->dynamicSharedMemory;
-  ga->details.kernel.staticSharedMemory = r->staticSharedMemory;
-  ga->details.kernel.localMemoryTotal = r->localMemoryTotal;
-  ga->details.kernel.device_id = r->deviceId;
-  ga->details.kernel.context_id = r->contextId;
-  ga->details.kernel.stream_id = r->streamId;
-  ga->details.kernel.blocks = r->blockX * r->blockY * r->blockZ;
-
-
-  uint32_t activeWarpsPerSM = 0;
-  uint32_t maxActiveWarpsPerSM = 0;
-  uint32_t threadRegisters = 0;
-  uint32_t blockThreads = 0;
-  uint32_t blockSharedMemory = 0;
-  cupti_occupancy_analyze(r, &activeWarpsPerSM, &maxActiveWarpsPerSM,
-			  &threadRegisters, &blockThreads, &blockSharedMemory);
-
-  ga->details.kernel.activeWarpsPerSM = activeWarpsPerSM;
-  ga->details.kernel.maxActiveWarpsPerSM = maxActiveWarpsPerSM;
-  ga->details.kernel.threadRegisters = threadRegisters;
-  ga->details.kernel.blockThreads = blockThreads;
-  ga->details.kernel.blockSharedMemory = blockSharedMemory;
-#endif
-
   gpu_interval_set(&ga->details.interval, r->time, k->end_time);
 }
 

From 3d9994a809c2e6d831ac042667fb6da2d7691cf9 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 23 Jan 2022 21:14:33 -0600
Subject: [PATCH 165/177] selectively issue ompt_flush_trace - only flush trace
 during finalization in threads that have launched kernels

---
 src/tool/hpcrun/ompt/ompt-device.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index edfae74106..731b463af9 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -87,6 +87,9 @@
 // macros
 //*****************************************************************************
 
+// with OMPT support turned on, callpath pruning should not be necessary
+#define PRUNE_CALLPATH 0
+
 #define OMPT_ACTIVITY_DEBUG 1
 
 #if OMPT_ACTIVITY_DEBUG
@@ -130,6 +133,14 @@ typedef struct ompt_device_entry_t {
 
 
 
+//*****************************************************************************
+// forward declarations
+//*****************************************************************************
+
+static void ompt_dump(ompt_record_ompt_t *r) __attribute__((unused));
+
+
+
 //*****************************************************************************
 // static variables
 //*****************************************************************************
@@ -142,6 +153,14 @@ static int ompt_shutdown_complete = 0;
 
 static ompt_device_entry_t *device_list = 0;
 
+static __thread bool ompt_need_flush = false;
+
+
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
 static void
 device_list_insert
 (
@@ -359,7 +378,7 @@ ompt_finalize_flush
   while (e) {
     PRINT("ompt_finalize_flush flush id=%d device=%p\n",
 	  e->device_id, e->device);
-    ompt_flush_trace(e->device);
+    if (ompt_need_flush) ompt_flush_trace(e->device);
     e = e->next;
   }
   PRINT("ompt_finalize_flush exit\n");
@@ -509,6 +528,7 @@ ompt_device_unload(int device_num,
 }
 
 
+#if PRUNE_CALLPATH
 static int
 get_load_module
 (
@@ -519,6 +539,7 @@ get_load_module
   ip_normalized_t ip = addr->ip_norm;
   return ip.lm_id;
 }
+#endif
 
 
 void
@@ -565,10 +586,10 @@ ompt_target_callback_emi
     hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr,
                            skip_this_frame, 1, NULL).sample_node;
 
+#if PRUNE_CALLPATH
   // the load module for the runtime library that supports offloading
   int lm = get_load_module(target_node);
 
-#if 0
   // drop nodes on the call chain until we find one that is not in the load
   // module for runtime library that supports offloading
   for (;;) {
@@ -579,6 +600,8 @@ ompt_target_callback_emi
 
   hpcrun_safe_exit();
   td->overhead--;
+
+  ompt_need_flush = true;
 }
 
 #define FOREACH_OMPT_DATA_OP(macro)				     \

From 3f1817b20d4d3dcb5e4102c9c1ece81befb52edc Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 23 Jan 2022 21:48:23 -0600
Subject: [PATCH 166/177] note that ompt needs flushing for more than just
 kernels

---
 src/tool/hpcrun/ompt/ompt-device.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index 731b463af9..781487d889 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -87,6 +87,12 @@
 // macros
 //*****************************************************************************
 
+#define FOREACH_OMPT_DATA_OP(macro)				     \
+  macro(op, ompt_target_data_alloc, ompt_tgt_alloc)		     \
+  macro(op, ompt_target_data_delete, ompt_tgt_delete)		     \
+  macro(op, ompt_target_data_transfer_to_device, ompt_tgt_copyin)    \
+  macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout)
+
 // with OMPT support turned on, callpath pruning should not be necessary
 #define PRUNE_CALLPATH 0
 
@@ -559,6 +565,8 @@ ompt_target_callback_emi
     return;
   }
 
+  ompt_need_flush = true;
+
   uint64_t target_id = target_data->value = gpu_correlation_id();
   PRINT("ompt_target_callback->target_id 0x%lx\n", target_id);
 
@@ -600,15 +608,8 @@ ompt_target_callback_emi
 
   hpcrun_safe_exit();
   td->overhead--;
-
-  ompt_need_flush = true;
 }
 
-#define FOREACH_OMPT_DATA_OP(macro)				     \
-  macro(op, ompt_target_data_alloc, ompt_tgt_alloc)		     \
-  macro(op, ompt_target_data_delete, ompt_tgt_delete)		     \
-  macro(op, ompt_target_data_transfer_to_device, ompt_tgt_copyin)    \
-  macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout)
 
 void
 ompt_data_op_callback_emi
@@ -628,6 +629,8 @@ ompt_data_op_callback_emi
 {
   if (endpoint == ompt_scope_end) return;
 
+  ompt_need_flush = true;
+
   uint64_t target_id = target_data->value;
   uint64_t op_id = *host_op_id = gpu_correlation_id();
 
@@ -661,12 +664,17 @@ ompt_submit_callback_emi
 )
 {
   uint64_t target_id = target_data->value;
+
   PRINT("ompt_submit_callback enter->target_id 0x%lx\n", target_id);
+
   if (endpoint == ompt_scope_begin) {
     *host_op_id = gpu_correlation_id();
     hpcrun_ompt_op_id_notify(endpoint, *host_op_id,
       ompt_placeholders.ompt_tgt_kernel.pc_norm);
+
+    ompt_need_flush = true;
   }
+
   PRINT("ompt_submit_callback exit->target_id 0x%lx\n", target_id);
 }
 
@@ -679,6 +687,7 @@ ompt_map_callback(ompt_id_t target_id,
                   size_t *bytes,
                   unsigned int *mapping_flags)
 {
+  ompt_need_flush = true;
 }
 
 

From 867820e4b22b39b5158c2b99437a73a009fd29d3 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Mon, 24 Jan 2022 00:08:29 -0600
Subject: [PATCH 167/177] enhance debugging for monitoring GPU intervals

---
 src/tool/hpcrun/gpu/gpu-activity.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c
index 0a640c8636..ba595e187f 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.c
+++ b/src/tool/hpcrun/gpu/gpu-activity.c
@@ -56,6 +56,9 @@
 
 #include "gpu-activity.h"
 #include "gpu-channel-item-allocator.h"
+
+#define DEBUG 0
+
 #include "gpu-print.h"
 
 
@@ -66,9 +69,6 @@
 
 #define UNIT_TEST 0
 
-#define DEBUG 0
-
-
 #define FORALL_OPENCL_KINDS(macro)					\
   macro(GPU_ACTIVITY_UNKNOWN)							\
   macro(GPU_ACTIVITY_KERNEL)           \
@@ -160,6 +160,8 @@ gpu_interval_set
 {
   interval->start = start;
   interval->end = end;
+  PRINT("gpu interval: [%lu, %lu) delta = %ld\n", interval->start, 
+        interval->end, interval->end - interval->start); 
 }
 
 

From ec481c0f271f96faeb528514bc01710d0ca05838 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Mon, 24 Jan 2022 00:09:44 -0600
Subject: [PATCH 168/177] adjust tracing of intervals to avoid interval overlap

trace "[start, end) op xxx" as
start xxx
end no_activity

don't use end + 1 for no_activity to avoid overlap with an
adjacent interval
---
 src/tool/hpcrun/gpu/gpu-trace.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index edcc8593aa..e8008542e7 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -441,8 +441,11 @@ consume_one_trace_item
 
   if (append) {
     gpu_trace_stream_append(td, leaf, start);
-    gpu_trace_stream_append(td, no_activity, end + 1);
 
-    PRINT("%p Append trace activity [%lu, %lu]\n", td, start, end);
+    // note: adding 1 to end makes sense. however, with AMD OMPT, this
+    // causes adjacent events to share a timestamp. so, don't add 1.
+    gpu_trace_stream_append(td, no_activity, end);
+
+    PRINT("%p Append trace activity [%lu, %lu)\n", td, start, end);
   }
 }

From 9f5e56f39c856f24d251d74afedf2f792201c8aa Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Wed, 26 Jan 2022 11:12:26 -0600
Subject: [PATCH 169/177] Prototype support for using customized rocprofiler

---
 src/tool/hpcrun/audit/auditor.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/tool/hpcrun/audit/auditor.c b/src/tool/hpcrun/audit/auditor.c
index 24ebe24d97..7bd66d235b 100644
--- a/src/tool/hpcrun/audit/auditor.c
+++ b/src/tool/hpcrun/audit/auditor.c
@@ -599,3 +599,14 @@ unsigned int la_objclose(uintptr_t* cookie) {
   *cookie = 0;
   return 0;
 }
+
+char *la_objsearch(const char *name, uintptr_t *cookie, unsigned int flag) {
+  if (strstr(name, "librocprofiler64.so") == NULL) {
+    return name;
+  }
+  char* path = getenv("HPCRUN_USE_CUSTOM_ROCPROFILER");
+  if (path != NULL) {
+    return path;
+  }
+  return name;
+}

From e82455e0c501904d83f721900e93d761e867c32a Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Mon, 31 Jan 2022 10:55:44 -0600
Subject: [PATCH 170/177] Do not initialize rocprofiler until we find rocprof::
 style metrics

---
 src/tool/hpcrun/sample-sources/amd-rocprofiler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
index 41cdaf6ca9..32cdeaf27e 100644
--- a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
+++ b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
@@ -133,9 +133,9 @@ METHOD_FN(shutdown)
 static bool
 METHOD_FN(supports_event, const char *ev_str)
 {
-  rocprofiler_init();
 #ifndef HPCRUN_STATIC_LINK
   if (hpcrun_ev_is(ev_str, AMD_ROCPROFILER_PREFIX)) {
+    rocprofiler_init();
     const char* roc_str = ev_str + sizeof(AMD_ROCPROFILER_PREFIX);
     while (*roc_str == ':') roc_str++;
     if (*roc_str == 0) return false;

From dc422aee0675f1437d3aedc2834d702b7fa976fe Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 6 Feb 2022 22:39:10 -0600
Subject: [PATCH 171/177] fix issues with OMPT device support - poll for
 activities after flush - handle buffer completion event with empty buffer,  
 which happens to be owned

---
 src/tool/hpcrun/ompt/ompt-device.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index 781487d889..1de3107ff3 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -125,7 +125,7 @@
   macro(ompt_get_record_abstract) \
   macro(ompt_advance_buffer_cursor)
 
-#define BUFFER_EMPTY(record, buffer, bytes) (((char *) record) >= (((char *)buffer) + bytes))
+
 
 //*****************************************************************************
 // type declarations
@@ -380,6 +380,7 @@ ompt_finalize_flush
 )
 {
   PRINT("ompt_finalize_flush enter\n");
+
   ompt_device_entry_t *e = device_list;
   while (e) {
     PRINT("ompt_finalize_flush flush id=%d device=%p\n",
@@ -387,6 +388,9 @@ ompt_finalize_flush
     if (ompt_need_flush) ompt_flush_trace(e->device);
     e = e->next;
   }
+
+  gpu_application_thread_process_activities();
+
   PRINT("ompt_finalize_flush exit\n");
 }
 
@@ -447,10 +451,13 @@ ompt_buffer_complete
     // signal advance to return pointer to first record
     ompt_buffer_cursor_t current = begin;
     int status = 1;
-    while(status) {
+    while (status) {
       // extract the next record from the buffer
       ompt_record_ompt_t *record = ompt_get_record_ompt(buffer, current);
 
+      // a buffer may be empty, so the first record may be NULL
+      if (record == NULL) break;
+
       // process the record
       ompt_activity_process(record);
 
@@ -458,10 +465,6 @@ ompt_buffer_complete
       // status will be 0 if there is no next record
       status = ompt_advance_buffer_cursor(device, buffer, bytes, current,
 					  &current);
-#if 0
-      // obsolete with changes from AMD
-      if (BUFFER_EMPTY(record, buffer, bytes)) break;
-#endif
     }
   }
 

From 12d188ef9dd4f2a5a5a1354269f51460563658c3 Mon Sep 17 00:00:00 2001
From: John M Mellor-Crummey <johnmc@rice.edu>
Date: Sun, 6 Feb 2022 23:20:07 -0600
Subject: [PATCH 172/177] turn off OMPT logging

---
 src/tool/hpcrun/ompt/ompt-device.c    | 19 ++++++++-----------
 src/tool/hpcrun/ompt/ompt-interface.c |  2 +-
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index 1de3107ff3..6e6fea2fce 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -96,7 +96,7 @@
 // with OMPT support turned on, callpath pruning should not be necessary
 #define PRUNE_CALLPATH 0
 
-#define OMPT_ACTIVITY_DEBUG 1
+#define OMPT_ACTIVITY_DEBUG 0
 
 #if OMPT_ACTIVITY_DEBUG
 #define PRINT(...) fprintf(stderr, __VA_ARGS__)
@@ -267,7 +267,7 @@ ompt_bind_names(ompt_function_lookup_t lookup)
 {
 #define ompt_bind_name(fn) \
   fn = (fn ## _t ) lookup(#fn); \
-  printf("look up function %s, got %p\n", #fn, fn);
+  PRINT("look up function %s, got %p\n", #fn, fn);
 
   FOREACH_OMPT_TARGET_FN(ompt_bind_name)
 
@@ -570,8 +570,8 @@ ompt_target_callback_emi
 
   ompt_need_flush = true;
 
-  uint64_t target_id = target_data->value = gpu_correlation_id();
-  PRINT("ompt_target_callback->target_id 0x%lx\n", target_id);
+  target_data->value = gpu_correlation_id();
+  PRINT("ompt_target_callback->target_id 0x%lx\n", target_data->value);
 
   // XXX(Keren): Do not use openmp callbacks to consume and produce records
   // HPCToolkit always subscribes its own cupti callback
@@ -634,10 +634,9 @@ ompt_data_op_callback_emi
 
   ompt_need_flush = true;
 
-  uint64_t target_id = target_data->value;
   uint64_t op_id = *host_op_id = gpu_correlation_id();
 
-  PRINT("ompt_data_op enter->target_id 0x%lx\n", target_id);
+  PRINT("ompt_data_op enter->target_id 0x%lx\n", target_data->value);
   ompt_placeholder_t op = ompt_placeholders.ompt_tgt_none;
   switch (optype) {
 #define ompt_op_macro(op, ompt_op_type, ompt_op_class) \
@@ -653,7 +652,7 @@ ompt_data_op_callback_emi
   }
 
   hpcrun_ompt_op_id_notify(endpoint, op_id, op.pc_norm);
-  PRINT("ompt_data_op exit->target_id 0x%lx\n", target_id);
+  PRINT("ompt_data_op exit->target_id 0x%lx\n", target_data->value);
 }
 
 
@@ -666,9 +665,7 @@ ompt_submit_callback_emi
  unsigned int requested_num_teams
 )
 {
-  uint64_t target_id = target_data->value;
-
-  PRINT("ompt_submit_callback enter->target_id 0x%lx\n", target_id);
+  PRINT("ompt_submit_callback enter->target_id 0x%lx\n", target_data->value);
 
   if (endpoint == ompt_scope_begin) {
     *host_op_id = gpu_correlation_id();
@@ -678,7 +675,7 @@ ompt_submit_callback_emi
     ompt_need_flush = true;
   }
 
-  PRINT("ompt_submit_callback exit->target_id 0x%lx\n", target_id);
+  PRINT("ompt_submit_callback exit->target_id 0x%lx\n", target_data->value);
 }
 
 
diff --git a/src/tool/hpcrun/ompt/ompt-interface.c b/src/tool/hpcrun/ompt/ompt-interface.c
index 1d4399681b..4b69f56fef 100644
--- a/src/tool/hpcrun/ompt/ompt-interface.c
+++ b/src/tool/hpcrun/ompt/ompt-interface.c
@@ -91,7 +91,7 @@
 #define ompt_event_may_occur(r) \
   ((r ==  ompt_set_sometimes) | (r ==  ompt_set_always))
 
-#define OMPT_DEBUG_STARTUP 1
+#define OMPT_DEBUG_STARTUP 0
 #define OMPT_DEBUG_TASK 0
 
 

From 8136f07a46172353ef8fd5d54f139f55d68f7bc6 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Fri, 11 Feb 2022 09:57:10 -0600
Subject: [PATCH 173/177] Refactor thread local memory pool

---
 src/tool/hpcrun/fnbounds/fnbounds_dynamic.c |  1 -
 src/tool/hpcrun/memory/mem.c                | 37 ++++++++++++---------
 src/tool/hpcrun/memory/newmem.h             |  3 +-
 src/tool/hpcrun/sample_event.c              |  4 +--
 src/tool/hpcrun/thread_data.c               |  5 ---
 src/tool/hpcrun/thread_data.h               |  6 ----
 6 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c b/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c
index 7be609f3f7..f863fdaa52 100644
--- a/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c
+++ b/src/tool/hpcrun/fnbounds/fnbounds_dynamic.c
@@ -220,7 +220,6 @@ fnbounds_enclosing_addr(void* ip, void** start, void** end, load_module_t** lm)
 load_module_t*
 fnbounds_map_dso(const char *module_name, void *start, void *end, struct dl_phdr_info* info)
 {
-  hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
   dso_info_t *dso = fnbounds_compute(module_name, start, end);
   if (dso) {
     load_module_t* lm = hpcrun_loadmap_map(dso);
diff --git a/src/tool/hpcrun/memory/mem.c b/src/tool/hpcrun/memory/mem.c
index e996453277..7af6897a3c 100644
--- a/src/tool/hpcrun/memory/mem.c
+++ b/src/tool/hpcrun/memory/mem.c
@@ -94,6 +94,15 @@ static long total_non_freeable = 0;
 
 static int out_of_mem_mesg = 0;
 
+
+// ---------------------------------------------------
+// hpcrun_malloc() memory thread local data structures
+// ---------------------------------------------------
+__thread hpcrun_meminfo_t memstore;
+__thread int              mem_low;
+
+
+
 //------------------------------------------------------------------
 // Internal functions
 //------------------------------------------------------------------
@@ -222,22 +231,12 @@ hpcrun_memory_reinit(void)
 // Allocate space and init a thread's memstore.
 // If failure, shutdown sampling and leave old memstore in place.
 void
-hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child)
+hpcrun_make_memstore(hpcrun_meminfo_t *mi)
 {
   void *addr;
 
   hpcrun_mem_init();
 
-  // If in the child after fork(), then continue to use the parent's
-  // memstore if it looks ok, else mmap a new one.  Note: we can't
-  // reset the memstore to empty unless we delete everything that was
-  // created via hpcrun_malloc() (cct, uw_recipe_map, ...).
-  if (is_child && mi->mi_start != NULL
-      && mi->mi_start <= mi->mi_low && mi->mi_low <= mi->mi_high
-      && mi->mi_high <= mi->mi_start + mi->mi_size) {
-    return;
-  }
-
   addr = hpcrun_mmap_anon(memsize);
   if (addr == NULL) {
     if (! out_of_mem_mesg) {
@@ -260,10 +259,10 @@ hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child)
 void
 hpcrun_reclaim_freeable_mem(void)
 {
-  hpcrun_meminfo_t *mi = &TD_GET(memstore);
+  hpcrun_meminfo_t *mi = &memstore;
 
   mi->mi_low = mi->mi_start;
-  TD_GET(mem_low) = 0;
+  mem_low = 0;
   num_reclaims++;
   TMSG(MALLOC, "%s: %d", __func__, num_reclaims);
 }
@@ -283,7 +282,7 @@ hpcrun_malloc(size_t size)
     return NULL;
   }
 
-  mi = &TD_GET(memstore);
+  mi = &memstore;
   size = round_up(size);
 
   // For a large request that doesn't fit within the existing
@@ -310,7 +309,7 @@ hpcrun_malloc(size_t size)
       || mi->mi_high - mi->mi_low < low_memsize
       || mi->mi_high - mi->mi_low < size) {
     if (allow_extra_mmap) {
-      hpcrun_make_memstore(mi, 0);
+      hpcrun_make_memstore(mi);
     } else {
       if (! out_of_mem_mesg) {
 	EMSG("%s: out of memory, shutting down sampling", __func__);
@@ -412,3 +411,11 @@ hpcrun_memory_summary(void)
        "malloc failures: %ld",
        total_freeable/meg, total_non_freeable/meg, num_failures);
 }
+
+int
+get_mem_low(
+  void
+)
+{
+  return mem_low;  // calling thread's (__thread) flag; accessor for code outside this file
+}
diff --git a/src/tool/hpcrun/memory/newmem.h b/src/tool/hpcrun/memory/newmem.h
index 7fb1ed1211..90695c7322 100644
--- a/src/tool/hpcrun/memory/newmem.h
+++ b/src/tool/hpcrun/memory/newmem.h
@@ -63,6 +63,7 @@ struct hpcrun_meminfo {
 
 typedef struct hpcrun_meminfo hpcrun_meminfo_t;
 
-void hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child);
+void hpcrun_make_memstore(hpcrun_meminfo_t *mi);
+int get_mem_low(void);
 
 #endif
diff --git a/src/tool/hpcrun/sample_event.c b/src/tool/hpcrun/sample_event.c
index f5c7b46f30..abb4512512 100644
--- a/src/tool/hpcrun/sample_event.c
+++ b/src/tool/hpcrun/sample_event.c
@@ -305,7 +305,7 @@ hpcrun_sample_callpath(void* context, int metricId,
   }
 
   hpcrun_clear_handling_sample(td);
-  if (TD_GET(mem_low) || ENABLED(FLUSH_EVERY_SAMPLE)) {
+  if (get_mem_low() || ENABLED(FLUSH_EVERY_SAMPLE)) {
     hpcrun_flush_epochs(&(TD_GET(core_profile_trace_data)));
     hpcrun_reclaim_freeable_mem();
   }
@@ -384,7 +384,7 @@ hpcrun_gen_thread_ctxt(void* context)
   }
 #endif
   hpcrun_clear_handling_sample(td);
-  if (TD_GET(mem_low) || ENABLED(FLUSH_EVERY_SAMPLE)) {
+  if (get_mem_low() || ENABLED(FLUSH_EVERY_SAMPLE)) {
     hpcrun_flush_epochs(&(TD_GET(core_profile_trace_data)));
     hpcrun_reclaim_freeable_mem();
   }
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index bdea4fa984..a1a8706e67 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -385,7 +385,6 @@ hpcrun_thread_data_init
   size_t n_sources
 )
 {
-  hpcrun_meminfo_t memstore;
   thread_data_t* td = hpcrun_get_thread_data();
 
   // ----------------------------------------
@@ -396,12 +395,8 @@ hpcrun_thread_data_init
   // memstore so we can reuse it in the child after fork.  This must
   // come first.
   td->inside_hpcrun = 1;
-  memstore = td->memstore;
   memset(td, 0xfe, sizeof(thread_data_t));
   td->inside_hpcrun = 1;
-  td->memstore = memstore;
-  hpcrun_make_memstore(&td->memstore, is_child);
-  td->mem_low = 0;
 
   // ----------------------------------------
   // normalized thread id (monitor-generated)
diff --git a/src/tool/hpcrun/thread_data.h b/src/tool/hpcrun/thread_data.h
index 2874b39400..baa37f6c95 100644
--- a/src/tool/hpcrun/thread_data.h
+++ b/src/tool/hpcrun/thread_data.h
@@ -172,12 +172,6 @@ typedef struct thread_data_t {
   int omp_thread;
   uint64_t last_bar_time_us;
 
-  // ----------------------------------------
-  // hpcrun_malloc() memory data structures
-  // ----------------------------------------
-  hpcrun_meminfo_t memstore;
-  int              mem_low;
-
   // ----------------------------------------
   // sample sources
   // ----------------------------------------

From 5c816adb41518f466b92173a4a862c747494db54 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Mon, 14 Feb 2022 10:17:21 -0600
Subject: [PATCH 174/177] Refactor AMD and hpcrun initialization interaction

---
 src/tool/hpcrun/gpu/amd/rocprofiler-api.c | 36 +++++++++++++++--------
 src/tool/hpcrun/gpu/amd/rocprofiler-api.h |  6 ++++
 src/tool/hpcrun/gpu/amd/roctracer-api.c   |  3 ++
 src/tool/hpcrun/scripts/hpcrun.in         |  1 -
 tests/Makefile.in                         |  1 +
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
index f20d88f6c4..9e3daac034 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -458,17 +458,6 @@ roctracer_codeobj_callback
 extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
   // Enable hsa interception for getting code object URIs
   settings->hsa_intercepting = 1;
-
-  // Ask roctracer to set up code object URI callbacks
-  rocm_binary_uri_list_init();
-  roctracer_enable_op_callback(
-    ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, roctracer_codeobj_callback, NULL
-  );
-  rocprofiler_init();
-
-  rocprofiler_queue_callbacks_t callbacks_ptrs = {};
-  callbacks_ptrs.dispatch = rocprofiler_dispatch_callback;
-  HPCRUN_ROCPROFILER_CALL(rocprofiler_set_queue_callbacks, (callbacks_ptrs, NULL));
 }
 
 extern PUBLIC_API void OnUnloadTool() {
@@ -523,6 +512,11 @@ rocprofiler_init
     monitor_real_exit(-1);
   }
 #endif
+
+  rocprofiler_queue_callbacks_t callbacks_ptrs = {};
+  callbacks_ptrs.dispatch = rocprofiler_dispatch_callback;
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_set_queue_callbacks, (callbacks_ptrs, NULL));
+
   initialize_counter_information();
 
   // Initialize the spin lock used to serialize GPU kernel launches
@@ -564,10 +558,10 @@ rocprofiler_bind
 #undef ROCPROFILER_BIND
 
   hpcrun_force_dlopen(true);
-  if (getenv("HPCRUN_LIST_EVENT")) {
+  //if (getenv("HPCRUN_LIST_EVENT")) {
     CHK_DLOPEN(hsa, "libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
     hsa_init();
-  }
+  //}
   hpcrun_force_dlopen(false);
 
   return DYNAMIC_BINDING_STATUS_OK;
@@ -662,3 +656,19 @@ rocprofiler_finalize_event_list
 
   gpu_metrics_GPU_CTR_enable(total_requested, requested_counter_name, requested_counter_description);
 }
+
+void
+rocprofiler_uri_setup
+(
+  void
+)
+{
+  // Initialize the ROCm binary URI list, then ask roctracer to deliver code
+  // object events to roctracer_codeobj_callback. TODO: this belongs in
+  // roctracer-api.c, but an AMD header that is not fully C compatible means
+  // the rocprofiler header file can be included in only one source file.
+  rocm_binary_uri_list_init();
+  roctracer_enable_op_callback(
+    ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, roctracer_codeobj_callback, NULL
+  );
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
index ba624910b0..267db702c0 100644
--- a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -123,7 +123,13 @@ rocprofiler_match_event
 void
 rocprofiler_finalize_event_list
 (
+  void
+);
 
+void
+rocprofiler_uri_setup
+(
+  void
 );
 
 
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index 0bdf107b76..afae6dd300 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -597,6 +597,9 @@ roctracer_init
   HPCRUN_ROCTRACER_CALL(roctracer_enable_domain_callback, (ACTIVITY_DOMAIN_KFD_API, roctracer_subscriber_callback, NULL));
   // Enable rocTX
   HPCRUN_ROCTRACER_CALL(roctracer_enable_domain_callback, (ACTIVITY_DOMAIN_ROCTX, roctracer_subscriber_callback, NULL));
+
+  // Set up the callback that collects code object URIs
+  rocprofiler_uri_setup();
 }
 
 void
diff --git a/src/tool/hpcrun/scripts/hpcrun.in b/src/tool/hpcrun/scripts/hpcrun.in
index ad8a55e837..9b97b4aff3 100644
--- a/src/tool/hpcrun/scripts/hpcrun.in
+++ b/src/tool/hpcrun/scripts/hpcrun.in
@@ -378,7 +378,6 @@ do
              export HSA_TOOLS_LIB=librocprofiler64.so.1
              export ROCP_TOOL_LIB=libhpcrun.so
              export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml
-             export HIP_ENABLE_DEFERRED_LOADING=0
 			 export ROCP_HSA_INTERCEPT=1;;
 		gpu=opencl)	 preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_opencl.so" ;;
 
diff --git a/tests/Makefile.in b/tests/Makefile.in
index dc7d338c90..19a0158a22 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -576,6 +576,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
 PERFMON_LIB = @PERFMON_LIB@
 PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
 RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@

From bdaccde04421c5b4f8385873fc0c3a3f3845dcf8 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Mon, 14 Feb 2022 12:03:34 -0600
Subject: [PATCH 175/177] Fix compilation regarding OMPT placeholders

---
 src/tool/hpcrun/gpu/gpu-activity-process.c |  3 ++-
 src/tool/hpcrun/ompt/ompt-device.c         | 13 +++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 1dae7c6da7..10effafd1b 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -628,7 +628,8 @@ gpu_memory_process
       trace_item_set(&entry_trace, activity, host_op_entry, host_op_node);
 
       gpu_context_stream_trace
-	(activity->details.memory.context_id,
+	(activity->details.memory.device_id,
+   activity->details.memory.context_id,
 	 activity->details.memory.stream_id,
 	 &entry_trace);
 
diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index 0e67ec5847..b27b968f71 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -68,7 +68,6 @@
 
 #include "ompt-interface.h"
 #include "ompt-device-map.h"
-#include "ompt-placeholders.h"
 #include "ompt-device.h"
 
 #include "gpu/gpu-op-placeholders.h"
@@ -615,12 +614,6 @@ ompt_target_callback_emi
   td->overhead--;
 }
 
-#define FOREACH_OMPT_DATA_OP(macro)				     \
-  macro(ph, ompt_target_data_alloc, ompt_tgt_alloc)		     \
-  macro(ph, ompt_target_data_delete, ompt_tgt_delete)		     \
-  macro(ph, ompt_target_data_transfer_to_device, ompt_tgt_copyin)    \
-  macro(ph, ompt_target_data_transfer_from_device, ompt_tgt_copyout)
-
 void
 ompt_data_op_callback_emi
 (
@@ -644,7 +637,7 @@ ompt_data_op_callback_emi
   uint64_t op_id = *host_op_id = gpu_correlation_id();
 
   PRINT("ompt_data_op enter->target_id 0x%lx\n", target_data->value);
-  ompt_placeholder_t op = ompt_placeholders.ompt_tgt_none;
+  enum hpcrun_placeholder op = hpcrun_placeholder_ompt_tgt_none;
   switch (optype) {
 #define ompt_op_macro(op, ompt_op_type, ompt_op_class) \
     case ompt_op_type:                                 \
@@ -658,7 +651,7 @@ ompt_data_op_callback_emi
       break;
   }
 
-  hpcrun_ompt_op_id_notify(endpoint, op_id, op.pc_norm);
+  hpcrun_ompt_op_id_notify(endpoint, op_id, get_placeholder_norm(op));
   PRINT("ompt_data_op exit->target_id 0x%lx\n", target_data->value);
 }
 
@@ -677,7 +670,7 @@ ompt_submit_callback_emi
   if (endpoint == ompt_scope_begin) {
     *host_op_id = gpu_correlation_id();
     hpcrun_ompt_op_id_notify(endpoint, *host_op_id,
-      ompt_placeholders.ompt_tgt_kernel.pc_norm);
+      get_placeholder_norm(hpcrun_placeholder_ompt_tgt_kernel));
 
     ompt_need_flush = true;
   }

From 5f722776f960f590772ba268b79af1ccb03d1b26 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Mon, 21 Feb 2022 16:36:59 -0600
Subject: [PATCH 176/177] No longer need to use customized rocprofiler

---
 src/tool/hpcrun/audit/auditor.c                  | 11 -----------
 src/tool/hpcrun/gpu-monitors.c                   |  2 +-
 src/tool/hpcrun/gpu/amd/hip-api.c                |  2 +-
 src/tool/hpcrun/gpu/amd/rocm-binary-processing.c |  2 +-
 src/tool/hpcrun/gpu/gpu-channel-common.h         |  2 +-
 5 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/src/tool/hpcrun/audit/auditor.c b/src/tool/hpcrun/audit/auditor.c
index 7bd66d235b..24ebe24d97 100644
--- a/src/tool/hpcrun/audit/auditor.c
+++ b/src/tool/hpcrun/audit/auditor.c
@@ -599,14 +599,3 @@ unsigned int la_objclose(uintptr_t* cookie) {
   *cookie = 0;
   return 0;
 }
-
-char *la_objsearch(const char *name, uintptr_t *cookie, unsigned int flag) {
-  if (strstr(name, "librocprofiler64.so") == NULL) {
-    return name;
-  }
-  char* path = getenv("HPCRUN_USE_CUSTOM_ROCPROFILER");
-  if (path != NULL) {
-    return path;
-  }
-  return name;
-}
diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
index 9d05ce6443..5b03c70503 100644
--- a/src/tool/hpcrun/gpu-monitors.c
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -37,4 +37,4 @@ gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type)
       node = node->next;
     }
   }
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.c b/src/tool/hpcrun/gpu/amd/hip-api.c
index 9dd8188c0f..be0d916b83 100644
--- a/src/tool/hpcrun/gpu/amd/hip-api.c
+++ b/src/tool/hpcrun/gpu/amd/hip-api.c
@@ -248,4 +248,4 @@ hip_dev_sync
 #else
   return -1;
 #endif
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
index 75dcc1df77..e66672bff3 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
+++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
@@ -418,4 +418,4 @@ rocm_binary_uri_list_init
 )
 {
   spinlock_init(&rocm_binary_list_lock);
-}
\ No newline at end of file
+}
diff --git a/src/tool/hpcrun/gpu/gpu-channel-common.h b/src/tool/hpcrun/gpu/gpu-channel-common.h
index 24869acc92..b2396dd1a7 100644
--- a/src/tool/hpcrun/gpu/gpu-channel-common.h
+++ b/src/tool/hpcrun/gpu/gpu-channel-common.h
@@ -21,4 +21,4 @@
 
 #define GPU_CHANNEL_TOTAL 2
 
-#endif
\ No newline at end of file
+#endif

From 511afa55aa4ed65eab8953f787a0d8ca5f261cc6 Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng <mxz297@gmail.com>
Date: Tue, 22 Feb 2022 20:10:12 -0600
Subject: [PATCH 177/177] 1. Fix error in Makefile generated by hpcstruct: a
 double \ is no longer needed to escape a character (because Jonathon wrote
 a script to handle \ properly). 2. Fix merge error causing hpcstruct to fail
 with cubin

---
 src/lib/binutils/ElfHelper.cpp | 5 -----
 src/tool/hpcstruct/pmake.txt   | 8 ++++----
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/lib/binutils/ElfHelper.cpp b/src/lib/binutils/ElfHelper.cpp
index 882d99948d..0715035de4 100644
--- a/src/lib/binutils/ElfHelper.cpp
+++ b/src/lib/binutils/ElfHelper.cpp
@@ -133,15 +133,10 @@ ElfFile::open
       fwrite(getMemory(), getLength(), 1, f);
       fclose(f);
     }
-    // Prevent memory leak
-    free(memPtr);
 #else
     result = false;
     memPtr = 0;
 #endif
-    // If we cannot open the binary, release memPtr's memory
-    // If we opened the binary, we've copied memPtr to origPtr, also release memory
-    memPtr = 0;
   }
 #endif
 
diff --git a/src/tool/hpcstruct/pmake.txt b/src/tool/hpcstruct/pmake.txt
index f70ac39224..bae9eb6f62 100644
--- a/src/tool/hpcstruct/pmake.txt
+++ b/src/tool/hpcstruct/pmake.txt
@@ -114,9 +114,9 @@ $(STRUCTS_DIR)/%.hpcstruct: $(CPUBIN_DIR)/%
 	nbytes=`du -b -L $< | tail -1 | awk '{ print $$1 }'`
 	if test $$nbytes -gt $(CPAR_SIZE) ; then
 		if test $(THREADS) -gt 1 ; then
-			echo msg: begin parallel analysis of $$cpubin_name \\(size = $$nbytes, using $(THREADS) threads\\)
+			echo msg: begin parallel analysis of $$cpubin_name \(size = $$nbytes, using $(THREADS) threads\)
 		else
-			echo msg: begin concurrent analysis of $$cpubin_name \\(size = $$nbytes, using 1 of $(JOBS) threads\\)
+			echo msg: begin concurrent analysis of $$cpubin_name \(size = $$nbytes, using 1 of $(JOBS) threads\)
 		fi
 		$(STRUCT) -j $(THREADS) -o $$struct_name $< > $$warn_name 2>&1
 		if test -s $$warn_name ; then
@@ -148,9 +148,9 @@ $(STRUCTS_DIR)/%.hpcstruct: $(GPUBIN_DIR)/%
 	nbytes=`du -b -L $< | tail -1 | awk '{ print $$1 }'`
 	if test $$nbytes -gt $(GPAR_SIZE) ; then
 		if test $(THREADS) -gt 1 ; then
-			echo msg: begin parallel analysis of $$gpubin_name \\(size = $$nbytes, using $(THREADS) threads\\)
+			echo msg: begin parallel analysis of $$gpubin_name \(size = $$nbytes, using $(THREADS) threads\)
 		else
-			echo msg: begin concurrent analysis of $$gpubin_name \\(size = $$nbytes, using 1 of $(JOBS) threads\\)
+			echo msg: begin concurrent analysis of $$gpubin_name \(size = $$nbytes, using 1 of $(JOBS) threads\)
 		fi
 		$(STRUCT) -j $(THREADS) --gpucfg $(GPUBIN_CFG) -o $$struct_name $< > $$warn_name 2>&1
 		if test -s $$warn_name ; then