diff --git a/Makefile.in b/Makefile.in
index 9166db4530..fb24849491 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -342,6 +342,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -379,6 +380,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/configure b/configure
index 4fa77fc0d7..76f0a04114 100755
--- a/configure
+++ b/configure
@@ -651,10 +651,12 @@ OPT_LEVEL0_IFLAGS
OPT_LEVEL0
OPT_ENABLE_LEVEL0_FALSE
OPT_ENABLE_LEVEL0_TRUE
+ROCM_PROFILER_LD_DIR
OPT_ROCM_LD_LIB_PATH
OPT_ROCM_IFLAGS
OPT_ENABLE_ROCM_FALSE
OPT_ENABLE_ROCM_TRUE
+OPT_GTPIN_LDFLAGS
OPT_GTPIN_LIBDIR
OPT_GTPIN_IFLAGS
OPT_GTPIN
@@ -747,6 +749,8 @@ TBB_LIB_DIR
TBB_PROXY_LIB
TBB_LFLAGS
TBB_IFLAGS
+OPT_PAPI_ROCM_FALSE
+OPT_PAPI_ROCM_TRUE
OPT_PAPI_CUPTI_FALSE
OPT_PAPI_CUPTI_TRUE
OPT_PAPI_COMPONENT_FALSE
@@ -1053,6 +1057,7 @@ with_papi
enable_force_papi
enable_papi_c
enable_papi_c_cupti
+enable_papi_c_rocm
with_perfmon
enable_perf_events
enable_kernel_blocking
@@ -1081,6 +1086,8 @@ with_rocm
with_rocm_hip
with_rocm_dbgapi
with_rocm_tracer
+with_rocm_profiler
+with_rocm_hsa
with_level0
enable_data_centric_tracing
enable_devtools
@@ -1760,6 +1767,8 @@ Optional Features:
--enable-papi-c use component papi, if available (default yes)
--enable-papi-c-cupti use papi CUPTI support, if available (default no),
requires papi cuda component
+ --enable-papi-c-rocm use papi ROCM support, if available (default no),
+ requires papi rocm component
--enable-perf-events force enable or disable perf events in hpcrun
(normally 2.6.32 or later), only needed if fails to
auto-detect correctly
@@ -1839,6 +1848,9 @@ Optional Packages:
--with-rocm-hip=PATH path to hip install directory
--with-rocm-dbgapi=PATH path to rocm-dbgapi install directory
--with-rocm-tracer=PATH path to roctracer-dev install directory
+ --with-rocm-profiler=PATH
+ path to rocprofiler-dev install directory
+ --with-rocm-hsa=PATH path to hsa-dev install directory
--with-level0=PATH use given Level Zero installation (absolute path)
with hpcrun (default is NO)
--with-valgrind=PATH path to Valgrind install directory
@@ -21990,10 +22002,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
#include "papi.h"
-extern void CUDA_init_component(void);
int main()
{
- CUDA_init_component();
}
_ACEOF
@@ -22019,6 +22029,67 @@ $as_echo "$use_papi_c_cupti" >&6; }
fi
+#-------------------------------------------------
+# Option: --enable-papi-c-rocm
+#-------------------------------------------------
+
+use_papi_c_rocm=no
+
+# Check whether --enable-papi-c-rocm was given.
+if test "${enable_papi_c_rocm+set}" = set; then :
+ enableval=$enable_papi_c_rocm; use_papi_c_rocm="$enableval"
+fi
+
+
+if test "$use_papi_c" = no || test "$use_papi_c_rocm" != yes ; then
+ use_papi_c_rocm=no
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for papi rocm component" >&5
+$as_echo_n "checking for papi rocm component... " >&6; }
+
+ ORIG_CFLAGS="$CFLAGS"
+ ORIG_LIBS="$LIBS"
+ CFLAGS="$CFLAGS $OPT_PAPI_IFLAGS"
+ LIBS="$OPT_PAPI_LDFLAGS $papi_extra_libs"
+ ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+
+#include "papi.h"
+int main()
+{
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ use_papi_c_rocm=yes
+else
+ use_papi_c_rocm=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+
+ ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+ CFLAGS="$ORIG_CFLAGS"
+ LIBS="$ORIG_LIBS"
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $use_papi_c_rocm" >&5
+$as_echo "$use_papi_c_rocm" >&6; }
+fi
+
+
#-------------------------------------------------
# Option: --with-perfmon=PATH
#-------------------------------------------------
@@ -22348,6 +22419,7 @@ $as_echo "$as_me: WARNING: disable papi due to possible conflict with perfmon" >
OPT_PAPI_LIBPATH=
use_papi_c=no
use_papi_c_cupti=no
+ use_papi_c_rocm=no
fi
fi
@@ -22385,6 +22457,14 @@ else
OPT_PAPI_CUPTI_FALSE=
fi
+ if test "$use_papi_c_rocm" = yes; then
+ OPT_PAPI_ROCM_TRUE=
+ OPT_PAPI_ROCM_FALSE='#'
+else
+ OPT_PAPI_ROCM_TRUE='#'
+ OPT_PAPI_ROCM_FALSE=
+fi
+
#-------------------------------------------------
@@ -24368,6 +24448,201 @@ $as_echo "$GTPIN" >&6; }
+#-------------------------------------------------
+# Option: --with-igc=PATH
+#-------------------------------------------------
+
+IGC=no
+OPT_HAVE_IGC=no
+OPT_IGC_IFLAGS=
+OPT_IGC_LDFLAGS=
+
+
+# Check whether --with-igc was given.
+if test "${with_igc+set}" = set; then :
+ withval=$with_igc; IGC="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for igc" >&5
+$as_echo_n "checking for igc... " >&6; }
+# Validate the --with-igc prefix and derive the IGC include and link flags.
+case "$IGC" in
+  /* )
+    if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then
+      as_fn_error $? "unable to find igc.opencl.h in: $IGC" "$LINENO" 5
+    fi
+    OPT_IGC_IFLAGS="-I${IGC}/include"
+    # Start from empty values so the emptiness tests below detect a missing library.
+    IGC_LDFLAGS=
+    IGA_LDFLAGS=
+    # Use the first directory in multilib_path that contains libigc.so.
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libigc.so" ; then
+        IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc"
+        break
+      fi
+    done
+    if test "x$IGC_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libigc.so in: $IGC" "$LINENO" 5
+    fi
+    # Same search for libiga64.so, which is linked alongside libigc.
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libiga64.so" ; then
+        IGA_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -liga64"
+        break
+      fi
+    done
+    if test "x$IGA_LDFLAGS" = x ; then
+      as_fn_error $? "unable to find libiga64.so in: $IGC" "$LINENO" 5
+    fi
+
+    OPT_IGC_LDFLAGS="${IGC_LDFLAGS} ${IGA_LDFLAGS}"
+
+    OPT_HAVE_IGC=yes
+    ;;
+  no )
+    ;;
+  * )
+    as_fn_error $? "igc directory must be absolute path: $IGC" "$LINENO" 5
+    ;;
+esac
+
+ if test "$OPT_HAVE_IGC" = yes; then
+ OPT_ENABLE_IGC_TRUE=
+ OPT_ENABLE_IGC_FALSE='#'
+else
+ OPT_ENABLE_IGC_TRUE='#'
+ OPT_ENABLE_IGC_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $IGC" >&5
+$as_echo "$IGC" >&6; }
+
+
+
+
+
+#-------------------------------------------------
+# Option: --with-metrics-discovery=PATH
+#-------------------------------------------------
+
+METRICS_DISCOVERY=no
+OPT_HAVE_METRICS_DISCOVERY=no
+OPT_METRICS_DISCOVERY_IFLAGS=
+OPT_METRICS_DISCOVERY_LDFLAGS=
+
+
+# Check whether --with-metrics-discovery was given.
+if test "${with_metrics_discovery+set}" = set; then :
+ withval=$with_metrics_discovery; METRICS_DISCOVERY="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for metrics-discovery" >&5
+$as_echo_n "checking for metrics-discovery... " >&6; }
+
+case "$METRICS_DISCOVERY" in
+ /* )
+ if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then
+ as_fn_error $? "unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY" "$LINENO" 5
+ fi
+ OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include"
+
+ for lib in $multilib_path ; do
+ if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then
+ OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd"
+ break
+ fi
+ done
+ if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then
+ as_fn_error $? "unable to find libmd.so in: $METRICS_DISCOVERY" "$LINENO" 5
+ fi
+ OPT_HAVE_METRICS_DISCOVERY=yes
+ ;;
+ no )
+ ;;
+ * )
+ as_fn_error $? "metrics-discovery directory must be absolute path: $METRICS_DISCOVERY" "$LINENO" 5
+ ;;
+esac
+
+ if test "$OPT_HAVE_METRICS_DISCOVERY" = yes; then
+ OPT_ENABLE_METRICS_DISCOVERY_TRUE=
+ OPT_ENABLE_METRICS_DISCOVERY_FALSE='#'
+else
+ OPT_ENABLE_METRICS_DISCOVERY_TRUE='#'
+ OPT_ENABLE_METRICS_DISCOVERY_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $METRICS_DISCOVERY" >&5
+$as_echo "$METRICS_DISCOVERY" >&6; }
+
+
+
+
+
+#-------------------------------------------------
+# Option: --with-gtpin=PATH
+#-------------------------------------------------
+
+GTPIN=no
+OPT_HAVE_GTPIN=no
+OPT_GTPIN_IFLAGS=
+OPT_GTPIN_LDFLAGS=
+
+
+# Check whether --with-gtpin was given.
+if test "${with_gtpin+set}" = set; then :
+ withval=$with_gtpin; GTPIN="$withval"
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for gtpin" >&5
+$as_echo_n "checking for gtpin... " >&6; }
+
+case "$GTPIN" in
+ /* )
+ if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then
+ as_fn_error $? "unable to find gtpin.h in: $GTPIN" "$LINENO" 5
+ fi
+ OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/"
+
+ if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then
+ OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin"
+ fi
+
+ if test "x$OPT_GTPIN_LDFLAGS" = x ; then
+ as_fn_error $? "unable to find libgtpin.so in: $GTPIN" "$LINENO" 5
+ fi
+ OPT_HAVE_GTPIN=yes
+ ;;
+ no )
+ ;;
+ * )
+ as_fn_error $? "gtpin directory must be absolute path: $GTPIN" "$LINENO" 5
+ ;;
+esac
+
+ if test "$OPT_HAVE_GTPIN" = yes; then
+ OPT_ENABLE_GTPIN_TRUE=
+ OPT_ENABLE_GTPIN_FALSE='#'
+else
+ OPT_ENABLE_GTPIN_TRUE='#'
+ OPT_ENABLE_GTPIN_FALSE=
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $GTPIN" >&5
+$as_echo "$GTPIN" >&6; }
+
+
+
+
+
+
#-------------------------------------------------
# Option: --with-rocm=PATH
#-------------------------------------------------
@@ -24383,6 +24658,8 @@ ROCM=
ROCM_HIP=
ROCM_DBGAPI=
ROCM_TRACER=
+ROCM_PROFILER=
+ROCM_HSA=
# Check whether --with-rocm was given.
@@ -24412,17 +24689,39 @@ if test "${with_rocm_tracer+set}" = set; then :
fi
+
+# Check whether --with-rocm-profiler was given.
+if test "${with_rocm_profiler+set}" = set; then :
+ withval=$with_rocm_profiler; ROCM_PROFILER="$withval"
+fi
+
+
+
+# Check whether --with-rocm-hsa was given.
+if test "${with_rocm_hsa+set}" = set; then :
+ withval=$with_rocm_hsa; ROCM_HSA="$withval"
+fi
+
+
+
+
ROCM_HIP_IFLAGS=
ROCM_DBGAPI_IFLAGS=
ROCM_TRACER_IFLAGS=
+ROCM_PROFILER_IFLAGS=
+ROCM_HSA_IFLAGS=
ROCM_HIP_LD_DIR=
ROCM_DBGAPI_LD_DIR=
ROCM_TRACER_LD_DIR=
+ROCM_PROFILER_LD_DIR=
+ROCM_HSA_LD_DIR=
ROCM_HIP_MESG=
ROCM_DBGAPI_MESG=
ROCM_TRACER_MESG=
+ROCM_PROFILER_MESG=
+ROCM_HSA_MESG=
require_rocm=no
@@ -24485,6 +24784,38 @@ $as_echo "$as_me: found $ROCM/roctracer/lib/libroctracer64.so" >&6;}
found=yes
fi
+ # ROCPROFILER
+ if test -f "$ROCM/rocprofiler/include/rocprofiler.h" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/rocprofiler/include/rocprofiler.h" >&5
+$as_echo "$as_me: found $ROCM/rocprofiler/include/rocprofiler.h" >&6;}
+ ROCM_PROFILER_IFLAGS="-I$ROCM/rocprofiler/include"
+ ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+ found=yes
+ fi
+ if test -f "$ROCM/rocprofiler/lib/librocprofiler64.so" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/rocprofiler/lib/librocprofiler64.so" >&5
+$as_echo "$as_me: found $ROCM/rocprofiler/lib/librocprofiler64.so" >&6;}
+ ROCM_PROFILER_LD_DIR="$ROCM/rocprofiler/lib"
+ ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+ found=yes
+ fi
+
+ # HSA
+ if test -f "$ROCM/hsa/include/hsa/hsa.h" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/hsa/include/hsa/hsa.h" >&5
+$as_echo "$as_me: found $ROCM/hsa/include/hsa/hsa.h" >&6;}
+ ROCM_HSA_IFLAGS="-I$ROCM/hsa/include/hsa"
+ ROCM_HSA_MESG="$ROCM/hsa"
+ found=yes
+ fi
+ if test -f "$ROCM/hsa/lib/libhsa-runtime64.so" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/hsa/lib/libhsa-runtime64.so" >&5
+$as_echo "$as_me: found $ROCM/hsa/lib/libhsa-runtime64.so" >&6;}
+ ROCM_HSA_LD_DIR="$ROCM/hsa/lib"
+ ROCM_HSA_MESG="$ROCM/hsa"
+ found=yes
+ fi
+
# warn if given dir has nothing useful
if test "$found" = no ; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM" >&5
@@ -24586,6 +24917,64 @@ $as_echo "$as_me: WARNING: found nothing useful in $ROCM_TRACER" >&2;}
;;
esac
+case "$ROCM_PROFILER" in
+ /* )
+ require_rocm=yes
+ found=no
+
+ if test -f "$ROCM_PROFILER/rocprofiler/include/rocprofiler.h" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h" >&5
+$as_echo "$as_me: found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h" >&6;}
+ ROCM_PROFILER_IFLAGS="-I$ROCM_PROFILER/rocprofiler/include"
+ ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+ found=yes
+ fi
+ if test -f "$ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" >&5
+$as_echo "$as_me: found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" >&6;}
+ ROCM_PROFILER_LD_DIR="$ROCM_PROFILER/rocprofiler/lib"
+ ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+ found=yes
+ fi
+ if test "$found" = no ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM_PROFILER" >&5
+$as_echo "$as_me: WARNING: found nothing useful in $ROCM_PROFILER" >&2;}
+ fi
+ ;;
+ * )
+ ROCM_PROFILER=no
+ ;;
+esac
+
+case "$ROCM_HSA" in
+ /* )
+ require_rocm=yes
+ found=no
+
+ if test -f "$ROCM_HSA/include/hsa/hsa.h" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_HSA/include/hsa/hsa.h" >&5
+$as_echo "$as_me: found $ROCM_HSA/include/hsa/hsa.h" >&6;}
+ ROCM_HSA_IFLAGS="-I$ROCM_HSA/include/hsa"
+ ROCM_HSA_MESG="$ROCM_HSA"
+ found=yes
+ fi
+ if test -f "$ROCM_HSA/lib/libhsa-runtime64.so" ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_HSA/lib/libhsa-runtime64.so" >&5
+$as_echo "$as_me: found $ROCM_HSA/lib/libhsa-runtime64.so" >&6;}
+ ROCM_HSA_LD_DIR="$ROCM_HSA/lib"
+ ROCM_HSA_MESG="$ROCM_HSA"
+ found=yes
+ fi
+ if test "$found" = no ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM_HSA" >&5
+$as_echo "$as_me: WARNING: found nothing useful in $ROCM_HSA" >&2;}
+ fi
+ ;;
+ * )
+ ROCM_HSA=no
+ ;;
+esac
+
#
# Check that we found all the pieces.
#
@@ -24602,6 +24991,12 @@ then
if test "x$ROCM_TRACER_IFLAGS" = x ; then
as_fn_error $? "unable to find roctracer_hip.h" "$LINENO" 5
fi
+ if test "x$ROCM_PROFILER_IFLAGS" = x ; then
+ as_fn_error $? "unable to find rocprofiler.h" "$LINENO" 5
+ fi
+ if test "x$ROCM_HSA_IFLAGS" = x ; then
+ as_fn_error $? "unable to find hsa.h" "$LINENO" 5
+ fi
if test "x$ROCM_HIP_LD_DIR" = x ; then
as_fn_error $? "unable to find libamdhip64.so" "$LINENO" 5
@@ -24612,10 +25007,16 @@ then
if test "x$ROCM_TRACER_LD_DIR" = x ; then
as_fn_error $? "unable to find libroctracer64.so" "$LINENO" 5
fi
+ if test "x$ROCM_PROFILER_LD_DIR" = x ; then
+ as_fn_error $? "unable to find librocprofiler64.so" "$LINENO" 5
+ fi
+ if test "x$ROCM_HSA_LD_DIR" = x ; then
+ as_fn_error $? "unable to find libhsa-runtime64.so" "$LINENO" 5
+ fi
OPT_HAVE_ROCM=yes
- OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS"
- OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}"
+ OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS $ROCM_PROFILER_IFLAGS $ROCM_HSA_IFLAGS"
+ OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}:${ROCM_PROFILER_LD_DIR}:${ROCM_HSA_LD_DIR}"
fi
#
@@ -24669,6 +25070,7 @@ fi
+
#-------------------------------------------------
# Option: --with-level0=PATH
#-------------------------------------------------
@@ -25225,6 +25627,10 @@ if test -z "${OPT_PAPI_CUPTI_TRUE}" && test -z "${OPT_PAPI_CUPTI_FALSE}"; then
as_fn_error $? "conditional \"OPT_PAPI_CUPTI\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${OPT_PAPI_ROCM_TRUE}" && test -z "${OPT_PAPI_ROCM_FALSE}"; then
+ as_fn_error $? "conditional \"OPT_PAPI_ROCM\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${OPT_USE_ZLIB_TRUE}" && test -z "${OPT_USE_ZLIB_FALSE}"; then
as_fn_error $? "conditional \"OPT_USE_ZLIB\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -25317,6 +25723,18 @@ if test -z "${OPT_ENABLE_GTPIN_TRUE}" && test -z "${OPT_ENABLE_GTPIN_FALSE}"; th
as_fn_error $? "conditional \"OPT_ENABLE_GTPIN\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${OPT_ENABLE_IGC_TRUE}" && test -z "${OPT_ENABLE_IGC_FALSE}"; then
+ as_fn_error $? "conditional \"OPT_ENABLE_IGC\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_METRICS_DISCOVERY_TRUE}" && test -z "${OPT_ENABLE_METRICS_DISCOVERY_FALSE}"; then
+ as_fn_error $? "conditional \"OPT_ENABLE_METRICS_DISCOVERY\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPT_ENABLE_GTPIN_TRUE}" && test -z "${OPT_ENABLE_GTPIN_FALSE}"; then
+ as_fn_error $? "conditional \"OPT_ENABLE_GTPIN\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${OPT_ENABLE_ROCM_TRUE}" && test -z "${OPT_ENABLE_ROCM_FALSE}"; then
as_fn_error $? "conditional \"OPT_ENABLE_ROCM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -27999,6 +28417,8 @@ $as_echo "$as_me: gtpin: ${GTPIN}" >&6;}
$as_echo "$as_me: metrics-discovery: ${METRICS_DISCOVERY}" >&6;}
{ $as_echo "$as_me:${as_lineno-$LINENO}: papi-c-cupti: ${use_papi_c_cupti}" >&5
$as_echo "$as_me: papi-c-cupti: ${use_papi_c_cupti}" >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}: papi-c-rocm: ${use_papi_c_rocm}" >&5
+$as_echo "$as_me: papi-c-rocm: ${use_papi_c_rocm}" >&6;}
{ $as_echo "$as_me:${as_lineno-$LINENO}: rocm: ${rocm_mesg}" >&5
$as_echo "$as_me: rocm: ${rocm_mesg}" >&6;}
if test "$OPT_HAVE_ROCM" = yes ; then
@@ -28008,6 +28428,10 @@ $as_echo "$as_me: rocm hip: $ROCM_HIP_MESG" >&6;}
$as_echo "$as_me: rocm dbgapi: $ROCM_DBGAPI_MESG" >&6;}
{ $as_echo "$as_me:${as_lineno-$LINENO}: rocm tracer: $ROCM_TRACER_MESG" >&5
$as_echo "$as_me: rocm tracer: $ROCM_TRACER_MESG" >&6;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: rocm profiler:$ROCM_PROFILER_MESG" >&5
+$as_echo "$as_me: rocm profiler:$ROCM_PROFILER_MESG" >&6;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: rocm hsa: $ROCM_HSA_MESG" >&5
+$as_echo "$as_me: rocm hsa: $ROCM_HSA_MESG" >&6;}
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: valgrind: ${VALGRIND}" >&5
$as_echo "$as_me: valgrind: ${VALGRIND}" >&6;}
diff --git a/configure.ac b/configure.ac
index 94496fec79..e8f61ffe14 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3314,10 +3314,8 @@ else
AC_LINK_IFELSE([
AC_LANG_SOURCE([[
#include "papi.h"
-extern void CUDA_init_component(void);
int main()
{
- CUDA_init_component();
}
]])], [use_papi_c_cupti=yes], [use_papi_c_cupti=no])
@@ -3329,6 +3327,46 @@ int main()
fi
+#-------------------------------------------------
+# Option: --enable-papi-c-rocm
+#-------------------------------------------------
+
+use_papi_c_rocm=no
+
+AC_ARG_ENABLE([papi-c-rocm],
+ [AS_HELP_STRING([--enable-papi-c-rocm],
+ [use papi ROCM support, if available (default no), requires
+ papi rocm component])],
+ [use_papi_c_rocm="$enableval"],
+ [])
+# Probe for the PAPI rocm component only when component PAPI is enabled and --enable-papi-c-rocm=yes was given.
+if test "$use_papi_c" = no || test "$use_papi_c_rocm" != yes ; then
+ use_papi_c_rocm=no
+else
+ AC_MSG_CHECKING([for papi rocm component])
+ # Link the probe with the PAPI include and library flags - the saved originals are restored after the test.
+ ORIG_CFLAGS="$CFLAGS"
+ ORIG_LIBS="$LIBS"
+ CFLAGS="$CFLAGS $OPT_PAPI_IFLAGS"
+ LIBS="$OPT_PAPI_LDFLAGS $papi_extra_libs"
+ AC_LANG_PUSH([C])
+ # NOTE(review): the probe only includes papi.h and has an empty main - it references no rocm symbol - so it appears to test that PAPI links rather than that a rocm component exists - confirm this is intended.
+ AC_LINK_IFELSE([
+ AC_LANG_SOURCE([[
+#include "papi.h"
+int main()
+{
+}
+]])], [use_papi_c_rocm=yes], [use_papi_c_rocm=no])
+
+ AC_LANG_POP
+ CFLAGS="$ORIG_CFLAGS"
+ LIBS="$ORIG_LIBS"
+
+ AC_MSG_RESULT([$use_papi_c_rocm])
+fi
+
+
#-------------------------------------------------
# Option: --with-perfmon=PATH
#-------------------------------------------------
@@ -3602,6 +3640,7 @@ then
OPT_PAPI_LIBPATH=
use_papi_c=no
use_papi_c_cupti=no
+ use_papi_c_rocm=no
fi
fi
@@ -3611,6 +3650,7 @@ AM_CONDITIONAL(OPT_PAPI_DYNAMIC, [test "$OPT_PAPI_DYNAMIC" = yes])
AM_CONDITIONAL(OPT_PAPI_STATIC, [test "$OPT_PAPI_STATIC" = yes])
AM_CONDITIONAL(OPT_PAPI_COMPONENT, [test "$use_papi_c" = yes])
AM_CONDITIONAL(OPT_PAPI_CUPTI, [test "$use_papi_c_cupti" = yes])
+AM_CONDITIONAL(OPT_PAPI_ROCM, [test "$use_papi_c_rocm" = yes])
#-------------------------------------------------
@@ -5133,6 +5173,171 @@ AC_SUBST([OPT_GTPIN_IFLAGS])
AC_SUBST([OPT_GTPIN_LIBDIR])
+#-------------------------------------------------
+# Option: --with-igc=PATH
+#-------------------------------------------------
+
+IGC=no
+OPT_HAVE_IGC=no
+OPT_IGC_IFLAGS=
+OPT_IGC_LDFLAGS=
+
+AC_ARG_WITH([igc],
+ [AS_HELP_STRING([--with-igc=PATH],
+ [path to igc install directory])],
+ [IGC="$withval"],
+ [])
+
+AC_MSG_CHECKING([for igc])
+# Validate the --with-igc prefix and derive the IGC include and link flags.
+case "$IGC" in
+  /* )
+    if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then
+      AC_MSG_ERROR([unable to find igc.opencl.h in: $IGC])
+    fi
+    OPT_IGC_IFLAGS="-I${IGC}/include"
+    # Start from empty values so the emptiness tests below detect a missing library.
+    IGC_LDFLAGS=
+    IGA_LDFLAGS=
+    # Use the first directory in multilib_path that contains libigc.so.
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libigc.so" ; then
+        IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc"
+        break
+      fi
+    done
+    if test "x$IGC_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libigc.so in: $IGC])
+    fi
+    # Same search for libiga64.so, which is linked alongside libigc.
+    for lib in $multilib_path ; do
+      if test -f "${IGC}/${lib}/libiga64.so" ; then
+        IGA_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -liga64"
+        break
+      fi
+    done
+    if test "x$IGA_LDFLAGS" = x ; then
+      AC_MSG_ERROR([unable to find libiga64.so in: $IGC])
+    fi
+
+    OPT_IGC_LDFLAGS="${IGC_LDFLAGS} ${IGA_LDFLAGS}"
+
+    OPT_HAVE_IGC=yes
+    ;;
+  no )
+    ;;
+  * )
+    AC_MSG_ERROR([igc directory must be absolute path: $IGC])
+    ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_IGC], [test "$OPT_HAVE_IGC" = yes])
+
+AC_MSG_RESULT([$IGC])
+
+AC_SUBST([OPT_IGC])
+AC_SUBST([OPT_IGC_IFLAGS])
+AC_SUBST([OPT_IGC_LDFLAGS])
+
+#-------------------------------------------------
+# Option: --with-metrics-discovery=PATH
+#-------------------------------------------------
+
+METRICS_DISCOVERY=no
+OPT_HAVE_METRICS_DISCOVERY=no
+OPT_METRICS_DISCOVERY_IFLAGS=
+OPT_METRICS_DISCOVERY_LDFLAGS=
+
+AC_ARG_WITH([metrics-discovery],
+ [AS_HELP_STRING([--with-metrics-discovery=PATH],
+ [path to metrics-discovery install directory])],
+ [METRICS_DISCOVERY="$withval"],
+ [])
+
+AC_MSG_CHECKING([for metrics-discovery])
+
+case "$METRICS_DISCOVERY" in
+ /* )
+ if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then
+ AC_MSG_ERROR([unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY])
+ fi
+ OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include"
+
+ for lib in $multilib_path ; do
+ if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then
+ OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd"
+ break
+ fi
+ done
+ if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then
+ AC_MSG_ERROR([unable to find libmd.so in: $METRICS_DISCOVERY])
+ fi
+ OPT_HAVE_METRICS_DISCOVERY=yes
+ ;;
+ no )
+ ;;
+ * )
+ AC_MSG_ERROR([metrics-discovery directory must be absolute path: $METRICS_DISCOVERY])
+ ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_METRICS_DISCOVERY], [test "$OPT_HAVE_METRICS_DISCOVERY" = yes])
+
+AC_MSG_RESULT([$METRICS_DISCOVERY])
+
+AC_SUBST([OPT_METRICS_DISCOVERY])
+AC_SUBST([OPT_METRICS_DISCOVERY_IFLAGS])
+AC_SUBST([OPT_METRICS_DISCOVERY_LDFLAGS])
+
+#-------------------------------------------------
+# Option: --with-gtpin=PATH
+#-------------------------------------------------
+
+GTPIN=no
+OPT_HAVE_GTPIN=no
+OPT_GTPIN_IFLAGS=
+OPT_GTPIN_LDFLAGS=
+
+AC_ARG_WITH([gtpin],
+ [AS_HELP_STRING([--with-gtpin=PATH],
+ [path to gtpin install directory])],
+ [GTPIN="$withval"],
+ [])
+
+AC_MSG_CHECKING([for gtpin])
+
+case "$GTPIN" in
+ /* )
+ if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then
+ AC_MSG_ERROR([unable to find gtpin.h in: $GTPIN])
+ fi
+ OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/"
+
+ if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then
+ OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin"
+ fi
+
+ if test "x$OPT_GTPIN_LDFLAGS" = x ; then
+ AC_MSG_ERROR([unable to find libgtpin.so in: $GTPIN])
+ fi
+ OPT_HAVE_GTPIN=yes
+ ;;
+ no )
+ ;;
+ * )
+ AC_MSG_ERROR([gtpin directory must be absolute path: $GTPIN])
+ ;;
+esac
+
+AM_CONDITIONAL([OPT_ENABLE_GTPIN], [test "$OPT_HAVE_GTPIN" = yes])
+
+AC_MSG_RESULT([$GTPIN])
+
+AC_SUBST([OPT_GTPIN])
+AC_SUBST([OPT_GTPIN_IFLAGS])
+AC_SUBST([OPT_GTPIN_LDFLAGS])
+
+
#-------------------------------------------------
# Option: --with-rocm=PATH
#-------------------------------------------------
@@ -5148,6 +5353,8 @@ ROCM=
ROCM_HIP=
ROCM_DBGAPI=
ROCM_TRACER=
+ROCM_PROFILER=
+ROCM_HSA=
AC_ARG_WITH([rocm],
AS_HELP_STRING([--with-rocm=PATH],
@@ -5169,17 +5376,35 @@ AC_ARG_WITH([rocm-tracer],
[path to roctracer-dev install directory]),
[ROCM_TRACER="$withval"], [])
+AC_ARG_WITH([rocm-profiler],
+ AS_HELP_STRING([--with-rocm-profiler=PATH],
+ [path to rocprofiler-dev install directory]),
+ [ROCM_PROFILER="$withval"], [])
+
+AC_ARG_WITH([rocm-hsa],
+ AS_HELP_STRING([--with-rocm-hsa=PATH],
+ [path to hsa-dev install directory]),
+ [ROCM_HSA="$withval"], [])
+
+
+
ROCM_HIP_IFLAGS=
ROCM_DBGAPI_IFLAGS=
ROCM_TRACER_IFLAGS=
+ROCM_PROFILER_IFLAGS=
+ROCM_HSA_IFLAGS=
ROCM_HIP_LD_DIR=
ROCM_DBGAPI_LD_DIR=
ROCM_TRACER_LD_DIR=
+ROCM_PROFILER_LD_DIR=
+ROCM_HSA_LD_DIR=
ROCM_HIP_MESG=
ROCM_DBGAPI_MESG=
ROCM_TRACER_MESG=
+ROCM_PROFILER_MESG=
+ROCM_HSA_MESG=
require_rocm=no
@@ -5235,6 +5460,34 @@ case "$ROCM" in
found=yes
fi
+ # ROCPROFILER
+ if test -f "$ROCM/rocprofiler/include/rocprofiler.h" ; then
+ AC_MSG_NOTICE([found $ROCM/rocprofiler/include/rocprofiler.h])
+ ROCM_PROFILER_IFLAGS="-I$ROCM/rocprofiler/include"
+ ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+ found=yes
+ fi
+ if test -f "$ROCM/rocprofiler/lib/librocprofiler64.so" ; then
+ AC_MSG_NOTICE([found $ROCM/rocprofiler/lib/librocprofiler64.so])
+ ROCM_PROFILER_LD_DIR="$ROCM/rocprofiler/lib"
+ ROCM_PROFILER_MESG="$ROCM/rocprofiler"
+ found=yes
+ fi
+
+ # HSA
+ if test -f "$ROCM/hsa/include/hsa/hsa.h" ; then
+ AC_MSG_NOTICE([found $ROCM/hsa/include/hsa/hsa.h])
+ ROCM_HSA_IFLAGS="-I$ROCM/hsa/include/hsa"
+ ROCM_HSA_MESG="$ROCM/hsa"
+ found=yes
+ fi
+ if test -f "$ROCM/hsa/lib/libhsa-runtime64.so" ; then
+ AC_MSG_NOTICE([found $ROCM/hsa/lib/libhsa-runtime64.so])
+ ROCM_HSA_LD_DIR="$ROCM/hsa/lib"
+ ROCM_HSA_MESG="$ROCM/hsa"
+ found=yes
+ fi
+
# warn if given dir has nothing useful
if test "$found" = no ; then
AC_MSG_WARN([found nothing useful in $ROCM])
@@ -5326,6 +5579,58 @@ case "$ROCM_TRACER" in
;;
esac
+case "$ROCM_PROFILER" in
+ /* )
+ require_rocm=yes
+ found=no
+
+ if test -f "$ROCM_PROFILER/rocprofiler/include/rocprofiler.h" ; then
+ AC_MSG_NOTICE([found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h])
+ ROCM_PROFILER_IFLAGS="-I$ROCM_PROFILER/rocprofiler/include"
+ ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+ found=yes
+ fi
+ if test -f "$ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" ; then
+ AC_MSG_NOTICE([found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so])
+ ROCM_PROFILER_LD_DIR="$ROCM_PROFILER/rocprofiler/lib"
+ ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler"
+ found=yes
+ fi
+ if test "$found" = no ; then
+ AC_MSG_WARN([found nothing useful in $ROCM_PROFILER])
+ fi
+ ;;
+ * )
+ ROCM_PROFILER=no
+ ;;
+esac
+
+case "$ROCM_HSA" in
+ /* )
+ require_rocm=yes
+ found=no
+
+ if test -f "$ROCM_HSA/include/hsa/hsa.h" ; then
+ AC_MSG_NOTICE([found $ROCM_HSA/include/hsa/hsa.h])
+ ROCM_HSA_IFLAGS="-I$ROCM_HSA/include/hsa"
+ ROCM_HSA_MESG="$ROCM_HSA"
+ found=yes
+ fi
+ if test -f "$ROCM_HSA/lib/libhsa-runtime64.so" ; then
+ AC_MSG_NOTICE([found $ROCM_HSA/lib/libhsa-runtime64.so])
+ ROCM_HSA_LD_DIR="$ROCM_HSA/lib"
+ ROCM_HSA_MESG="$ROCM_HSA"
+ found=yes
+ fi
+ if test "$found" = no ; then
+ AC_MSG_WARN([found nothing useful in $ROCM_HSA])
+ fi
+ ;;
+ * )
+ ROCM_HSA=no
+ ;;
+esac
+
#
# Check that we found all the pieces.
#
@@ -5342,6 +5647,12 @@ then
if test "x$ROCM_TRACER_IFLAGS" = x ; then
AC_MSG_ERROR([unable to find roctracer_hip.h])
fi
+ if test "x$ROCM_PROFILER_IFLAGS" = x ; then
+ AC_MSG_ERROR([unable to find rocprofiler.h])
+ fi
+ if test "x$ROCM_HSA_IFLAGS" = x ; then
+ AC_MSG_ERROR([unable to find hsa.h])
+ fi
if test "x$ROCM_HIP_LD_DIR" = x ; then
AC_MSG_ERROR([unable to find libamdhip64.so])
@@ -5352,10 +5663,16 @@ then
if test "x$ROCM_TRACER_LD_DIR" = x ; then
AC_MSG_ERROR([unable to find libroctracer64.so])
fi
+ if test "x$ROCM_PROFILER_LD_DIR" = x ; then
+ AC_MSG_ERROR([unable to find librocprofiler64.so])
+ fi
+ if test "x$ROCM_HSA_LD_DIR" = x ; then
+ AC_MSG_ERROR([unable to find libhsa-runtime64.so])
+ fi
OPT_HAVE_ROCM=yes
- OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS"
- OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}"
+ OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS $ROCM_PROFILER_IFLAGS $ROCM_HSA_IFLAGS"
+ OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}:${ROCM_PROFILER_LD_DIR}:${ROCM_HSA_LD_DIR}"
fi
#
@@ -5396,6 +5713,7 @@ AM_CONDITIONAL([OPT_ENABLE_ROCM], [test "$OPT_HAVE_ROCM" = yes])
AC_SUBST([OPT_ROCM_IFLAGS])
AC_SUBST([OPT_ROCM_LD_LIB_PATH])
+AC_SUBST([ROCM_PROFILER_LD_DIR])
#-------------------------------------------------
@@ -5795,11 +6113,14 @@ AC_MSG_NOTICE([ igc: ${IGC}])
AC_MSG_NOTICE([ gtpin: ${GTPIN}])
AC_MSG_NOTICE([ metrics-discovery: ${METRICS_DISCOVERY}])
AC_MSG_NOTICE([ papi-c-cupti: ${use_papi_c_cupti}])
+AC_MSG_NOTICE([ papi-c-rocm: ${use_papi_c_rocm}])
AC_MSG_NOTICE([ rocm: ${rocm_mesg}])
if test "$OPT_HAVE_ROCM" = yes ; then
AC_MSG_NOTICE([ rocm hip: $ROCM_HIP_MESG])
AC_MSG_NOTICE([ rocm dbgapi: $ROCM_DBGAPI_MESG])
AC_MSG_NOTICE([ rocm tracer: $ROCM_TRACER_MESG])
+ AC_MSG_NOTICE([ rocm profiler:$ROCM_PROFILER_MESG])
+ AC_MSG_NOTICE([ rocm hsa: $ROCM_HSA_MESG])
fi
AC_MSG_NOTICE([ valgrind: ${VALGRIND}])
AC_MSG_NOTICE([ valgrind: annotated: ${OPT_ENABLE_VG_ANNOTATIONS}])
diff --git a/doc/Makefile.in b/doc/Makefile.in
index c8196892ef..a14a5bdf49 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -354,6 +354,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -391,6 +392,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in
index 67cb362dc1..33c0aea877 100644
--- a/doc/man/Makefile.in
+++ b/doc/man/Makefile.in
@@ -297,6 +297,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -334,6 +335,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/doc/manual/Makefile.in b/doc/manual/Makefile.in
index a804f017b1..4ed1616ca5 100644
--- a/doc/manual/Makefile.in
+++ b/doc/manual/Makefile.in
@@ -294,6 +294,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -331,6 +332,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/doc/www/Makefile.in b/doc/www/Makefile.in
index 6a3e8c3784..d8f839f9f3 100644
--- a/doc/www/Makefile.in
+++ b/doc/www/Makefile.in
@@ -294,6 +294,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -331,6 +332,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/doc/www/download.html b/doc/www/download.html
index 3a562bcc4b..e4cd235dcd 100644
--- a/doc/www/download.html
+++ b/doc/www/download.html
@@ -69,28 +69,29 @@
Installing Java 11
Set JAVA_HOME environment variable to the installed directory
-
- Additional Mac OS directions if other Java versions are installed
-
- - Leave all JDKs at their default location, under
/Library/Java/JavaVirtualMachines. The system will pick the highest version by default.
- - To exclude a JDK from being picked by default, rename its
Contents/Info.plist to: Info.plist.disabled.
- That JDK can still be used when $JAVA_HOME points to it, or explicitly referenced in a script or configuration. It will simply be ignored by system's java command.
-
-
-
+
+Additional Mac OS directions if other Java versions are installed:
+
+
+ - Leave all JDKs at their default location, under
/Library/Java/JavaVirtualMachines. The system will pick the highest version by default.
+ - To exclude a JDK from being picked by default, rename its
Contents/Info.plist to: Info.plist.disabled.
+ That JDK can still be used when $JAVA_HOME points to it, or explicitly referenced in a script or configuration. It will simply be ignored by system's java command.
+
+
-Latest Release
+Latest release
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 92e797fb7c..8e67265146 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -293,6 +293,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -330,6 +331,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/Makefile.in b/src/Makefile.in
index 168fc19cc0..d353bb7012 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -323,6 +323,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -360,6 +361,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/extern/Makefile.in b/src/extern/Makefile.in
index 6487c6533c..2950b9ba80 100644
--- a/src/extern/Makefile.in
+++ b/src/extern/Makefile.in
@@ -332,6 +332,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -369,6 +370,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/extern/libunwind/Makefile.in b/src/extern/libunwind/Makefile.in
index 07e5213148..b4b2172587 100644
--- a/src/extern/libunwind/Makefile.in
+++ b/src/extern/libunwind/Makefile.in
@@ -267,6 +267,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -304,6 +305,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/extern/lzma/Makefile.in b/src/extern/lzma/Makefile.in
index c8649688a4..aa24d2ab9b 100644
--- a/src/extern/lzma/Makefile.in
+++ b/src/extern/lzma/Makefile.in
@@ -267,6 +267,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -304,6 +305,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/Makefile.in b/src/lib/Makefile.in
index e3627010c1..4889bb6d06 100644
--- a/src/lib/Makefile.in
+++ b/src/lib/Makefile.in
@@ -336,6 +336,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -373,6 +374,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/analysis/Makefile.in b/src/lib/analysis/Makefile.in
index 56c6d6f61f..68d1dcf0df 100644
--- a/src/lib/analysis/Makefile.in
+++ b/src/lib/analysis/Makefile.in
@@ -371,6 +371,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -408,6 +409,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in
index 6398fc3e67..caa556f5c0 100644
--- a/src/lib/banal/Makefile.in
+++ b/src/lib/banal/Makefile.in
@@ -367,6 +367,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -404,6 +405,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp
index 9c3f32f49a..1be8bd3d28 100644
--- a/src/lib/banal/Struct.cpp
+++ b/src/lib/banal/Struct.cpp
@@ -117,6 +117,7 @@
#include "gpu/ReadCudaCFG.hpp"
+
#ifdef ENABLE_IGC
#include "gpu/ReadIntelCFG.hpp"
#endif // ENABLE_IGC
diff --git a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp
index 0df0d6232b..71c5868a6e 100644
--- a/src/lib/banal/gpu/ReadIntelCFG.cpp
+++ b/src/lib/banal/gpu/ReadIntelCFG.cpp
@@ -45,6 +45,7 @@
//***************************************************************************
+
#ifdef ENABLE_IGC
//******************************************************************************
@@ -300,4 +301,5 @@ readIntelCFG
return false;
}
+
#endif // ENABLE_IGC
diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in
index ea27efd04c..dbfa265f59 100644
--- a/src/lib/binutils/Makefile.in
+++ b/src/lib/binutils/Makefile.in
@@ -387,6 +387,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -424,6 +425,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/binutils/intel/gen_binary_decoder.h b/src/lib/binutils/intel/gen_binary_decoder.h
index 1e6a85dd08..2f2b2f3fde 100644
--- a/src/lib/binutils/intel/gen_binary_decoder.h
+++ b/src/lib/binutils/intel/gen_binary_decoder.h
@@ -29,6 +29,7 @@
#include
+
#ifdef ENABLE_IGC
#include
@@ -74,6 +75,7 @@ class GenBinaryDecoder {
private:
KernelView kernel_view_;
};
+
#endif // ENABLE_IGC
#endif // PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_
diff --git a/src/lib/isa/Makefile.in b/src/lib/isa/Makefile.in
index 3faa223f42..9f0ff691f8 100644
--- a/src/lib/isa/Makefile.in
+++ b/src/lib/isa/Makefile.in
@@ -364,6 +364,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -401,6 +402,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/prof-lean/Makefile.in b/src/lib/prof-lean/Makefile.in
index 72deb89952..dc73f044c1 100644
--- a/src/lib/prof-lean/Makefile.in
+++ b/src/lib/prof-lean/Makefile.in
@@ -369,6 +369,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -406,6 +407,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/prof-lean/crypto-hash.h b/src/lib/prof-lean/crypto-hash.h
index 7a18f59b82..d212583670 100644
--- a/src/lib/prof-lean/crypto-hash.h
+++ b/src/lib/prof-lean/crypto-hash.h
@@ -159,6 +159,10 @@ crypto_hash_self_test
int verbose
);
+#if defined(__cplusplus)
+}
+#endif
+
#endif
#if defined(__cplusplus)
diff --git a/src/lib/prof/Makefile.in b/src/lib/prof/Makefile.in
index 16af4067ad..33a9fdb0d9 100644
--- a/src/lib/prof/Makefile.in
+++ b/src/lib/prof/Makefile.in
@@ -375,6 +375,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -412,6 +413,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/profile/Makefile.in b/src/lib/profile/Makefile.in
index a1f68e6195..9c8ac842a6 100644
--- a/src/lib/profile/Makefile.in
+++ b/src/lib/profile/Makefile.in
@@ -406,6 +406,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -443,6 +444,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/profxml/Makefile.in b/src/lib/profxml/Makefile.in
index d7285ecb6c..baef3c5eeb 100644
--- a/src/lib/profxml/Makefile.in
+++ b/src/lib/profxml/Makefile.in
@@ -369,6 +369,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -406,6 +407,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/stubs-gcc_s/Makefile.in b/src/lib/stubs-gcc_s/Makefile.in
index 9c780b3dca..98f067d869 100644
--- a/src/lib/stubs-gcc_s/Makefile.in
+++ b/src/lib/stubs-gcc_s/Makefile.in
@@ -347,6 +347,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -384,6 +385,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/support-lean/Makefile.in b/src/lib/support-lean/Makefile.in
index 35ffa8c0a6..2636565045 100644
--- a/src/lib/support-lean/Makefile.in
+++ b/src/lib/support-lean/Makefile.in
@@ -353,6 +353,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -390,6 +391,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/support/Makefile.in b/src/lib/support/Makefile.in
index d4f1edf599..10b2a09865 100644
--- a/src/lib/support/Makefile.in
+++ b/src/lib/support/Makefile.in
@@ -381,6 +381,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -418,6 +419,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/lib/xml/Makefile.in b/src/lib/xml/Makefile.in
index 50c80235a4..c5cd173841 100644
--- a/src/lib/xml/Makefile.in
+++ b/src/lib/xml/Makefile.in
@@ -366,6 +366,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -403,6 +404,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/Makefile.in b/src/tool/Makefile.in
index ae0baf8570..465bddb363 100644
--- a/src/tool/Makefile.in
+++ b/src/tool/Makefile.in
@@ -341,6 +341,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -378,6 +379,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcfnbounds/Makefile.in b/src/tool/hpcfnbounds/Makefile.in
index 04543be278..caccfb848d 100644
--- a/src/tool/hpcfnbounds/Makefile.in
+++ b/src/tool/hpcfnbounds/Makefile.in
@@ -450,6 +450,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -487,6 +488,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcfnbounds2/Makefile.in b/src/tool/hpcfnbounds2/Makefile.in
index 3b727fb413..9828167843 100644
--- a/src/tool/hpcfnbounds2/Makefile.in
+++ b/src/tool/hpcfnbounds2/Makefile.in
@@ -348,6 +348,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -385,6 +386,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpclump/Makefile.in b/src/tool/hpclump/Makefile.in
index 8c2f7b688b..fff69c91ee 100644
--- a/src/tool/hpclump/Makefile.in
+++ b/src/tool/hpclump/Makefile.in
@@ -382,6 +382,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -419,6 +420,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcprof-flat/Makefile.in b/src/tool/hpcprof-flat/Makefile.in
index fbc6ce1017..8fdc2ad6c3 100644
--- a/src/tool/hpcprof-flat/Makefile.in
+++ b/src/tool/hpcprof-flat/Makefile.in
@@ -416,6 +416,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -453,6 +454,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcprof-mpi/Makefile.in b/src/tool/hpcprof-mpi/Makefile.in
index a9d6ecaa97..5a263b3717 100644
--- a/src/tool/hpcprof-mpi/Makefile.in
+++ b/src/tool/hpcprof-mpi/Makefile.in
@@ -416,6 +416,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -453,6 +454,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcprof/Makefile.in b/src/tool/hpcprof/Makefile.in
index f675a9e44f..3a16a2826d 100644
--- a/src/tool/hpcprof/Makefile.in
+++ b/src/tool/hpcprof/Makefile.in
@@ -414,6 +414,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -451,6 +452,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcprof2-mpi/Makefile.in b/src/tool/hpcprof2-mpi/Makefile.in
index 96836cd011..5d7363e3f3 100644
--- a/src/tool/hpcprof2-mpi/Makefile.in
+++ b/src/tool/hpcprof2-mpi/Makefile.in
@@ -359,6 +359,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -396,6 +397,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcprof2/Makefile.in b/src/tool/hpcprof2/Makefile.in
index b40dfeef89..40dd29a49a 100644
--- a/src/tool/hpcprof2/Makefile.in
+++ b/src/tool/hpcprof2/Makefile.in
@@ -351,6 +351,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -388,6 +389,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcproftt/Makefile.in b/src/tool/hpcproftt/Makefile.in
index 4c305011e7..ca04ad8c53 100644
--- a/src/tool/hpcproftt/Makefile.in
+++ b/src/tool/hpcproftt/Makefile.in
@@ -417,6 +417,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -454,6 +455,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcrun-flat/Makefile.in b/src/tool/hpcrun-flat/Makefile.in
index 10f0c788b8..cc063d5a5e 100644
--- a/src/tool/hpcrun-flat/Makefile.in
+++ b/src/tool/hpcrun-flat/Makefile.in
@@ -411,6 +411,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -448,6 +449,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am
index a30590f819..7c2be6110d 100644
--- a/src/tool/hpcrun/Makefile.am
+++ b/src/tool/hpcrun/Makefile.am
@@ -163,6 +163,9 @@ XED2_INC = @XED2_INC@
XED2_HPCRUN_LIBS = @XED2_HPCRUN_LIBS@
XED2_HPCLINK_LIBS = @XED2_HPCLINK_LIBS@
CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@
+
+ROCM_INC_FLGS = @OPT_ROCM_IFLAGS@
+
OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
CUPTI_LD_FLGS = @OPT_CUPTI_LDFLAGS@
CUPTI_BASE = @OPT_CUPTI@
@@ -296,6 +299,7 @@ MY_BASE_FILES = \
sample_event.c \
sample_prob.c \
sample_sources_all.c \
+ tool_state.c \
sample-sources/blame-shift/blame-shift.c \
sample-sources/blame-shift/blame-map.c \
sample-sources/blame-shift/directed.c \
@@ -325,6 +329,7 @@ MY_BASE_FILES = \
control-knob.c \
control-knob.h \
device-finalizers.c \
+ gpu-monitors.c \
device-initializers.c \
module-ignore-map.c \
threadmgr.c \
@@ -387,6 +392,10 @@ MY_BASE_FILES = \
gpu/gpu-trace-channel-set.c \
gpu/gpu-trace-demultiplexer.c \
\
+ gpu/ompt/ompt-gpu-api.c \
+ gpu/ompt/ompt-activity-translate.c \
+ sample-sources/openmp-target.c \
+ \
ompt/ompt-callstack.c \
ompt/ompt-defer.c \
ompt/ompt-device.c \
@@ -503,6 +512,11 @@ if OPT_PAPI_CUPTI
MY_PAPI_FILES += sample-sources/papi-c-cupti.c
endif
+if OPT_PAPI_ROCM
+ MY_PAPI_FILES += sample-sources/papi-c-rocm.c
+endif
+
+
if OPT_ENABLE_CUPTI
MY_CUPTI_FILES = sample-sources/nvidia.c \
gpu/nvidia/cubin-hash-map.c \
@@ -542,9 +556,11 @@ endif
if OPT_ENABLE_ROCM
MY_ROCM_FILES =\
sample-sources/amd.c \
+ sample-sources/amd-rocprofiler.c \
+ gpu/amd/hip-api.c \
gpu/amd/roctracer-activity-translate.c \
gpu/amd/roctracer-api.c \
- gpu/amd/rocm-debug-api.c \
+ gpu/amd/rocprofiler-api.c \
gpu/amd/rocm-binary-processing.c
endif
@@ -590,6 +606,7 @@ MY_INCLUDE_DIRS = \
-I$(HPCFNBOUNDS_INC) \
$(OPT_CUDA_IFLAGS) \
$(OPT_CUPTI_IFLAGS) \
+ $(ROCM_INC_FLGS) \
-I$(LIBELF_INC) \
-I$(LIBMONITOR_INC) \
$(GOTCHA_IFLAGS) \
@@ -991,11 +1008,16 @@ libhpcrun_la_CPPFLAGS += $(CUPTI_INC_FLGS)
MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_CUPTI
endif
+if OPT_PAPI_ROCM
+libhpcrun_la_CPPFLAGS += $(ROCM_INC_FLGS)
+MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_ROCM
+endif
+
if OPT_PAPI_STATIC
libhpcrun_o_SOURCES += $(MY_PAPI_FILES)
libhpcrun_o_CPPFLAGS += $(PAPI_INC_FLGS)
libhpcrun_o_LDADD += $(OPT_PAPI_LIBS_STAT)
-
+
MY_CPP_DEFINES += -DHPCRUN_SS_PAPI
endif
@@ -1038,11 +1060,10 @@ if OPT_ENABLE_CUDA
libhpcrun_o_SOURCES += $(MY_CUDA_FILES)
endif
-
if OPT_ENABLE_ROCM
libhpcrun_la_SOURCES += $(MY_ROCM_FILES)
libhpcrun_la_CPPFLAGS += -DENABLE_ROCM
- libhpcrun_la_CFLAGS += $(OPT_ROCM_IFLAGS)
+ libhpcrun_la_CFLAGS += $(ROCM_INC_FLGS)
MY_CPP_DEFINES += -DHPCRUN_SS_AMD
endif
@@ -1177,7 +1198,7 @@ endif
# Don't use LDFLAGS for static case.
MONITOR_NAMES = -G 'monitor_*'
-HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*'
+HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp' -G 'OnLoad' -G 'OnUnloadTool'
MISC_NAMES = -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool
OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*'
diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in
index 13aaf4d908..afb6c56216 100644
--- a/src/tool/hpcrun/Makefile.in
+++ b/src/tool/hpcrun/Makefile.in
@@ -168,25 +168,25 @@ host_triplet = @host@
@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_14 = sample-sources/perf/kernel_blocking.c
@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_15 = sample-sources/perf/kernel_blocking_stub.c
@OPT_PAPI_CUPTI_TRUE@am__append_16 = sample-sources/papi-c-cupti.c
-@OPT_ENABLE_OPENCL_TRUE@am__append_17 = libhpcrun_opencl.la
-@OPT_ENABLE_LEVEL0_TRUE@am__append_18 = libhpcrun_level0.la
+@OPT_PAPI_ROCM_TRUE@am__append_17 = sample-sources/papi-c-rocm.c
+@OPT_ENABLE_OPENCL_TRUE@am__append_18 = libhpcrun_opencl.la
+@OPT_ENABLE_LEVEL0_TRUE@am__append_19 = libhpcrun_level0.la
#
# BG/Q backend requires special treatment to avoid deadlocks
#
-@OPT_BGQ_BACKEND_TRUE@am__append_19 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
-@OPT_BGQ_BACKEND_TRUE@am__append_20 = -I$(srcdir)/utilities/bgq-cnk
+@OPT_BGQ_BACKEND_TRUE@am__append_20 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD
@OPT_BGQ_BACKEND_TRUE@am__append_21 = -I$(srcdir)/utilities/bgq-cnk
-@OPT_ENABLE_MPI_WRAP_TRUE@am__append_22 = mpi-overrides.c
+@OPT_BGQ_BACKEND_TRUE@am__append_22 = -I$(srcdir)/utilities/bgq-cnk
@OPT_ENABLE_MPI_WRAP_TRUE@am__append_23 = mpi-overrides.c
+@OPT_ENABLE_MPI_WRAP_TRUE@am__append_24 = mpi-overrides.c
#-----------------------------------------------------------
# whirled peas
#-----------------------------------------------------------
-@HOST_OS_LINUX_TRUE@am__append_24 = $(MY_LINUX_DYNAMIC_FILES)
-@HOST_CPU_MIPS_TRUE@am__append_25 = $(MY_MIPS_FILES)
+@HOST_OS_LINUX_TRUE@am__append_25 = $(MY_LINUX_DYNAMIC_FILES)
@HOST_CPU_MIPS_TRUE@am__append_26 = $(MY_MIPS_FILES)
-@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_INCLUDE_DIRS)
+@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_FILES)
@HOST_CPU_MIPS_TRUE@am__append_28 = $(MY_MIPS_INCLUDE_DIRS)
@HOST_CPU_MIPS_TRUE@am__append_29 = $(MY_MIPS_INCLUDE_DIRS)
@HOST_CPU_MIPS_TRUE@am__append_30 = $(MY_MIPS_INCLUDE_DIRS)
@@ -197,15 +197,15 @@ host_triplet = @host@
@HOST_CPU_MIPS_TRUE@am__append_35 = $(MY_MIPS_INCLUDE_DIRS)
@HOST_CPU_MIPS_TRUE@am__append_36 = $(MY_MIPS_INCLUDE_DIRS)
@HOST_CPU_MIPS_TRUE@am__append_37 = $(MY_MIPS_INCLUDE_DIRS)
+@HOST_CPU_MIPS_TRUE@am__append_38 = $(MY_MIPS_INCLUDE_DIRS)
# Note: setting CCASFLAGS here is a no-op hack with the side effect of
# prefixing the tramp.s file names so they will be compiled separately
# for .o and .so targets. CFLAGS does this for the .c files, but
# CFLAGS doesn't apply to .s files. See the automake docs section
# 8.3.9.2, Objects created with both libtool and without.
-@HOST_CPU_PPC_TRUE@am__append_38 = $(MY_PPC_FILES)
@HOST_CPU_PPC_TRUE@am__append_39 = $(MY_PPC_FILES)
-@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_INCLUDE_DIRS)
+@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_FILES)
@HOST_CPU_PPC_TRUE@am__append_41 = $(MY_PPC_INCLUDE_DIRS)
@HOST_CPU_PPC_TRUE@am__append_42 = $(MY_PPC_INCLUDE_DIRS)
@HOST_CPU_PPC_TRUE@am__append_43 = $(MY_PPC_INCLUDE_DIRS)
@@ -218,13 +218,13 @@ host_triplet = @host@
@HOST_CPU_PPC_TRUE@am__append_50 = $(MY_PPC_INCLUDE_DIRS)
@HOST_CPU_PPC_TRUE@am__append_51 = $(MY_PPC_INCLUDE_DIRS)
@HOST_CPU_PPC_TRUE@am__append_52 = $(MY_PPC_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_53 = $(MY_X86_FILES)
+@HOST_CPU_PPC_TRUE@am__append_53 = $(MY_PPC_INCLUDE_DIRS)
@HOST_CPU_X86_FAMILY_TRUE@am__append_54 = $(MY_X86_FILES)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_FILES)
@HOST_CPU_X86_FAMILY_TRUE@am__append_56 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(XED2_HPCRUN_LIBS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCLINK_LIBS)
-@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(MY_X86_INCLUDE_DIRS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCRUN_LIBS)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(XED2_HPCLINK_LIBS)
@HOST_CPU_X86_FAMILY_TRUE@am__append_60 = $(MY_X86_INCLUDE_DIRS)
@HOST_CPU_X86_FAMILY_TRUE@am__append_61 = $(MY_X86_INCLUDE_DIRS)
@HOST_CPU_X86_FAMILY_TRUE@am__append_62 = $(MY_X86_INCLUDE_DIRS)
@@ -236,9 +236,9 @@ host_triplet = @host@
@HOST_CPU_X86_FAMILY_TRUE@am__append_68 = $(MY_X86_INCLUDE_DIRS)
@HOST_CPU_X86_FAMILY_TRUE@am__append_69 = $(MY_X86_INCLUDE_DIRS)
@HOST_CPU_X86_FAMILY_TRUE@am__append_70 = $(MY_X86_INCLUDE_DIRS)
-@HOST_CPU_IA64_TRUE@am__append_71 = $(MY_IA64_FILES)
+@HOST_CPU_X86_FAMILY_TRUE@am__append_71 = $(MY_X86_INCLUDE_DIRS)
@HOST_CPU_IA64_TRUE@am__append_72 = $(MY_IA64_FILES)
-@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_INCLUDE_DIRS)
+@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_FILES)
@HOST_CPU_IA64_TRUE@am__append_74 = $(MY_IA64_INCLUDE_DIRS)
@HOST_CPU_IA64_TRUE@am__append_75 = $(MY_IA64_INCLUDE_DIRS)
@HOST_CPU_IA64_TRUE@am__append_76 = $(MY_IA64_INCLUDE_DIRS)
@@ -249,9 +249,9 @@ host_triplet = @host@
@HOST_CPU_IA64_TRUE@am__append_81 = $(MY_IA64_INCLUDE_DIRS)
@HOST_CPU_IA64_TRUE@am__append_82 = $(MY_IA64_INCLUDE_DIRS)
@HOST_CPU_IA64_TRUE@am__append_83 = $(MY_IA64_INCLUDE_DIRS)
-@HOST_CPU_AARCH64_TRUE@am__append_84 = $(MY_AARCH64_FILES)
+@HOST_CPU_IA64_TRUE@am__append_84 = $(MY_IA64_INCLUDE_DIRS)
@HOST_CPU_AARCH64_TRUE@am__append_85 = $(MY_AARCH64_FILES)
-@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_INCLUDE_DIRS)
+@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_FILES)
@HOST_CPU_AARCH64_TRUE@am__append_87 = $(MY_AARCH64_INCLUDE_DIRS)
@HOST_CPU_AARCH64_TRUE@am__append_88 = $(MY_AARCH64_INCLUDE_DIRS)
@HOST_CPU_AARCH64_TRUE@am__append_89 = $(MY_AARCH64_INCLUDE_DIRS)
@@ -264,49 +264,52 @@ host_triplet = @host@
@HOST_CPU_AARCH64_TRUE@am__append_96 = $(MY_AARCH64_INCLUDE_DIRS)
@HOST_CPU_AARCH64_TRUE@am__append_97 = $(MY_AARCH64_INCLUDE_DIRS)
@HOST_CPU_AARCH64_TRUE@am__append_98 = $(MY_AARCH64_INCLUDE_DIRS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_99 = $(MY_PAPI_FILES)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(PAPI_INC_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_LD_FLGS)
-@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = -DHPCRUN_SS_PAPI
-@OPT_ENABLE_CUPTI_TRUE@am__append_103 = $(MY_CUPTI_FILES)
+@HOST_CPU_AARCH64_TRUE@am__append_99 = $(MY_AARCH64_INCLUDE_DIRS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(MY_PAPI_FILES)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_INC_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = $(PAPI_LD_FLGS)
+@OPT_PAPI_DYNAMIC_TRUE@am__append_103 = -DHPCRUN_SS_PAPI
@OPT_ENABLE_CUPTI_TRUE@am__append_104 = $(MY_CUPTI_FILES)
-@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(CUPTI_INC_FLGS)
-@OPT_ENABLE_CUPTI_TRUE@am__append_106 = -DHPCRUN_SS_NVIDIA
-@OPT_PAPI_CUPTI_TRUE@am__append_107 = $(CUPTI_INC_FLGS)
-@OPT_PAPI_CUPTI_TRUE@am__append_108 = -DHPCRUN_SS_PAPI_C_CUPTI
-@OPT_PAPI_STATIC_TRUE@am__append_109 = $(MY_PAPI_FILES)
-@OPT_PAPI_STATIC_TRUE@am__append_110 = $(PAPI_INC_FLGS)
-@OPT_PAPI_STATIC_TRUE@am__append_111 = $(OPT_PAPI_LIBS_STAT)
-@OPT_PAPI_STATIC_TRUE@am__append_112 = -DHPCRUN_SS_PAPI
-@OPT_ENABLE_UPC_TRUE@am__append_113 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_114 = $(MY_UPC_FILES)
-@OPT_ENABLE_UPC_TRUE@am__append_115 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_116 = $(OPT_UPC_IFLAGS)
-@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_LDFLAGS)
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_118 = -DLUSH_PTHREADS
-@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_119 = -DLUSH_PTHREADS
-@OPT_ENABLE_CUDA_TRUE@am__append_120 = $(MY_CUDA_FILES)
-@OPT_ENABLE_CUDA_TRUE@am__append_121 = -DENABLE_CUDA
-@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(MY_CUPTI_FILES)
+@OPT_ENABLE_CUPTI_TRUE@am__append_106 = $(CUPTI_INC_FLGS)
+@OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA
+@OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS)
+@OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI
+@OPT_PAPI_ROCM_TRUE@am__append_110 = $(ROCM_INC_FLGS)
+@OPT_PAPI_ROCM_TRUE@am__append_111 = -DHPCRUN_SS_PAPI_C_ROCM
+@OPT_PAPI_STATIC_TRUE@am__append_112 = $(MY_PAPI_FILES)
+@OPT_PAPI_STATIC_TRUE@am__append_113 = $(PAPI_INC_FLGS)
+@OPT_PAPI_STATIC_TRUE@am__append_114 = $(OPT_PAPI_LIBS_STAT)
+@OPT_PAPI_STATIC_TRUE@am__append_115 = -DHPCRUN_SS_PAPI
+@OPT_ENABLE_UPC_TRUE@am__append_116 = $(MY_UPC_FILES)
+@OPT_ENABLE_UPC_TRUE@am__append_117 = $(MY_UPC_FILES)
+@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_119 = $(OPT_UPC_IFLAGS)
+@OPT_ENABLE_UPC_TRUE@am__append_120 = $(OPT_UPC_LDFLAGS)
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_121 = -DLUSH_PTHREADS
+@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_122 = -DLUSH_PTHREADS
@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(MY_CUDA_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_124 = $(MY_ROCM_FILES)
-@OPT_ENABLE_ROCM_TRUE@am__append_125 = -DENABLE_ROCM
-@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(OPT_ROCM_IFLAGS)
-@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DHPCRUN_SS_AMD
-@OPT_ENABLE_LEVEL0_TRUE@am__append_128 = $(MY_LEVEL0_FILES)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = -DENABLE_LEVEL0
-@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(OPT_LEVEL0_IFLAGS)
-@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DHPCRUN_SS_LEVEL0
-@OPT_ENABLE_OPENCL_TRUE@am__append_132 = $(MY_OPENCL_FILES)
-@OPT_ENABLE_OPENCL_TRUE@am__append_133 = -DENABLE_OPENCL
-@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(OPT_OPENCL_IFLAGS)
-@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DHPCRUN_SS_OPENCL
-@OPT_ENABLE_GTPIN_TRUE@am__append_136 = $(MY_GTPIN_FILES)
-@OPT_ENABLE_GTPIN_TRUE@am__append_137 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
-@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(OPT_GTPIN_IFLAGS)
-@OPT_ENABLE_GTPIN_TRUE@am__append_139 = -DHPCRUN_SS_GTPIN
-@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_140 = libagent-cilk.la
-@OPT_ENABLE_LUSH_TRUE@am__append_141 = libagent-pthread.la \
+@OPT_ENABLE_CUDA_TRUE@am__append_124 = -DENABLE_CUDA
+@OPT_ENABLE_CUDA_TRUE@am__append_125 = $(OPT_CUDA_IFLAGS)
+@OPT_ENABLE_CUDA_TRUE@am__append_126 = $(MY_CUDA_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(MY_ROCM_FILES)
+@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DENABLE_ROCM
+@OPT_ENABLE_ROCM_TRUE@am__append_129 = $(ROCM_INC_FLGS)
+@OPT_ENABLE_ROCM_TRUE@am__append_130 = -DHPCRUN_SS_AMD
+@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(MY_LEVEL0_FILES)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DENABLE_LEVEL0
+@OPT_ENABLE_LEVEL0_TRUE@am__append_133 = $(OPT_LEVEL0_IFLAGS)
+@OPT_ENABLE_LEVEL0_TRUE@am__append_134 = -DHPCRUN_SS_LEVEL0
+@OPT_ENABLE_OPENCL_TRUE@am__append_135 = $(MY_OPENCL_FILES)
+@OPT_ENABLE_OPENCL_TRUE@am__append_136 = -DENABLE_OPENCL
+@OPT_ENABLE_OPENCL_TRUE@am__append_137 = $(OPT_OPENCL_IFLAGS)
+@OPT_ENABLE_OPENCL_TRUE@am__append_138 = -DHPCRUN_SS_OPENCL
+@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(MY_GTPIN_FILES)
+@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR)
+@OPT_ENABLE_GTPIN_TRUE@am__append_141 = $(OPT_GTPIN_IFLAGS)
+@OPT_ENABLE_GTPIN_TRUE@am__append_142 = -DHPCRUN_SS_GTPIN
+@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_143 = libagent-cilk.la
+@OPT_ENABLE_LUSH_TRUE@am__append_144 = libagent-pthread.la \
@OPT_ENABLE_LUSH_TRUE@ libagent-tbb.la
subdir = src/tool/hpcrun
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -455,7 +458,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
cct_backtrace_finalize.c env.c epoch.c files.c \
handling_sample.c hpcrun-initializers.c hpcrun_options.c \
hpcrun_stats.c loadmap.c metrics.c name.c rank.c \
- sample_event.c sample_prob.c sample_sources_all.c \
+ sample_event.c sample_prob.c sample_sources_all.c tool_state.c \
sample-sources/blame-shift/blame-shift.c \
sample-sources/blame-shift/blame-map.c \
sample-sources/blame-shift/directed.c \
@@ -470,18 +473,18 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
sample_sources_registered.c sample-sources/sample-filters.c \
segv_handler.c start-stop.c term_handler.c thread_data.c \
thread_use.c thread_finalize.c control-knob.c control-knob.h \
- device-finalizers.c device-initializers.c module-ignore-map.c \
- threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \
- cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \
- lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \
- lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \
- lush/lush-pthread.c lush/lush-support-rt.h \
- lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \
- lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \
- memory/mmap.c messages/debug-flag.c messages/messages-sync.c \
- messages/messages-async.c messages/fmt.c gpu/gpu-activity.c \
- gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
- gpu/gpu-application-thread-api.c \
+ device-finalizers.c gpu-monitors.c device-initializers.c \
+ module-ignore-map.c threadmgr.c trace.c weak.c write_data.c \
+ cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \
+ cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \
+ lush/lush-backtrace.c lush/lush.h lush/lush.c \
+ lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \
+ lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \
+ lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \
+ memory/mem.c memory/mmap.c messages/debug-flag.c \
+ messages/messages-sync.c messages/messages-async.c \
+ messages/fmt.c gpu/gpu-activity.c gpu/gpu-activity-channel.c \
+ gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -495,12 +498,14 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
- ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
- ompt/ompt-defer-write.c ompt/ompt-interface.c \
- ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
- ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
- extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \
- syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \
+ gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \
+ sample-sources/openmp-target.c ompt/ompt-callstack.c \
+ ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
+ ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
+ ompt/ompt-region-debug.c ompt/ompt-device-map.c \
+ ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \
+ extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \
+ syscalls/select.c syscalls/sysv_signal.c \
utilities/executable-path.h utilities/executable-path.c \
utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
utilities/ip-normalized.h utilities/ip-normalized.c \
@@ -531,15 +536,17 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \
trampoline/aarch64/aarch64-tramp.c \
utilities/arch/libunwind/libunwind-context-pc.c \
sample-sources/papi.c sample-sources/papi-c-cupti.c \
- sample-sources/papi-c.c sample-sources/papi-c-extended-info.c \
- sample-sources/nvidia.c gpu/nvidia/cubin-hash-map.c \
- gpu/nvidia/cubin-id-map.c gpu/nvidia/cubin-symbols.c \
- gpu/nvidia/cuda-api.c gpu/nvidia/cuda-device-map.c \
+ sample-sources/papi-c-rocm.c sample-sources/papi-c.c \
+ sample-sources/papi-c-extended-info.c sample-sources/nvidia.c \
+ gpu/nvidia/cubin-hash-map.c gpu/nvidia/cubin-id-map.c \
+ gpu/nvidia/cubin-symbols.c gpu/nvidia/cuda-api.c \
+ gpu/nvidia/cuda-device-map.c \
gpu/nvidia/cupti-activity-translate.c \
gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \
- sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \
- gpu/amd/roctracer-api.c gpu/amd/rocm-debug-api.c \
+ sample-sources/amd.c sample-sources/amd-rocprofiler.c \
+ gpu/amd/hip-api.c gpu/amd/roctracer-activity-translate.c \
+ gpu/amd/roctracer-api.c gpu/amd/rocprofiler-api.c \
gpu/amd/rocm-binary-processing.c sample-sources/level0.c \
gpu/level0/level0-api.c \
gpu/level0/level0-command-list-context-map.c \
@@ -604,7 +611,7 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
libhpcrun_la-loadmap.lo libhpcrun_la-metrics.lo \
libhpcrun_la-name.lo libhpcrun_la-rank.lo \
libhpcrun_la-sample_event.lo libhpcrun_la-sample_prob.lo \
- libhpcrun_la-sample_sources_all.lo \
+ libhpcrun_la-sample_sources_all.lo libhpcrun_la-tool_state.lo \
sample-sources/blame-shift/libhpcrun_la-blame-shift.lo \
sample-sources/blame-shift/libhpcrun_la-blame-map.lo \
sample-sources/blame-shift/libhpcrun_la-directed.lo \
@@ -629,6 +636,7 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
libhpcrun_la-term_handler.lo libhpcrun_la-thread_data.lo \
libhpcrun_la-thread_use.lo libhpcrun_la-thread_finalize.lo \
libhpcrun_la-control-knob.lo libhpcrun_la-device-finalizers.lo \
+ libhpcrun_la-gpu-monitors.lo \
libhpcrun_la-device-initializers.lo \
libhpcrun_la-module-ignore-map.lo libhpcrun_la-threadmgr.lo \
libhpcrun_la-trace.lo libhpcrun_la-weak.lo \
@@ -675,6 +683,9 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \
gpu/libhpcrun_la-gpu-trace-item.lo \
gpu/libhpcrun_la-gpu-trace-channel-set.lo \
gpu/libhpcrun_la-gpu-trace-demultiplexer.lo \
+ gpu/ompt/libhpcrun_la-ompt-gpu-api.lo \
+ gpu/ompt/libhpcrun_la-ompt-activity-translate.lo \
+ sample-sources/libhpcrun_la-openmp-target.lo \
ompt/libhpcrun_la-ompt-callstack.lo \
ompt/libhpcrun_la-ompt-defer.lo \
ompt/libhpcrun_la-ompt-device.lo \
@@ -722,14 +733,15 @@ am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
utilities/arch/libunwind/libhpcrun_la-libunwind-context-pc.lo
@HOST_CPU_AARCH64_TRUE@am__objects_27 = $(am__objects_26)
@OPT_PAPI_CUPTI_TRUE@am__objects_28 = sample-sources/libhpcrun_la-papi-c-cupti.lo
-@OPT_PAPI_COMPONENT_FALSE@am__objects_29 = \
+@OPT_PAPI_ROCM_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c-rocm.lo
+@OPT_PAPI_COMPONENT_FALSE@am__objects_30 = \
@OPT_PAPI_COMPONENT_FALSE@ sample-sources/libhpcrun_la-papi.lo \
-@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_28)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c.lo \
+@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_28) $(am__objects_29)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_30 = sample-sources/libhpcrun_la-papi-c.lo \
@OPT_PAPI_COMPONENT_TRUE@ sample-sources/libhpcrun_la-papi-c-extended-info.lo \
-@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_28)
-@OPT_PAPI_DYNAMIC_TRUE@am__objects_30 = $(am__objects_29)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_31 = \
+@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_28) $(am__objects_29)
+@OPT_PAPI_DYNAMIC_TRUE@am__objects_31 = $(am__objects_30)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = \
@OPT_ENABLE_CUPTI_TRUE@ sample-sources/libhpcrun_la-nvidia.lo \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cubin-hash-map.lo \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cubin-id-map.lo \
@@ -740,18 +752,20 @@ am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cupti-analysis.lo \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cupti-api.lo \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cupti-gpu-api.lo
-@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = $(am__objects_31)
-am__objects_33 = sample-sources/libhpcrun_la-upc.lo
-@OPT_ENABLE_UPC_TRUE@am__objects_34 = $(am__objects_33)
-am__objects_35 =
-@OPT_ENABLE_ROCM_TRUE@am__objects_36 = \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_33 = $(am__objects_32)
+am__objects_34 = sample-sources/libhpcrun_la-upc.lo
+@OPT_ENABLE_UPC_TRUE@am__objects_35 = $(am__objects_34)
+am__objects_36 =
+@OPT_ENABLE_ROCM_TRUE@am__objects_37 = \
@OPT_ENABLE_ROCM_TRUE@ sample-sources/libhpcrun_la-amd.lo \
+@OPT_ENABLE_ROCM_TRUE@ sample-sources/libhpcrun_la-amd-rocprofiler.lo \
+@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-hip-api.lo \
@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \
@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-roctracer-api.lo \
-@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-rocm-debug-api.lo \
+@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-rocprofiler-api.lo \
@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-rocm-binary-processing.lo
-@OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36)
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_38 = \
+@OPT_ENABLE_ROCM_TRUE@am__objects_38 = $(am__objects_37)
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = \
@OPT_ENABLE_LEVEL0_TRUE@ sample-sources/libhpcrun_la-level0.lo \
@OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-api.lo \
@OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-command-list-context-map.lo \
@@ -760,8 +774,8 @@ am__objects_35 =
@OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-data-node.lo \
@OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-event-map.lo \
@OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-handle-map.lo
-@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = $(am__objects_38)
-@OPT_ENABLE_OPENCL_TRUE@am__objects_40 = \
+@OPT_ENABLE_LEVEL0_TRUE@am__objects_40 = $(am__objects_39)
+@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = \
@OPT_ENABLE_OPENCL_TRUE@ sample-sources/libhpcrun_la-opencl.lo \
@OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-api.lo \
@OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \
@@ -769,28 +783,28 @@ am__objects_35 =
@OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-h2d-map.lo \
@OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-queue-map.lo \
@OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-context-map.lo
-@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40)
-@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
+@OPT_ENABLE_OPENCL_TRUE@am__objects_42 = $(am__objects_41)
+@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \
@OPT_ENABLE_GTPIN_TRUE@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \
@OPT_ENABLE_GTPIN_TRUE@ gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo
-@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42)
-am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \
+@OPT_ENABLE_GTPIN_TRUE@am__objects_44 = $(am__objects_43)
+am__objects_45 = unwind/common/libhpcrun_la-backtrace.lo \
unwind/common/libhpcrun_la-unw-throw.lo
-am__objects_45 = $(am__objects_44) \
+am__objects_46 = $(am__objects_45) \
unwind/common/libhpcrun_la-binarytree_uwi.lo \
unwind/common/libhpcrun_la-interval_t.lo \
unwind/common/libhpcrun_la-libunw_intervals.lo \
unwind/common/libhpcrun_la-stack_troll.lo \
unwind/common/libhpcrun_la-uw_hash.lo \
unwind/common/libhpcrun_la-uw_recipe_map.lo
-am__objects_46 = $(am__objects_45) \
+am__objects_47 = $(am__objects_46) \
unwind/generic-libunwind/libhpcrun_la-libunw-unwind.lo \
unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_47 = $(am__objects_45) \
+am__objects_48 = $(am__objects_46) \
unwind/ppc64/libhpcrun_la-ppc64-unwind.lo \
unwind/ppc64/libhpcrun_la-ppc64-unwind-interval.lo \
unwind/common/libhpcrun_la-default_validation_summary.lo
-am__objects_48 = $(am__objects_45) \
+am__objects_49 = $(am__objects_46) \
unwind/x86-family/libhpcrun_la-x86-all.lo \
unwind/x86-family/libhpcrun_la-amd-xop.lo \
unwind/x86-family/libhpcrun_la-x86-cold-path.lo \
@@ -810,15 +824,15 @@ am__objects_48 = $(am__objects_45) \
unwind/x86-family/manual-intervals/libhpcrun_la-x86-32bit-icc-variant.lo \
unwind/x86-family/manual-intervals/libhpcrun_la-x86-fail-intervals.lo \
unwind/x86-family/manual-intervals/libhpcrun_la-x86-pgi-mp_pexit.lo
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_49 = $(am__objects_48)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_49 = $(am__objects_47)
-@UNW_LIBUNW_TRUE@am__objects_49 = $(am__objects_46)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_50 = $(am__objects_49)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_50 = $(am__objects_48)
+@UNW_LIBUNW_TRUE@am__objects_50 = $(am__objects_47)
am_libhpcrun_la_OBJECTS = $(am__objects_14) $(am__objects_15) \
$(am__objects_17) $(am__objects_19) $(am__objects_21) \
$(am__objects_23) $(am__objects_25) $(am__objects_27) \
- $(am__objects_30) $(am__objects_32) $(am__objects_34) \
- $(am__objects_35) $(am__objects_37) $(am__objects_39) \
- $(am__objects_41) $(am__objects_43) $(am__objects_49) \
+ $(am__objects_31) $(am__objects_33) $(am__objects_35) \
+ $(am__objects_36) $(am__objects_38) $(am__objects_40) \
+ $(am__objects_42) $(am__objects_44) $(am__objects_50) \
utilities/libhpcrun_la-last_func.lo
libhpcrun_la_OBJECTS = $(am_libhpcrun_la_OBJECTS)
libhpcrun_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
@@ -945,7 +959,7 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
cct_backtrace_finalize.c env.c epoch.c files.c \
handling_sample.c hpcrun-initializers.c hpcrun_options.c \
hpcrun_stats.c loadmap.c metrics.c name.c rank.c \
- sample_event.c sample_prob.c sample_sources_all.c \
+ sample_event.c sample_prob.c sample_sources_all.c tool_state.c \
sample-sources/blame-shift/blame-shift.c \
sample-sources/blame-shift/blame-map.c \
sample-sources/blame-shift/directed.c \
@@ -960,18 +974,18 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
sample_sources_registered.c sample-sources/sample-filters.c \
segv_handler.c start-stop.c term_handler.c thread_data.c \
thread_use.c thread_finalize.c control-knob.c control-knob.h \
- device-finalizers.c device-initializers.c module-ignore-map.c \
- threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \
- cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \
- lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \
- lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \
- lush/lush-pthread.c lush/lush-support-rt.h \
- lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \
- lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \
- memory/mmap.c messages/debug-flag.c messages/messages-sync.c \
- messages/messages-async.c messages/fmt.c gpu/gpu-activity.c \
- gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
- gpu/gpu-application-thread-api.c \
+ device-finalizers.c gpu-monitors.c device-initializers.c \
+ module-ignore-map.c threadmgr.c trace.c weak.c write_data.c \
+ cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \
+ cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \
+ lush/lush-backtrace.c lush/lush.h lush/lush.c \
+ lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \
+ lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \
+ lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \
+ memory/mem.c memory/mmap.c messages/debug-flag.c \
+ messages/messages-sync.c messages/messages-async.c \
+ messages/fmt.c gpu/gpu-activity.c gpu/gpu-activity-channel.c \
+ gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -985,12 +999,14 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
- ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
- ompt/ompt-defer-write.c ompt/ompt-interface.c \
- ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
- ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
- extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \
- syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \
+ gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \
+ sample-sources/openmp-target.c ompt/ompt-callstack.c \
+ ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
+ ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
+ ompt/ompt-region-debug.c ompt/ompt-device-map.c \
+ ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \
+ extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \
+ syscalls/select.c syscalls/sysv_signal.c \
utilities/executable-path.h utilities/executable-path.c \
utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
utilities/ip-normalized.h utilities/ip-normalized.c \
@@ -1025,12 +1041,13 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
gpu/nvidia/cupti-activity-translate.c \
gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \
gpu/nvidia/cupti-gpu-api.c sample-sources/papi.c \
- sample-sources/papi-c-cupti.c sample-sources/papi-c.c \
- sample-sources/papi-c-extended-info.c sample-sources/upc.c \
- unwind/common/backtrace.c unwind/common/unw-throw.c \
- unwind/common/binarytree_uwi.c unwind/common/interval_t.c \
- unwind/common/libunw_intervals.c unwind/common/stack_troll.c \
- unwind/common/uw_hash.c unwind/common/uw_recipe_map.c \
+ sample-sources/papi-c-cupti.c sample-sources/papi-c-rocm.c \
+ sample-sources/papi-c.c sample-sources/papi-c-extended-info.c \
+ sample-sources/upc.c unwind/common/backtrace.c \
+ unwind/common/unw-throw.c unwind/common/binarytree_uwi.c \
+ unwind/common/interval_t.c unwind/common/libunw_intervals.c \
+ unwind/common/stack_troll.c unwind/common/uw_hash.c \
+ unwind/common/uw_recipe_map.c \
unwind/generic-libunwind/libunw-unwind.c \
unwind/ppc64/ppc64-unwind.c \
unwind/ppc64/ppc64-unwind-interval.c \
@@ -1053,19 +1070,19 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \
unwind/x86-family/manual-intervals/x86-fail-intervals.c \
unwind/x86-family/manual-intervals/x86-pgi-mp_pexit.c \
utilities/last_func.c
-@HOST_CPU_PPC_TRUE@am__objects_50 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
-@HOST_CPU_PPC_FALSE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
+@HOST_CPU_PPC_TRUE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT)
+@HOST_CPU_PPC_FALSE@am__objects_52 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \
@OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-linux_perf.$(OBJEXT) \
@OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf_event_open.$(OBJEXT) \
@OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf-util.$(OBJEXT) \
@OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf_mmap.$(OBJEXT) \
@OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf_skid.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
-@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_55 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
-@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
-am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT)
+@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_55 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT)
+@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_57 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT)
+am__objects_58 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
libhpcrun_o-main.$(OBJEXT) libhpcrun_o-disabled.$(OBJEXT) \
libhpcrun_o-closure-registry.$(OBJEXT) \
libhpcrun_o-cct_insert_backtrace.$(OBJEXT) \
@@ -1081,6 +1098,7 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
libhpcrun_o-sample_event.$(OBJEXT) \
libhpcrun_o-sample_prob.$(OBJEXT) \
libhpcrun_o-sample_sources_all.$(OBJEXT) \
+ libhpcrun_o-tool_state.$(OBJEXT) \
sample-sources/blame-shift/libhpcrun_o-blame-shift.$(OBJEXT) \
sample-sources/blame-shift/libhpcrun_o-blame-map.$(OBJEXT) \
sample-sources/blame-shift/libhpcrun_o-directed.$(OBJEXT) \
@@ -1109,6 +1127,7 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
libhpcrun_o-thread_finalize.$(OBJEXT) \
libhpcrun_o-control-knob.$(OBJEXT) \
libhpcrun_o-device-finalizers.$(OBJEXT) \
+ libhpcrun_o-gpu-monitors.$(OBJEXT) \
libhpcrun_o-device-initializers.$(OBJEXT) \
libhpcrun_o-module-ignore-map.$(OBJEXT) \
libhpcrun_o-threadmgr.$(OBJEXT) libhpcrun_o-trace.$(OBJEXT) \
@@ -1160,6 +1179,9 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
gpu/libhpcrun_o-gpu-trace-item.$(OBJEXT) \
gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT) \
gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT) \
+ gpu/ompt/libhpcrun_o-ompt-gpu-api.$(OBJEXT) \
+ gpu/ompt/libhpcrun_o-ompt-activity-translate.$(OBJEXT) \
+ sample-sources/libhpcrun_o-openmp-target.$(OBJEXT) \
ompt/libhpcrun_o-ompt-callstack.$(OBJEXT) \
ompt/libhpcrun_o-ompt-defer.$(OBJEXT) \
ompt/libhpcrun_o-ompt-device.$(OBJEXT) \
@@ -1184,28 +1206,28 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \
utilities/libhpcrun_o-linuxtimer.$(OBJEXT) \
utilities/libhpcrun_o-timer.$(OBJEXT) \
utilities/libhpcrun_o-tokenize.$(OBJEXT) \
- utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_50) \
- $(am__objects_51) $(am__objects_52) $(am__objects_53) \
- $(am__objects_54) $(am__objects_55) $(am__objects_56)
-am__objects_58 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
+ utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_51) \
+ $(am__objects_52) $(am__objects_53) $(am__objects_54) \
+ $(am__objects_55) $(am__objects_56) $(am__objects_57)
+am__objects_59 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \
libhpcrun_o-custom-init-static.$(OBJEXT)
-am__objects_59 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-@HOST_CPU_MIPS_TRUE@am__objects_60 = $(am__objects_59)
-am__objects_61 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
+am__objects_60 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
+@HOST_CPU_MIPS_TRUE@am__objects_61 = $(am__objects_60)
+am__objects_62 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \
utilities/arch/ppc64/libhpcrun_o-ppc64-context-pc.$(OBJEXT)
-@HOST_CPU_PPC_TRUE@am__objects_62 = $(am__objects_61)
-am__objects_63 = \
+@HOST_CPU_PPC_TRUE@am__objects_63 = $(am__objects_62)
+am__objects_64 = \
trampoline/x86-family/libhpcrun_o-x86-tramp.$(OBJEXT) \
utilities/arch/x86-family/libhpcrun_o-x86-context-pc.$(OBJEXT)
-@HOST_CPU_X86_FAMILY_TRUE@am__objects_64 = $(am__objects_63)
-am__objects_65 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
+@HOST_CPU_X86_FAMILY_TRUE@am__objects_65 = $(am__objects_64)
+am__objects_66 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \
utilities/arch/ia64/libhpcrun_o-ia64-context-pc.$(OBJEXT)
-@HOST_CPU_IA64_TRUE@am__objects_66 = $(am__objects_65)
-am__objects_67 = \
+@HOST_CPU_IA64_TRUE@am__objects_67 = $(am__objects_66)
+am__objects_68 = \
trampoline/aarch64/libhpcrun_o-aarch64-tramp.$(OBJEXT) \
utilities/arch/libunwind/libhpcrun_o-libunwind-context-pc.$(OBJEXT)
-@HOST_CPU_AARCH64_TRUE@am__objects_68 = $(am__objects_67)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
+@HOST_CPU_AARCH64_TRUE@am__objects_69 = $(am__objects_68)
+@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cubin-hash-map.$(OBJEXT) \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cubin-id-map.$(OBJEXT) \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cubin-symbols.$(OBJEXT) \
@@ -1215,33 +1237,34 @@ am__objects_67 = \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cupti-analysis.$(OBJEXT) \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cupti-api.$(OBJEXT) \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cupti-gpu-api.$(OBJEXT)
-@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = $(am__objects_69)
-@OPT_PAPI_CUPTI_TRUE@am__objects_71 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
-@OPT_PAPI_COMPONENT_FALSE@am__objects_72 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_71)
-@OPT_PAPI_COMPONENT_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
+@OPT_ENABLE_CUPTI_TRUE@am__objects_71 = $(am__objects_70)
+@OPT_PAPI_CUPTI_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT)
+@OPT_PAPI_ROCM_TRUE@am__objects_73 = sample-sources/libhpcrun_o-papi-c-rocm.$(OBJEXT)
+@OPT_PAPI_COMPONENT_FALSE@am__objects_74 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_72) $(am__objects_73)
+@OPT_PAPI_COMPONENT_TRUE@am__objects_74 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \
@OPT_PAPI_COMPONENT_TRUE@ sample-sources/libhpcrun_o-papi-c-extended-info.$(OBJEXT) \
-@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_71)
-@OPT_PAPI_STATIC_TRUE@am__objects_73 = $(am__objects_72)
-am__objects_74 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
-@OPT_ENABLE_UPC_TRUE@am__objects_75 = $(am__objects_74)
-am__objects_76 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
+@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_72) $(am__objects_73)
+@OPT_PAPI_STATIC_TRUE@am__objects_75 = $(am__objects_74)
+am__objects_76 = sample-sources/libhpcrun_o-upc.$(OBJEXT)
+@OPT_ENABLE_UPC_TRUE@am__objects_77 = $(am__objects_76)
+am__objects_78 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \
unwind/common/libhpcrun_o-unw-throw.$(OBJEXT)
-am__objects_77 = $(am__objects_76) \
+am__objects_79 = $(am__objects_78) \
unwind/common/libhpcrun_o-binarytree_uwi.$(OBJEXT) \
unwind/common/libhpcrun_o-interval_t.$(OBJEXT) \
unwind/common/libhpcrun_o-libunw_intervals.$(OBJEXT) \
unwind/common/libhpcrun_o-stack_troll.$(OBJEXT) \
unwind/common/libhpcrun_o-uw_hash.$(OBJEXT) \
unwind/common/libhpcrun_o-uw_recipe_map.$(OBJEXT)
-am__objects_78 = $(am__objects_77) \
+am__objects_80 = $(am__objects_79) \
unwind/generic-libunwind/libhpcrun_o-libunw-unwind.$(OBJEXT) \
unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_79 = $(am__objects_77) \
+am__objects_81 = $(am__objects_79) \
unwind/ppc64/libhpcrun_o-ppc64-unwind.$(OBJEXT) \
unwind/ppc64/libhpcrun_o-ppc64-unwind-interval.$(OBJEXT) \
unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT)
-am__objects_80 = $(am__objects_77) \
+am__objects_82 = $(am__objects_79) \
unwind/x86-family/libhpcrun_o-x86-all.$(OBJEXT) \
unwind/x86-family/libhpcrun_o-amd-xop.$(OBJEXT) \
unwind/x86-family/libhpcrun_o-x86-cold-path.$(OBJEXT) \
@@ -1261,14 +1284,14 @@ am__objects_80 = $(am__objects_77) \
unwind/x86-family/manual-intervals/libhpcrun_o-x86-32bit-icc-variant.$(OBJEXT) \
unwind/x86-family/manual-intervals/libhpcrun_o-x86-fail-intervals.$(OBJEXT) \
unwind/x86-family/manual-intervals/libhpcrun_o-x86-pgi-mp_pexit.$(OBJEXT)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_81 = $(am__objects_80)
-@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_81 = $(am__objects_79)
-@UNW_LIBUNW_TRUE@am__objects_81 = $(am__objects_78)
-am_libhpcrun_o_OBJECTS = $(am__objects_57) $(am__objects_58) \
- $(am__objects_60) $(am__objects_62) $(am__objects_64) \
- $(am__objects_66) $(am__objects_68) $(am__objects_70) \
- $(am__objects_73) $(am__objects_75) $(am__objects_35) \
- $(am__objects_81) utilities/libhpcrun_o-last_func.$(OBJEXT)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_83 = $(am__objects_82)
+@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_83 = $(am__objects_81)
+@UNW_LIBUNW_TRUE@am__objects_83 = $(am__objects_80)
+am_libhpcrun_o_OBJECTS = $(am__objects_58) $(am__objects_59) \
+ $(am__objects_61) $(am__objects_63) $(am__objects_65) \
+ $(am__objects_67) $(am__objects_69) $(am__objects_71) \
+ $(am__objects_75) $(am__objects_77) $(am__objects_36) \
+ $(am__objects_83) utilities/libhpcrun_o-last_func.$(OBJEXT)
libhpcrun_o_OBJECTS = $(am_libhpcrun_o_OBJECTS)
@HOST_CPU_X86_FAMILY_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1)
@OPT_PAPI_STATIC_TRUE@am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1)
@@ -1580,6 +1603,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -1617,6 +1641,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
@@ -1777,14 +1802,15 @@ bin_SCRIPTS = $(am__append_4) $(am__append_6)
pkglibexec_SCRIPTS = $(am__append_1)
include_HEADERS = $(am__append_2)
pkglib_LIBRARIES = $(am__append_5)
-pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_17) \
- $(am__append_18) $(am__append_140) $(am__append_141)
-BUILT_SOURCES = $(am__append_22)
-CLEANFILES = $(am__append_23)
+pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_18) \
+ $(am__append_19) $(am__append_143) $(am__append_144)
+BUILT_SOURCES = $(am__append_23)
+CLEANFILES = $(am__append_24)
@OPT_ENABLE_HPCRUN_DYNAMIC_TRUE@noinst_LTLIBRARIES = libhpcrun.la
PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@
PAPI_LD_FLGS = @OPT_PAPI_LDFLAGS@
CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@
+ROCM_INC_FLGS = @OPT_ROCM_IFLAGS@
OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@
CUPTI_LD_FLGS = @OPT_CUPTI_LDFLAGS@
CUPTI_BASE = @OPT_CUPTI@
@@ -1867,16 +1893,16 @@ UNW_MIPS_INCLUDE_DIRS = \
UNW_MIPS_LD_FLAGS =
MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \
- -D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_19) \
- $(am__append_102) $(am__append_106) $(am__append_108) \
- $(am__append_112) $(am__append_127) $(am__append_131) \
- $(am__append_135) $(am__append_139)
+ -D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_20) \
+ $(am__append_103) $(am__append_107) $(am__append_109) \
+ $(am__append_111) $(am__append_115) $(am__append_130) \
+ $(am__append_134) $(am__append_138) $(am__append_142)
MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
closure-registry.c cct_insert_backtrace.c \
cct_backtrace_finalize.c env.c epoch.c files.c \
handling_sample.c hpcrun-initializers.c hpcrun_options.c \
hpcrun_stats.c loadmap.c metrics.c name.c rank.c \
- sample_event.c sample_prob.c sample_sources_all.c \
+ sample_event.c sample_prob.c sample_sources_all.c tool_state.c \
sample-sources/blame-shift/blame-shift.c \
sample-sources/blame-shift/blame-map.c \
sample-sources/blame-shift/directed.c \
@@ -1891,18 +1917,18 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
sample_sources_registered.c sample-sources/sample-filters.c \
segv_handler.c start-stop.c term_handler.c thread_data.c \
thread_use.c thread_finalize.c control-knob.c control-knob.h \
- device-finalizers.c device-initializers.c module-ignore-map.c \
- threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \
- cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \
- lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \
- lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \
- lush/lush-pthread.c lush/lush-support-rt.h \
- lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \
- lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \
- memory/mmap.c messages/debug-flag.c messages/messages-sync.c \
- messages/messages-async.c messages/fmt.c gpu/gpu-activity.c \
- gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \
- gpu/gpu-application-thread-api.c \
+ device-finalizers.c gpu-monitors.c device-initializers.c \
+ module-ignore-map.c threadmgr.c trace.c weak.c write_data.c \
+ cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \
+ cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \
+ lush/lush-backtrace.c lush/lush.h lush/lush.c \
+ lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \
+ lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \
+ lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \
+ memory/mem.c memory/mmap.c messages/debug-flag.c \
+ messages/messages-sync.c messages/messages-async.c \
+ messages/fmt.c gpu/gpu-activity.c gpu/gpu-activity-channel.c \
+ gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \
gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \
gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \
gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \
@@ -1916,12 +1942,14 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \
gpu/gpu-stream-id-map.c gpu/gpu-trace.c \
gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \
gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \
- ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \
- ompt/ompt-defer-write.c ompt/ompt-interface.c \
- ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \
- ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \
- extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \
- syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \
+ gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \
+ sample-sources/openmp-target.c ompt/ompt-callstack.c \
+ ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \
+ ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \
+ ompt/ompt-region-debug.c ompt/ompt-device-map.c \
+ ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \
+ extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \
+ syscalls/select.c syscalls/sysv_signal.c \
utilities/executable-path.h utilities/executable-path.c \
utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \
utilities/ip-normalized.h utilities/ip-normalized.c \
@@ -1965,10 +1993,10 @@ MY_AARCH64_FILES = \
utilities/arch/libunwind/libunwind-context-pc.c
@OPT_PAPI_COMPONENT_FALSE@MY_PAPI_FILES = sample-sources/papi.c \
-@OPT_PAPI_COMPONENT_FALSE@ $(am__append_16)
+@OPT_PAPI_COMPONENT_FALSE@ $(am__append_16) $(am__append_17)
@OPT_PAPI_COMPONENT_TRUE@MY_PAPI_FILES = sample-sources/papi-c.c \
@OPT_PAPI_COMPONENT_TRUE@ sample-sources/papi-c-extended-info.c \
-@OPT_PAPI_COMPONENT_TRUE@ $(am__append_16)
+@OPT_PAPI_COMPONENT_TRUE@ $(am__append_16) $(am__append_17)
@OPT_ENABLE_CUPTI_TRUE@MY_CUPTI_FILES = sample-sources/nvidia.c \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/cubin-hash-map.c \
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/cubin-id-map.c \
@@ -1999,9 +2027,11 @@ MY_AARCH64_FILES = \
@OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \
@OPT_ENABLE_ROCM_TRUE@ sample-sources/amd.c \
+@OPT_ENABLE_ROCM_TRUE@ sample-sources/amd-rocprofiler.c \
+@OPT_ENABLE_ROCM_TRUE@ gpu/amd/hip-api.c \
@OPT_ENABLE_ROCM_TRUE@ gpu/amd/roctracer-activity-translate.c \
@OPT_ENABLE_ROCM_TRUE@ gpu/amd/roctracer-api.c \
-@OPT_ENABLE_ROCM_TRUE@ gpu/amd/rocm-debug-api.c \
+@OPT_ENABLE_ROCM_TRUE@ gpu/amd/rocprofiler-api.c \
@OPT_ENABLE_ROCM_TRUE@ gpu/amd/rocm-binary-processing.c
@OPT_ENABLE_LEVEL0_TRUE@MY_LEVEL0_FILES = \
@@ -2032,6 +2062,7 @@ MY_INCLUDE_DIRS = \
-I$(HPCFNBOUNDS_INC) \
$(OPT_CUDA_IFLAGS) \
$(OPT_CUPTI_IFLAGS) \
+ $(ROCM_INC_FLGS) \
-I$(LIBELF_INC) \
-I$(LIBMONITOR_INC) \
$(GOTCHA_IFLAGS) \
@@ -2053,11 +2084,11 @@ MY_AARCH64_INCLUDE_DIRS = \
-I$(srcdir)/utilities/arch/aarch64
libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \
- $(am__append_24) $(am__append_25) $(am__append_38) \
- $(am__append_53) $(am__append_71) $(am__append_84) \
- $(am__append_99) $(am__append_103) $(am__append_113) \
- $(am__append_120) $(am__append_124) $(am__append_128) \
- $(am__append_132) $(am__append_136) $(UNW_SOURCE_FILES) \
+ $(am__append_25) $(am__append_26) $(am__append_39) \
+ $(am__append_54) $(am__append_72) $(am__append_85) \
+ $(am__append_100) $(am__append_104) $(am__append_116) \
+ $(am__append_123) $(am__append_127) $(am__append_131) \
+ $(am__append_135) $(am__append_139) $(UNW_SOURCE_FILES) \
utilities/last_func.c
libhpcrun_fake_audit_la_SOURCES = \
audit/fake-auditor.c
@@ -2066,9 +2097,9 @@ libhpcrun_audit_la_SOURCES = \
audit/auditor.c
libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \
- $(am__append_26) $(am__append_39) $(am__append_54) \
- $(am__append_72) $(am__append_85) $(am__append_104) \
- $(am__append_109) $(am__append_114) $(am__append_123) \
+ $(am__append_27) $(am__append_40) $(am__append_55) \
+ $(am__append_73) $(am__append_86) $(am__append_105) \
+ $(am__append_112) $(am__append_117) $(am__append_126) \
$(UNW_SOURCE_FILES) utilities/last_func.c
libhpcrun_wrap_a_SOURCES = \
monitor-exts/openmp.c
@@ -2113,12 +2144,12 @@ libhpctoolkit_a_SOURCES = \
# cppflags
#-----------------------------------------------------------
libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
- $(am__append_20) $(am__append_27) $(am__append_40) \
- $(am__append_55) $(am__append_73) $(am__append_86) \
- $(am__append_100) $(am__append_105) $(am__append_107) \
- $(am__append_115) $(am__append_118) $(am__append_121) \
- $(am__append_125) $(am__append_129) $(am__append_133) \
- $(am__append_137) $(UNW_INCLUDE_DIRS)
+ $(am__append_21) $(am__append_28) $(am__append_41) \
+ $(am__append_56) $(am__append_74) $(am__append_87) \
+ $(am__append_101) $(am__append_106) $(am__append_108) \
+ $(am__append_110) $(am__append_118) $(am__append_121) \
+ $(am__append_124) $(am__append_128) $(am__append_132) \
+ $(am__append_136) $(am__append_140) $(UNW_INCLUDE_DIRS)
libhpcrun_fake_audit_la_CPPFLAGS = \
$(MY_CPP_DEFINES) \
$(MY_INCLUDE_DIRS)
@@ -2128,51 +2159,51 @@ libhpcrun_audit_la_CPPFLAGS = \
$(MY_INCLUDE_DIRS)
libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
- $(MY_INCLUDE_DIRS) $(am__append_21) $(am__append_28) \
- $(am__append_41) $(am__append_56) $(am__append_74) \
- $(am__append_87) $(am__append_110) $(am__append_116) \
- $(am__append_119) $(UNW_INCLUDE_DIRS)
+ $(MY_INCLUDE_DIRS) $(am__append_22) $(am__append_29) \
+ $(am__append_42) $(am__append_57) $(am__append_75) \
+ $(am__append_88) $(am__append_113) $(am__append_119) \
+ $(am__append_122) $(UNW_INCLUDE_DIRS)
libhpcrun_wrap_a_CPPFLAGS = \
-DHPCRUN_STATIC_LINK \
$(MY_CPP_DEFINES) \
$(MY_INCLUDE_DIRS)
libhpcrun_ga_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
- $(am__append_29) $(am__append_42) $(am__append_59) \
- $(am__append_75) $(am__append_88) $(UNW_INCLUDE_DIRS)
+ $(am__append_30) $(am__append_43) $(am__append_60) \
+ $(am__append_76) $(am__append_89) $(UNW_INCLUDE_DIRS)
libhpcrun_ga_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
- $(MY_INCLUDE_DIRS) $(am__append_30) $(am__append_43) \
- $(am__append_60) $(am__append_76) $(am__append_89) \
+ $(MY_INCLUDE_DIRS) $(am__append_31) $(am__append_44) \
+ $(am__append_61) $(am__append_77) $(am__append_90) \
$(UNW_INCLUDE_DIRS)
libhpcrun_gprof_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
- $(am__append_44) $(am__append_61) $(am__append_90)
+ $(am__append_45) $(am__append_62) $(am__append_91)
libhpcrun_gprof_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
- $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_45) \
- $(am__append_62) $(am__append_91)
+ $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_46) \
+ $(am__append_63) $(am__append_92)
libhpcrun_io_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
- $(am__append_31) $(am__append_46) $(am__append_63) \
- $(am__append_77) $(am__append_92) $(UNW_INCLUDE_DIRS)
+ $(am__append_32) $(am__append_47) $(am__append_64) \
+ $(am__append_78) $(am__append_93) $(UNW_INCLUDE_DIRS)
libhpcrun_io_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \
- $(MY_INCLUDE_DIRS) $(am__append_32) $(am__append_47) \
- $(am__append_64) $(am__append_78) $(am__append_93) \
+ $(MY_INCLUDE_DIRS) $(am__append_33) $(am__append_48) \
+ $(am__append_65) $(am__append_79) $(am__append_94) \
$(UNW_INCLUDE_DIRS)
libhpcrun_memleak_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
- $(am__append_33) $(am__append_48) $(am__append_65) \
- $(am__append_79) $(am__append_94) $(UNW_INCLUDE_DIRS)
+ $(am__append_34) $(am__append_49) $(am__append_66) \
+ $(am__append_80) $(am__append_95) $(UNW_INCLUDE_DIRS)
libhpcrun_memleak_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
- $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_34) \
- $(am__append_49) $(am__append_66) $(am__append_80) \
- $(am__append_95) $(UNW_INCLUDE_DIRS)
+ $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_35) \
+ $(am__append_50) $(am__append_67) $(am__append_81) \
+ $(am__append_96) $(UNW_INCLUDE_DIRS)
libhpcrun_pthread_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \
- $(am__append_35) $(am__append_50) $(am__append_67) \
- $(am__append_81) $(am__append_96) $(UNW_INCLUDE_DIRS)
+ $(am__append_36) $(am__append_51) $(am__append_68) \
+ $(am__append_82) $(am__append_97) $(UNW_INCLUDE_DIRS)
libhpcrun_pthread_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \
- $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_36) \
- $(am__append_51) $(am__append_68) $(am__append_82) \
- $(am__append_97) $(UNW_INCLUDE_DIRS)
+ $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_37) \
+ $(am__append_52) $(am__append_69) $(am__append_83) \
+ $(am__append_98) $(UNW_INCLUDE_DIRS)
libhpcrun_mpi_la_CPPFLAGS = $(MY_CPP_DEFINES) -I$(MPI_INC) \
- $(MY_INCLUDE_DIRS) $(am__append_37) $(am__append_52) \
- $(am__append_69) $(am__append_83) $(am__append_98) \
+ $(MY_INCLUDE_DIRS) $(am__append_38) $(am__append_53) \
+ $(am__append_70) $(am__append_84) $(am__append_99) \
$(UNW_INCLUDE_DIRS)
libhpctoolkit_la_CPPFLAGS = \
$(MY_CPP_DEFINES) \
@@ -2188,8 +2219,8 @@ libhpctoolkit_a_CPPFLAGS = \
# cflags
#-----------------------------------------------------------
libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \
- $(am__append_122) $(am__append_126) $(am__append_130) \
- $(am__append_134) $(am__append_138) $(GOTCHA_IFLAGS)
+ $(am__append_125) $(am__append_129) $(am__append_133) \
+ $(am__append_137) $(am__append_141) $(GOTCHA_IFLAGS)
libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS)
libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS)
@@ -2221,8 +2252,8 @@ OUR_LIBUNWIND_A = $(top_builddir)/src/extern/libunwind/libunwind.a
OUR_LZMA_A = $(top_builddir)/src/extern/lzma/liblzma.a
libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \
-lpthread -lrt -L$(LIBELF_LIB) -lelf $(PERFMON_LDFLAGS_DYN) \
- $(OPT_ROCM_LDFLAGS) $(am__append_57) $(am__append_101) \
- $(am__append_117) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
+ $(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \
+ $(am__append_120) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS)
libhpcrun_fake_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
libhpcrun_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl
libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic
@@ -2248,9 +2279,9 @@ libhpcrun_la_LIBADD = \
libhpcrun_o_LDADD = $(PROF_LEAN_A) $(SUPPORT_LEAN_A) \
$(PERFMON_LDFLAGS_STAT) $(MBEDTLS_LIBS) $(OUR_LIBUNWIND_A) \
- $(OUR_LZMA_A) $(am__append_58) $(am__append_111) \
+ $(OUR_LZMA_A) $(am__append_59) $(am__append_114) \
$(UNW_STATIC_LD_FLAGS)
-MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_70) \
+MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \
$(UNW_INCLUDE_DIRS)
@HOST_CPU_AARCH64_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
@HOST_CPU_PPC_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS)
@@ -2334,7 +2365,7 @@ MY_AGENT_TBB_CFLAGS = \
# and hidden into libhpcrun.o. Other dependencies go into hpclink.
# Don't use LDFLAGS for static case.
MONITOR_NAMES = -G 'monitor_*'
-HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*'
+HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp' -G 'OnLoad' -G 'OnUnloadTool'
MISC_NAMES = -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool
OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*'
DYN_SYSCALL_LIST = poll ppoll pselect select __sysv_signal
@@ -2744,6 +2775,19 @@ gpu/libhpcrun_la-gpu-trace-channel-set.lo: gpu/$(am__dirstamp) \
gpu/$(DEPDIR)/$(am__dirstamp)
gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/$(am__dirstamp) \
gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/$(am__dirstamp):
+ @$(MKDIR_P) gpu/ompt
+ @: > gpu/ompt/$(am__dirstamp)
+gpu/ompt/$(DEPDIR)/$(am__dirstamp):
+ @$(MKDIR_P) gpu/ompt/$(DEPDIR)
+ @: > gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_la-ompt-gpu-api.lo: gpu/ompt/$(am__dirstamp) \
+ gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_la-ompt-activity-translate.lo: \
+ gpu/ompt/$(am__dirstamp) gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-openmp-target.lo: \
+ sample-sources/$(am__dirstamp) \
+ sample-sources/$(DEPDIR)/$(am__dirstamp)
ompt/$(am__dirstamp):
@$(MKDIR_P) ompt
@: > ompt/$(am__dirstamp)
@@ -2960,6 +3004,9 @@ sample-sources/libhpcrun_la-papi.lo: sample-sources/$(am__dirstamp) \
sample-sources/libhpcrun_la-papi-c-cupti.lo: \
sample-sources/$(am__dirstamp) \
sample-sources/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-papi-c-rocm.lo: \
+ sample-sources/$(am__dirstamp) \
+ sample-sources/$(DEPDIR)/$(am__dirstamp)
sample-sources/libhpcrun_la-papi-c.lo: sample-sources/$(am__dirstamp) \
sample-sources/$(DEPDIR)/$(am__dirstamp)
sample-sources/libhpcrun_la-papi-c-extended-info.lo: \
@@ -2997,17 +3044,22 @@ sample-sources/libhpcrun_la-upc.lo: sample-sources/$(am__dirstamp) \
sample-sources/$(DEPDIR)/$(am__dirstamp)
sample-sources/libhpcrun_la-amd.lo: sample-sources/$(am__dirstamp) \
sample-sources/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_la-amd-rocprofiler.lo: \
+ sample-sources/$(am__dirstamp) \
+ sample-sources/$(DEPDIR)/$(am__dirstamp)
gpu/amd/$(am__dirstamp):
@$(MKDIR_P) gpu/amd
@: > gpu/amd/$(am__dirstamp)
gpu/amd/$(DEPDIR)/$(am__dirstamp):
@$(MKDIR_P) gpu/amd/$(DEPDIR)
@: > gpu/amd/$(DEPDIR)/$(am__dirstamp)
+gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/$(am__dirstamp) \
+ gpu/amd/$(DEPDIR)/$(am__dirstamp)
gpu/amd/libhpcrun_la-roctracer-activity-translate.lo: \
gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/$(am__dirstamp) \
gpu/amd/$(DEPDIR)/$(am__dirstamp)
-gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/$(am__dirstamp) \
+gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/$(am__dirstamp) \
gpu/amd/$(DEPDIR)/$(am__dirstamp)
gpu/amd/libhpcrun_la-rocm-binary-processing.lo: \
gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp)
@@ -3435,6 +3487,13 @@ gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT): gpu/$(am__dirstamp) \
gpu/$(DEPDIR)/$(am__dirstamp)
gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT): \
gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_o-ompt-gpu-api.$(OBJEXT): gpu/ompt/$(am__dirstamp) \
+ gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+gpu/ompt/libhpcrun_o-ompt-activity-translate.$(OBJEXT): \
+ gpu/ompt/$(am__dirstamp) gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_o-openmp-target.$(OBJEXT): \
+ sample-sources/$(am__dirstamp) \
+ sample-sources/$(DEPDIR)/$(am__dirstamp)
ompt/libhpcrun_o-ompt-callstack.$(OBJEXT): ompt/$(am__dirstamp) \
ompt/$(DEPDIR)/$(am__dirstamp)
ompt/libhpcrun_o-ompt-defer.$(OBJEXT): ompt/$(am__dirstamp) \
@@ -3586,6 +3645,9 @@ sample-sources/libhpcrun_o-papi.$(OBJEXT): \
sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT): \
sample-sources/$(am__dirstamp) \
sample-sources/$(DEPDIR)/$(am__dirstamp)
+sample-sources/libhpcrun_o-papi-c-rocm.$(OBJEXT): \
+ sample-sources/$(am__dirstamp) \
+ sample-sources/$(DEPDIR)/$(am__dirstamp)
sample-sources/libhpcrun_o-papi-c.$(OBJEXT): \
sample-sources/$(am__dirstamp) \
sample-sources/$(DEPDIR)/$(am__dirstamp)
@@ -3786,6 +3848,8 @@ mostlyclean-compile:
-rm -f gpu/level0/*.lo
-rm -f gpu/nvidia/*.$(OBJEXT)
-rm -f gpu/nvidia/*.lo
+ -rm -f gpu/ompt/*.$(OBJEXT)
+ -rm -f gpu/ompt/*.lo
-rm -f gpu/opencl/*.$(OBJEXT)
-rm -f gpu/opencl/*.lo
-rm -f lush-agents/*.$(OBJEXT)
@@ -3858,6 +3922,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-env.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-epoch.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-files.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-gpu-monitors.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-handling_sample.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-hpcrun-initializers.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-hpcrun_options.Plo@am__quote@
@@ -3879,6 +3944,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-thread_finalize.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-thread_use.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-threadmgr.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-tool_state.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-trace.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-weak.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-write_data.Plo@am__quote@
@@ -3895,6 +3961,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-env.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-epoch.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-files.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-gpu-monitors.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-handling_sample.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-hpcrun-initializers.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-hpcrun_options.Po@am__quote@
@@ -3916,6 +3983,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-thread_finalize.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-thread_use.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-threadmgr.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-tool_state.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-trace.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-weak.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-write_data.Po@am__quote@
@@ -4001,8 +4069,9 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo@am__quote@
@@ -4034,6 +4103,10 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-analysis.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-api.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo@am__quote@
@@ -4098,6 +4171,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_ga_wrap_a-ga-overrides.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_io_la-io-over.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_io_wrap_a-io-over.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-amd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-common.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-display.Plo@am__quote@
@@ -4113,8 +4187,10 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-omp-idle.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-omp-mutex.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-cupti.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-extended-info.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-pthread-blame.Plo@am__quote@
@@ -4136,8 +4212,10 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-nvidia.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-idle.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-mutex.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-cupti.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-extended-info.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-pthread-blame.Po@am__quote@
@@ -4631,6 +4709,13 @@ libhpcrun_la-sample_sources_all.lo: sample_sources_all.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-sample_sources_all.lo `test -f 'sample_sources_all.c' || echo '$(srcdir)/'`sample_sources_all.c
+libhpcrun_la-tool_state.lo: tool_state.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-tool_state.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-tool_state.Tpo -c -o libhpcrun_la-tool_state.lo `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-tool_state.Tpo $(DEPDIR)/libhpcrun_la-tool_state.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='tool_state.c' object='libhpcrun_la-tool_state.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-tool_state.lo `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+
sample-sources/blame-shift/libhpcrun_la-blame-shift.lo: sample-sources/blame-shift/blame-shift.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/blame-shift/libhpcrun_la-blame-shift.lo -MD -MP -MF sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Tpo -c -o sample-sources/blame-shift/libhpcrun_la-blame-shift.lo `test -f 'sample-sources/blame-shift/blame-shift.c' || echo '$(srcdir)/'`sample-sources/blame-shift/blame-shift.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Tpo sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Plo
@@ -4827,6 +4912,13 @@ libhpcrun_la-device-finalizers.lo: device-finalizers.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-device-finalizers.lo `test -f 'device-finalizers.c' || echo '$(srcdir)/'`device-finalizers.c
+libhpcrun_la-gpu-monitors.lo: gpu-monitors.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-gpu-monitors.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-gpu-monitors.Tpo -c -o libhpcrun_la-gpu-monitors.lo `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_la-gpu-monitors.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_la-gpu-monitors.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-gpu-monitors.lo `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+
libhpcrun_la-device-initializers.lo: device-initializers.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-device-initializers.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-device-initializers.Tpo -c -o libhpcrun_la-device-initializers.lo `test -f 'device-initializers.c' || echo '$(srcdir)/'`device-initializers.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-device-initializers.Tpo $(DEPDIR)/libhpcrun_la-device-initializers.Plo
@@ -5198,6 +5290,27 @@ gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/gpu-trace-demultiplexer.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-trace-demultiplexer.lo `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c
+gpu/ompt/libhpcrun_la-ompt-gpu-api.lo: gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_la-ompt-gpu-api.lo -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_la-ompt-gpu-api.lo `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_la-ompt-gpu-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_la-ompt-gpu-api.lo `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+
+gpu/ompt/libhpcrun_la-ompt-activity-translate.lo: gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_la-ompt-activity-translate.lo -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_la-ompt-activity-translate.lo `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_la-ompt-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_la-ompt-activity-translate.lo `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+
+sample-sources/libhpcrun_la-openmp-target.lo: sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-openmp-target.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Tpo -c -o sample-sources/libhpcrun_la-openmp-target.lo `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_la-openmp-target.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-openmp-target.lo `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+
ompt/libhpcrun_la-ompt-callstack.lo: ompt/ompt-callstack.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_la-ompt-callstack.lo -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo -c -o ompt/libhpcrun_la-ompt-callstack.lo `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Plo
@@ -5548,6 +5661,13 @@ sample-sources/libhpcrun_la-papi-c-cupti.lo: sample-sources/papi-c-cupti.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-papi-c-cupti.lo `test -f 'sample-sources/papi-c-cupti.c' || echo '$(srcdir)/'`sample-sources/papi-c-cupti.c
+sample-sources/libhpcrun_la-papi-c-rocm.lo: sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-papi-c-rocm.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_la-papi-c-rocm.lo `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_la-papi-c-rocm.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-papi-c-rocm.lo `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+
sample-sources/libhpcrun_la-papi-c.lo: sample-sources/papi-c.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-papi-c.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Tpo -c -o sample-sources/libhpcrun_la-papi-c.lo `test -f 'sample-sources/papi-c.c' || echo '$(srcdir)/'`sample-sources/papi-c.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo
@@ -5646,6 +5766,20 @@ sample-sources/libhpcrun_la-amd.lo: sample-sources/amd.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd.lo `test -f 'sample-sources/amd.c' || echo '$(srcdir)/'`sample-sources/amd.c
+sample-sources/libhpcrun_la-amd-rocprofiler.lo: sample-sources/amd-rocprofiler.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-amd-rocprofiler.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Tpo -c -o sample-sources/libhpcrun_la-amd-rocprofiler.lo `test -f 'sample-sources/amd-rocprofiler.c' || echo '$(srcdir)/'`sample-sources/amd-rocprofiler.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/amd-rocprofiler.c' object='sample-sources/libhpcrun_la-amd-rocprofiler.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd-rocprofiler.lo `test -f 'sample-sources/amd-rocprofiler.c' || echo '$(srcdir)/'`sample-sources/amd-rocprofiler.c
+
+gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/hip-api.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-hip-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/amd/hip-api.c' object='gpu/amd/libhpcrun_la-hip-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c
+
gpu/amd/libhpcrun_la-roctracer-activity-translate.lo: gpu/amd/roctracer-activity-translate.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-roctracer-activity-translate.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo -c -o gpu/amd/libhpcrun_la-roctracer-activity-translate.lo `test -f 'gpu/amd/roctracer-activity-translate.c' || echo '$(srcdir)/'`gpu/amd/roctracer-activity-translate.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo
@@ -5660,12 +5794,12 @@ gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/roctracer-api.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-roctracer-api.lo `test -f 'gpu/amd/roctracer-api.c' || echo '$(srcdir)/'`gpu/amd/roctracer-api.c
-gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/rocm-debug-api.c
-@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-debug-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c
-@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/amd/rocm-debug-api.c' object='gpu/amd/libhpcrun_la-rocm-debug-api.lo' libtool=yes @AMDEPBACKSLASH@
+gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/rocprofiler-api.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocprofiler-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Tpo -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/amd/rocprofiler-api.c' object='gpu/amd/libhpcrun_la-rocprofiler-api.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c
gpu/amd/libhpcrun_la-rocm-binary-processing.lo: gpu/amd/rocm-binary-processing.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-binary-processing.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Tpo -c -o gpu/amd/libhpcrun_la-rocm-binary-processing.lo `test -f 'gpu/amd/rocm-binary-processing.c' || echo '$(srcdir)/'`gpu/amd/rocm-binary-processing.c
@@ -6367,6 +6501,20 @@ libhpcrun_o-sample_sources_all.obj: sample_sources_all.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-sample_sources_all.obj `if test -f 'sample_sources_all.c'; then $(CYGPATH_W) 'sample_sources_all.c'; else $(CYGPATH_W) '$(srcdir)/sample_sources_all.c'; fi`
+libhpcrun_o-tool_state.o: tool_state.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-tool_state.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-tool_state.Tpo -c -o libhpcrun_o-tool_state.o `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-tool_state.Tpo $(DEPDIR)/libhpcrun_o-tool_state.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='tool_state.c' object='libhpcrun_o-tool_state.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-tool_state.o `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c
+
+libhpcrun_o-tool_state.obj: tool_state.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-tool_state.obj -MD -MP -MF $(DEPDIR)/libhpcrun_o-tool_state.Tpo -c -o libhpcrun_o-tool_state.obj `if test -f 'tool_state.c'; then $(CYGPATH_W) 'tool_state.c'; else $(CYGPATH_W) '$(srcdir)/tool_state.c'; fi`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-tool_state.Tpo $(DEPDIR)/libhpcrun_o-tool_state.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='tool_state.c' object='libhpcrun_o-tool_state.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-tool_state.obj `if test -f 'tool_state.c'; then $(CYGPATH_W) 'tool_state.c'; else $(CYGPATH_W) '$(srcdir)/tool_state.c'; fi`
+
sample-sources/blame-shift/libhpcrun_o-blame-shift.o: sample-sources/blame-shift/blame-shift.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/blame-shift/libhpcrun_o-blame-shift.o -MD -MP -MF sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Tpo -c -o sample-sources/blame-shift/libhpcrun_o-blame-shift.o `test -f 'sample-sources/blame-shift/blame-shift.c' || echo '$(srcdir)/'`sample-sources/blame-shift/blame-shift.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Tpo sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Po
@@ -6759,6 +6907,20 @@ libhpcrun_o-device-finalizers.obj: device-finalizers.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-device-finalizers.obj `if test -f 'device-finalizers.c'; then $(CYGPATH_W) 'device-finalizers.c'; else $(CYGPATH_W) '$(srcdir)/device-finalizers.c'; fi`
+libhpcrun_o-gpu-monitors.o: gpu-monitors.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-gpu-monitors.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo -c -o libhpcrun_o-gpu-monitors.o `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_o-gpu-monitors.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_o-gpu-monitors.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-gpu-monitors.o `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c
+
+libhpcrun_o-gpu-monitors.obj: gpu-monitors.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-gpu-monitors.obj -MD -MP -MF $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo -c -o libhpcrun_o-gpu-monitors.obj `if test -f 'gpu-monitors.c'; then $(CYGPATH_W) 'gpu-monitors.c'; else $(CYGPATH_W) '$(srcdir)/gpu-monitors.c'; fi`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_o-gpu-monitors.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_o-gpu-monitors.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-gpu-monitors.obj `if test -f 'gpu-monitors.c'; then $(CYGPATH_W) 'gpu-monitors.c'; else $(CYGPATH_W) '$(srcdir)/gpu-monitors.c'; fi`
+
libhpcrun_o-device-initializers.o: device-initializers.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-device-initializers.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-device-initializers.Tpo -c -o libhpcrun_o-device-initializers.o `test -f 'device-initializers.c' || echo '$(srcdir)/'`device-initializers.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-device-initializers.Tpo $(DEPDIR)/libhpcrun_o-device-initializers.Po
@@ -7501,6 +7663,48 @@ gpu/libhpcrun_o-gpu-trace-demultiplexer.obj: gpu/gpu-trace-demultiplexer.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.obj `if test -f 'gpu/gpu-trace-demultiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-trace-demultiplexer.c'; else $(CYGPATH_W) '$(srcdir)/gpu/gpu-trace-demultiplexer.c'; fi`
+gpu/ompt/libhpcrun_o-ompt-gpu-api.o: gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-gpu-api.o -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.o `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_o-ompt-gpu-api.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.o `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c
+
+gpu/ompt/libhpcrun_o-ompt-gpu-api.obj: gpu/ompt/ompt-gpu-api.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-gpu-api.obj -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.obj `if test -f 'gpu/ompt/ompt-gpu-api.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-gpu-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-gpu-api.c'; fi`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_o-ompt-gpu-api.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.obj `if test -f 'gpu/ompt/ompt-gpu-api.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-gpu-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-gpu-api.c'; fi`
+
+gpu/ompt/libhpcrun_o-ompt-activity-translate.o: gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-activity-translate.o -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.o `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_o-ompt-activity-translate.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.o `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c
+
+gpu/ompt/libhpcrun_o-ompt-activity-translate.obj: gpu/ompt/ompt-activity-translate.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-activity-translate.obj -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.obj `if test -f 'gpu/ompt/ompt-activity-translate.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-activity-translate.c'; fi`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_o-ompt-activity-translate.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.obj `if test -f 'gpu/ompt/ompt-activity-translate.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-activity-translate.c'; fi`
+
+sample-sources/libhpcrun_o-openmp-target.o: sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-openmp-target.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo -c -o sample-sources/libhpcrun_o-openmp-target.o `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_o-openmp-target.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-openmp-target.o `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c
+
+sample-sources/libhpcrun_o-openmp-target.obj: sample-sources/openmp-target.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-openmp-target.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo -c -o sample-sources/libhpcrun_o-openmp-target.obj `if test -f 'sample-sources/openmp-target.c'; then $(CYGPATH_W) 'sample-sources/openmp-target.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/openmp-target.c'; fi`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_o-openmp-target.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-openmp-target.obj `if test -f 'sample-sources/openmp-target.c'; then $(CYGPATH_W) 'sample-sources/openmp-target.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/openmp-target.c'; fi`
+
ompt/libhpcrun_o-ompt-callstack.o: ompt/ompt-callstack.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_o-ompt-callstack.o -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo -c -o ompt/libhpcrun_o-ompt-callstack.o `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Po
@@ -8299,6 +8503,20 @@ sample-sources/libhpcrun_o-papi-c-cupti.obj: sample-sources/papi-c-cupti.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-cupti.obj `if test -f 'sample-sources/papi-c-cupti.c'; then $(CYGPATH_W) 'sample-sources/papi-c-cupti.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-cupti.c'; fi`
+sample-sources/libhpcrun_o-papi-c-rocm.o: sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c-rocm.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_o-papi-c-rocm.o `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_o-papi-c-rocm.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-rocm.o `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c
+
+sample-sources/libhpcrun_o-papi-c-rocm.obj: sample-sources/papi-c-rocm.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c-rocm.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_o-papi-c-rocm.obj `if test -f 'sample-sources/papi-c-rocm.c'; then $(CYGPATH_W) 'sample-sources/papi-c-rocm.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-rocm.c'; fi`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_o-papi-c-rocm.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-rocm.obj `if test -f 'sample-sources/papi-c-rocm.c'; then $(CYGPATH_W) 'sample-sources/papi-c-rocm.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-rocm.c'; fi`
+
sample-sources/libhpcrun_o-papi-c.o: sample-sources/papi-c.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Tpo -c -o sample-sources/libhpcrun_o-papi-c.o `test -f 'sample-sources/papi-c.c' || echo '$(srcdir)/'`sample-sources/papi-c.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po
@@ -8817,6 +9035,7 @@ clean-libtool:
-rm -rf gpu/instrumentation/.libs gpu/instrumentation/_libs
-rm -rf gpu/level0/.libs gpu/level0/_libs
-rm -rf gpu/nvidia/.libs gpu/nvidia/_libs
+ -rm -rf gpu/ompt/.libs gpu/ompt/_libs
-rm -rf gpu/opencl/.libs gpu/opencl/_libs
-rm -rf lush/.libs lush/_libs
-rm -rf lush-agents/.libs lush-agents/_libs
@@ -9079,6 +9298,8 @@ distclean-generic:
-rm -f gpu/level0/$(am__dirstamp)
-rm -f gpu/nvidia/$(DEPDIR)/$(am__dirstamp)
-rm -f gpu/nvidia/$(am__dirstamp)
+ -rm -f gpu/ompt/$(DEPDIR)/$(am__dirstamp)
+ -rm -f gpu/ompt/$(am__dirstamp)
-rm -f gpu/opencl/$(DEPDIR)/$(am__dirstamp)
-rm -f gpu/opencl/$(am__dirstamp)
-rm -f lush-agents/$(DEPDIR)/$(am__dirstamp)
@@ -9146,7 +9367,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
clean-pkglibLTLIBRARIES mostlyclean-am
distclean: distclean-recursive
- -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+ -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/ompt/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
@@ -9195,7 +9416,7 @@ install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-recursive
- -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
+ -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/ompt/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR)
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
diff --git a/src/tool/hpcrun/fnbounds/fnbounds_client.c b/src/tool/hpcrun/fnbounds/fnbounds_client.c
index a2b4bdf951..dfdb1f7129 100644
--- a/src/tool/hpcrun/fnbounds/fnbounds_client.c
+++ b/src/tool/hpcrun/fnbounds/fnbounds_client.c
@@ -454,7 +454,7 @@ launch_server(void)
struct {
int sendfd[2], recvfd[2];
} fds;
- bool sampling_is_running;
+ bool sampling_is_running = false;
pid_t child_pid;
// already running
@@ -472,11 +472,13 @@ launch_server(void)
return -1;
}
- // some sample sources need to be stopped in the parent, or else
- // they cause problems in the child.
- sampling_is_running = SAMPLE_SOURCES(started);
- if (sampling_is_running) {
- SAMPLE_SOURCES(stop);
+ if (hpcrun_is_initialized()){
+ // some sample sources need to be stopped in the parent, or else
+ // they cause problems in the child.
+ sampling_is_running = SAMPLE_SOURCES(started);
+ if (sampling_is_running) {
+ SAMPLE_SOURCES(stop);
+ }
}
// Give up a bit of our stack for the child shim. It doesn't need much.
diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c
new file mode 100644
index 0000000000..5b03c70503
--- /dev/null
+++ b/src/tool/hpcrun/gpu-monitors.c
@@ -0,0 +1,40 @@
+//
+// Created by dejan on 7/15/20.
+//
+
+#include "gpu-monitors.h"
+#include "hpcrun-malloc.h"
+
+
+static __thread gpu_monitor_node_t *gpu_monitor_list = NULL; // per-thread singly linked list of registered monitors
+
+// Prepend a copy of `node` to the calling thread's (__thread) monitor list.
+void
+gpu_monitor_register(gpu_monitor_node_t node)
+{
+  gpu_monitor_node_t *entry = hpcrun_malloc(sizeof(*entry));
+  if (entry == NULL) return;  // guard in case the allocator returns NULL
+  *entry = node;              // copies ci, enter_fn and exit_fn in one step
+  entry->next = gpu_monitor_list;
+  gpu_monitor_list = entry;
+}
+
+
+// Walk this thread's monitor list from the head, calling enter_fn(ci, cct_node)
+// or exit_fn(ci) on each node according to `type`; other values do nothing.
+void
+gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type)
+{
+  gpu_monitor_node_t *cur;
+
+  switch (type) {
+  case gpu_monitor_type_enter:
+    for (cur = gpu_monitor_list; cur != NULL; cur = cur->next)
+      cur->enter_fn(cur->ci, cct_node);
+    break;
+  case gpu_monitor_type_exit:
+    for (cur = gpu_monitor_list; cur != NULL; cur = cur->next)
+      cur->exit_fn(cur->ci);
+    break;
+  }
+}
diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h
new file mode 100644
index 0000000000..7fd2c0d334
--- /dev/null
+++ b/src/tool/hpcrun/gpu-monitors.h
@@ -0,0 +1,35 @@
+//
+// Created by dejan on 7/15/20.
+//
+
+#ifndef HPCTOOLKIT_GPU_MONITORS_H
+#define HPCTOOLKIT_GPU_MONITORS_H
+
+#include
+#include
+
+
+
+typedef enum {
+ gpu_monitor_type_enter, // gpu_monitors_apply() calls each monitor's enter_fn
+ gpu_monitor_type_exit // gpu_monitors_apply() calls each monitor's exit_fn
+} gpu_monitor_type_t;
+
+
+typedef void (*gpu_monitor_enter_fn_t)(papi_component_info_t *ci, cct_node_t *cct_node);
+typedef void (*gpu_monitor_exit_fn_t)(papi_component_info_t *ci);
+
+
+typedef struct gpu_monitor_node_t {
+ struct gpu_monitor_node_t * next; // next monitor in the list, NULL at the tail
+ papi_component_info_t *ci; // handed back as the first argument of both callbacks
+ gpu_monitor_enter_fn_t enter_fn; // called for gpu_monitor_type_enter
+ gpu_monitor_exit_fn_t exit_fn; // called for gpu_monitor_type_exit
+} gpu_monitor_node_t;
+
+
+extern void gpu_monitor_register(gpu_monitor_node_t node); // adds a copy of node to the calling thread's list
+extern void gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type); // runs enter/exit callbacks of that list
+
+
+#endif //HPCTOOLKIT_GPU_MONITORS_H
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.c b/src/tool/hpcrun/gpu/amd/hip-api.c
new file mode 100644
index 0000000000..be0d916b83
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/hip-api.c
@@ -0,0 +1,251 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//***************************************************************************
+//
+// File:
+// hip-api.c
+//
+// Purpose:
+// wrapper around AMD HIP layer
+//
+//***************************************************************************
+
+
+//*****************************************************************************
+// system include files
+//*****************************************************************************
+
+#include
+#include
+#include // memset
+
+// #include
+#include
+
+//*****************************************************************************
+// local include files
+//*****************************************************************************
+
+#include
+#include
+
+#include "hip-api.h"
+
+
+
+//*****************************************************************************
+// macros
+//*****************************************************************************
+
+#define HIP_FN_NAME(f) DYN_FN_NAME(f) /* identifier of the function pointer that stands in for HIP routine f */
+
+#define HIP_FN(fn, args) /* declares that pointer: returns hipError_t, takes args */ \
+ static hipError_t (*HIP_FN_NAME(fn)) args
+
+#define HPCRUN_HIP_API_CALL(fn, args) \
+do { /* single statement, so the macro is safe in an unbraced if/else */ \
+  hipError_t error_result = HIP_FN_NAME(fn) args; \
+  if (error_result != hipSuccess) { /* any HIP failure is fatal */ \
+    ETMSG(CUDA, "hip api %s returned %d", #fn, (int) error_result); \
+    exit(-1); \
+  } \
+} while (0)
+
+#define FORALL_HIP_ROUTINES(macro) /* applies macro to each HIP routine bound in hip_bind() */ \
+ macro(hipDeviceSynchronize) \
+ macro(hipDeviceGetAttribute) \
+ macro(hipCtxGetCurrent)
+
+//******************************************************************************
+// static data
+//******************************************************************************
+
+#ifndef HPCRUN_STATIC_LINK // one static function pointer per routine listed in FORALL_HIP_ROUTINES
+HIP_FN
+(
+ hipDeviceSynchronize,
+( void )
+);
+
+HIP_FN
+(
+ hipDeviceGetAttribute,
+ (
+ int *pi,
+ hipDeviceAttribute_t attrib,
+ int dev
+ )
+);
+
+HIP_FN
+(
+ hipCtxGetCurrent,
+ (
+ hipCtx_t *ctx
+ )
+);
+
+#endif // ! HPCRUN_STATIC_LINK
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+// TODO: Copied from cuda-api.c - check if works for hip
+#ifndef HPCRUN_STATIC_LINK
+// Map a device's compute-capability major version to the value stored in
+// hip_device_property_t.sm_blocks; `minor` is accepted but not consulted.
+static int
+hip_device_sm_blocks_query
+(
+  int major,
+  int minor
+)
+{
+  if (major == 6 || major == 7) {
+    return 32;
+  }
+
+  // TODO(Keren): add more devices
+  return 8;
+}
+#endif
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Open libamdhip64.so and bind every routine named in FORALL_HIP_ROUTINES.
+// Returns 0 when the CHK_* macros fall through, -1 in statically linked builds.
+int
+hip_bind
+(
+  void
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+  // dynamic libraries only available in non-static case
+  CHK_DLOPEN(hip, "libamdhip64.so", RTLD_NOW | RTLD_GLOBAL);
+
+#define HIP_BIND(fn) \
+  CHK_DLSYM(hip, fn);
+  FORALL_HIP_ROUTINES(HIP_BIND)
+#undef HIP_BIND  // keep the helper macro local to this function
+  return 0;
+#else
+  return -1;
+#endif // ! HPCRUN_STATIC_LINK
+}
+
+int // 0 on success, -1 when HPCRUN_STATIC_LINK is defined
+hip_context
+(
+ hipCtx_t *ctx
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+ HPCRUN_HIP_API_CALL(hipCtxGetCurrent, (ctx)); // stores the current context in *ctx; the macro exits the process on a HIP error
+ return 0;
+#else
+ return -1;
+#endif
+}
+
+int // 0 on success, -1 when HPCRUN_STATIC_LINK is defined; a failing HIP query exits the process via HPCRUN_HIP_API_CALL
+hip_device_property_query
+(
+ int device_id,
+ hip_device_property_t *property
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&property->sm_count, hipDeviceAttributeMultiprocessorCount, device_id));
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&property->sm_clock_rate, hipDeviceAttributeClockRate, device_id));
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&property->sm_shared_memory,
+ hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, device_id));
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&property->sm_registers,
+ hipDeviceAttributeMaxRegistersPerBlock, device_id));// per-block limit used where the original note names CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR - NOTE(review): confirm this substitution is intended
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&property->sm_threads, hipDeviceAttributeMaxThreadsPerMultiProcessor,
+ device_id));
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&property->num_threads_per_warp, hipDeviceAttributeWarpSize,
+ device_id));
+
+ int major = 0, minor = 0; // compute capability, used only to derive sm_blocks below
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&major, hipDeviceAttributeComputeCapabilityMajor, device_id));
+
+ HPCRUN_HIP_API_CALL(hipDeviceGetAttribute,
+ (&minor, hipDeviceAttributeComputeCapabilityMinor, device_id));
+
+ property->sm_blocks = hip_device_sm_blocks_query(major, minor); // not queried from HIP: looked up from the major version
+
+ return 0;
+#else
+ return -1;
+#endif
+}
+
+int // 0 once hipDeviceSynchronize succeeds (a HIP error exits); -1 in static builds
+hip_dev_sync
+(void)
+{
+#ifndef HPCRUN_STATIC_LINK
+  HPCRUN_HIP_API_CALL(hipDeviceSynchronize, () );
+  return 0;
+#else
+  return -1;
+#endif
+}
diff --git a/src/tool/hpcrun/gpu/amd/hip-api.h b/src/tool/hpcrun/gpu/amd/hip-api.h
new file mode 100644
index 0000000000..5d21ac5d6f
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/hip-api.h
@@ -0,0 +1,112 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//***************************************************************************
+//
+// File:
+// hip-api.h
+//
+// Purpose:
+// interface definitions for wrapper around AMD HIP layer
+//
+//***************************************************************************
+
+#ifndef hip_api_h
+#define hip_api_h
+
+
+
+//*****************************************************************************
+// rocm includes
+//*****************************************************************************
+
+// #include
+#include
+
+
+
+//*****************************************************************************
+// interface types
+//*****************************************************************************
+
+typedef struct hip_device_property {
+ int sm_count; // hipDeviceAttributeMultiprocessorCount
+ int sm_clock_rate; // hipDeviceAttributeClockRate
+ int sm_shared_memory; // hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
+ int sm_registers; // hipDeviceAttributeMaxRegistersPerBlock
+ int sm_threads; // hipDeviceAttributeMaxThreadsPerMultiProcessor
+ int sm_blocks; // derived from the compute-capability major version, not queried from HIP
+ int num_threads_per_warp; // hipDeviceAttributeWarpSize
+} hip_device_property_t;
+
+
+//*****************************************************************************
+// interface operations
+//*****************************************************************************
+
+// returns 0 on success, -1 when built with HPCRUN_STATIC_LINK
+int
+hip_bind
+(
+ void
+);
+
+// returns 0 on success, -1 when built with HPCRUN_STATIC_LINK; writes the current context to *ctx
+int
+hip_context
+(
+ hipCtx_t *ctx
+);
+
+// returns 0 on success, -1 when built with HPCRUN_STATIC_LINK; fills *property for device_id
+int
+hip_device_property_query
+(
+ int device_id,
+ hip_device_property_t *property
+);
+
+int
+hip_dev_sync(void); // returns 0 on success, -1 when built with HPCRUN_STATIC_LINK
+
+#endif //hip_api_h
diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
index 1044368644..e66672bff3 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
+++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c
@@ -58,8 +58,7 @@
//******************************************************************************
#include
-
-#include "rocm-debug-api.h"
+#include
#include "rocm-binary-processing.h"
#include
#include
@@ -96,6 +95,12 @@ typedef struct amd_gpu_binary {
amd_gpu_binary_t* binary_list = NULL;
+// A spin lock to serialize two AMD GPU binary operations:
+// 1. parse and add a code object to the binary list
+// 2. look up a function name from the binary list
+static spinlock_t rocm_binary_list_lock;
+
+
//******************************************************************************
// private operations
//******************************************************************************
@@ -313,55 +318,33 @@ file_uri_exists
return 0;
}
-static int
+static void
parse_amd_gpu_binary
(
- void
+ const char* uri
)
{
- // rocm debug api library creates a new thread through std::thread.
- // This breaks automatic thread ignoring code because we only check
- // the caller of pthread_create. So, we manually ignore the new thread.
- monitor_disable_new_threads();
-
- rocm_debug_api_init();
- size_t code_object_count;
- rocm_debug_api_query_code_object(&code_object_count);
-
- for (size_t i = 0; i < code_object_count; ++i) {
- char* uri = rocm_debug_api_query_uri(i);
- PRINT("uri %d, %s\n", i, uri);
-
- // Handle file URIs
- if (strncmp(uri, "file://", strlen("file://")) == 0) {
- if (file_uri_exists(uri)) continue;
-
- // Handle a new AMD GPU binary
- amd_gpu_binary_t* bin = (amd_gpu_binary_t*) malloc(sizeof(amd_gpu_binary_t));
- bin->uri = strdup(uri);
- bin->next = binary_list;
- binary_list = bin;
-
- // Parse URI to extract the binary
- parse_amd_gpu_binary_uri(uri, bin);
-
- // Parse the ELF symbol table
- elf_version(EV_CURRENT);
- Elf *elf = elf_memory(bin->buf, bin->size);
- if (elf != 0) {
- construct_amd_gpu_symbols(elf, &(bin->function_table));
- elf_end(elf);
- }
+  // Handle file URIs
+  if (strncmp(uri, "file://", strlen("file://")) == 0) {
+    if (file_uri_exists(uri)) return;
+
+    // Handle a new AMD GPU binary. Zero-fill the record so its buffer and
+    // function table read as empty if URI or ELF parsing below fails.
+    amd_gpu_binary_t* bin = (amd_gpu_binary_t*) calloc(1, sizeof(amd_gpu_binary_t));
+    if (bin == NULL || (bin->uri = strdup(uri)) == NULL) { free(bin); return; }
+    bin->next = binary_list;
+    binary_list = bin;
+    // Parse URI to extract the binary
+    parse_amd_gpu_binary_uri(uri, bin);
+
+    // Parse the ELF symbol table
+    elf_version(EV_CURRENT);
+    Elf *elf = elf_memory(bin->buf, bin->size);
+    if (elf != 0) {
+      construct_amd_gpu_symbols(elf, &(bin->function_table));
+      elf_end(elf);
+    }
}
-
- rocm_debug_api_fini();
-
- // Now we are done with the rocm debug api.
- // we enable tracing threads
- monitor_enable_new_threads();
-
- return 0;
}
// TODO:
@@ -408,19 +391,31 @@ rocm_binary_function_lookup
)
{
// TODO:
- // 1. Handle multi-threaded case. Currently, this function is called when the first
- // HIP kernel launch is done. So multiple threads can enter this concurrently.
- // 2. Currently we support multiple GPU binaries, but assume that kernel is unique
+ // 1. Currently we support multiple GPU binaries, but assume that kernel is unique
// across GPU binaries.
- if (binary_list == NULL) {
- if (parse_amd_gpu_binary() < 0) {
- // Allocate a placeholder binary
- binary_list = (amd_gpu_binary_t*)malloc(sizeof(amd_gpu_binary_t));
- binary_list->next = NULL;
- binary_list->function_table.size = 0;
- }
- }
+ spinlock_lock(&rocm_binary_list_lock);
ip_normalized_t nip = lookup_amd_function(kernel_name);
PRINT("HIP launch kernel %s, lm_ip %lx\n", kernel_name, nip.lm_ip);
+ spinlock_unlock(&rocm_binary_list_lock);
return nip;
}
+
+void // parses the code object at `uri` into the binary list while holding rocm_binary_list_lock
+rocm_binary_uri_add
+(
+ const char* uri
+)
+{
+ spinlock_lock(&rocm_binary_list_lock);
+ parse_amd_gpu_binary(uri);
+ spinlock_unlock(&rocm_binary_list_lock);
+}
+
+void // initializes rocm_binary_list_lock; NOTE(review): presumably must run before the first add/lookup - confirm call order
+rocm_binary_uri_list_init
+(
+ void
+)
+{
+ spinlock_init(&rocm_binary_list_lock);
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h
index 9300ffa710..0fa592e823 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h
+++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h
@@ -60,4 +60,16 @@ rocm_binary_function_lookup
const char* kernel_name
);
+void // parse the code object named by `uri` and add it to the GPU binary list
+rocm_binary_uri_add
+(
+ const char* uri
+);
+
+void // initialize the spin lock that guards the GPU binary list
+rocm_binary_uri_list_init
+(
+ void
+);
+
#endif
diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c b/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
deleted file mode 100644
index b20b664f9b..0000000000
--- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c
+++ /dev/null
@@ -1,312 +0,0 @@
-// -*-Mode: C++;-*- // technically C99
-
-// * BeginRiceCopyright *****************************************************
-//
-// --------------------------------------------------------------------------
-// Part of HPCToolkit (hpctoolkit.org)
-//
-// Information about sources of support for research and development of
-// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
-// --------------------------------------------------------------------------
-//
-// Copyright ((c)) 2002-2022, Rice University
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of Rice University (RICE) nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// This software is provided by RICE and contributors "as is" and any
-// express or implied warranties, including, but not limited to, the
-// implied warranties of merchantability and fitness for a particular
-// purpose are disclaimed. In no event shall RICE or contributors be
-// liable for any direct, indirect, incidental, special, exemplary, or
-// consequential damages (including, but not limited to, procurement of
-// substitute goods or services; loss of use, data, or profits; or
-// business interruption) however caused and on any theory of liability,
-// whether in contract, strict liability, or tort (including negligence
-// or otherwise) arising in any way out of the use of this software, even
-// if advised of the possibility of such damage.
-//
-// ******************************************************* EndRiceCopyright *
-
-//******************************************************************************
-// system includes
-//******************************************************************************
-
-#include "amd-dbgapi.h"
-
-#include
-#include
-
-//******************************************************************************
-// local includes
-//******************************************************************************
-
-#include "rocm-debug-api.h"
-
-#include
-#include
-#include
-
-//******************************************************************************
-// macros
-//******************************************************************************
-
-#define FORALL_ROCM_DEBUG_ROUTINES(macro) \
- macro(amd_dbgapi_initialize) \
- macro(amd_dbgapi_process_attach) \
- macro(amd_dbgapi_process_detach) \
- macro(amd_dbgapi_process_code_object_list) \
- macro(amd_dbgapi_code_object_get_info)
-
-
-#define ROCM_DEBUG_FN_NAME(f) DYN_FN_NAME(f)
-
-#define ROCM_DEBUG_FN(fn, args) \
- static amd_dbgapi_status_t (*ROCM_DEBUG_FN_NAME(fn)) args
-
-#define HPCRUN_ROCM_DEBUG_CALL(fn, args) \
-{ \
- amd_dbgapi_status_t ret = ROCM_DEBUG_FN_NAME(fn) args; \
- check_rocm_debug_status(ret, __LINE__); \
-}
-
-//******************************************************************************
-// debug print
-//******************************************************************************
-
-#define DEBUG 0
-
-#include "hpcrun/gpu/gpu-print.h"
-
-//******************************************************************************
-// local variables
-//******************************************************************************
-
-static amd_dbgapi_callbacks_t callbacks;
-static amd_dbgapi_process_id_t self;
-static amd_dbgapi_code_object_id_t *code_objects_id;
-
-//----------------------------------------------------------
-// rocm debug api function pointers for late binding
-//----------------------------------------------------------
-
-ROCM_DEBUG_FN
-(
- amd_dbgapi_initialize,
- (
- amd_dbgapi_callbacks_t*
- )
-);
-
-ROCM_DEBUG_FN
-(
- amd_dbgapi_process_attach,
- (
- amd_dbgapi_client_process_id_t,
- amd_dbgapi_process_id_t *
- )
-);
-
-ROCM_DEBUG_FN
-(
- amd_dbgapi_process_detach,
- (
- amd_dbgapi_process_id_t
- )
-);
-
-ROCM_DEBUG_FN
-(
- amd_dbgapi_process_code_object_list,
- (
- amd_dbgapi_process_id_t,
- size_t *,
- amd_dbgapi_code_object_id_t **,
- amd_dbgapi_changed_t *
- )
-);
-
-ROCM_DEBUG_FN
-(
- amd_dbgapi_code_object_get_info,
- (
- amd_dbgapi_code_object_id_t,
- amd_dbgapi_code_object_info_t,
- size_t,
- void*
- )
-);
-
-//******************************************************************************
-// private operations
-//******************************************************************************
-
-static amd_dbgapi_status_t
-hpcrun_self_process
-(
- amd_dbgapi_client_process_id_t cp,
- amd_dbgapi_os_process_id_t *os_pid
-)
-{
- *os_pid = getpid();
- return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static amd_dbgapi_status_t
-hpcrun_insert_breakpoint
-(
- amd_dbgapi_client_process_id_t client_process_id,
- amd_dbgapi_global_address_t address,
- amd_dbgapi_breakpoint_id_t breakpoint_id
-)
-{
- return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static amd_dbgapi_status_t
-hpcrun_remove_breakpoint
-(
- amd_dbgapi_client_process_id_t client_process_id,
- amd_dbgapi_breakpoint_id_t breakpoint_id
-)
-{
- return AMD_DBGAPI_STATUS_SUCCESS;
-}
-
-static void
-hpcrun_log_message
-(
- amd_dbgapi_log_level_t level,
- const char *message
-)
-{
- PRINT("%s\n", message);
-}
-
-static void
-check_rocm_debug_status
-(
- amd_dbgapi_status_t ret,
- int lineNo
-)
-{
- if (ret == AMD_DBGAPI_STATUS_SUCCESS) {
- return;
- }
-
-#define CHECK_RET(x) case x: { PRINT("%s", #x); break; }
- switch(ret) {
- CHECK_RET(AMD_DBGAPI_STATUS_FATAL)
- CHECK_RET(AMD_DBGAPI_STATUS_ERROR_NOT_INITIALIZED)
- CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_PROCESS_ID)
- CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT)
- CHECK_RET(AMD_DBGAPI_STATUS_ERROR_CLIENT_CALLBACK)
- CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_CODE_OBJECT_ID)
- CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT_COMPATIBILITY)
- default:
- PRINT("unknown rocm debug return value");
- break;
- }
-
-#undef CHECK_RET
-
- PRINT(" at line %d\n", lineNo);
-}
-
-//******************************************************************************
-// interface operations
-//******************************************************************************
-
-int
-rocm_debug_api_bind
-(
- void
-)
-{
- // This disable HIP's deferred code object loading.
- // We can remove this when we start to use HSA API tracing
- setenv("HIP_ENABLE_DEFERRED_LOADING", "0", 1);
-
-#ifndef HPCRUN_STATIC_LINK
- // dynamic libraries only availabile in non-static case
- hpcrun_force_dlopen(true);
- CHK_DLOPEN(rocm_debug, "librocm-dbgapi.so", RTLD_NOW | RTLD_GLOBAL);
- hpcrun_force_dlopen(false);
-
-#define ROCM_DEBUG_BIND(fn) \
- CHK_DLSYM(rocm_debug, fn);
-
- FORALL_ROCM_DEBUG_ROUTINES(ROCM_DEBUG_BIND);
-
-#undef ROCM_DEBUG_BIND
- return DYNAMIC_BINDING_STATUS_OK;
-#else
- return DYNAMIC_BINDING_STATUS_ERROR;
-#endif // ! HPCRUN_STATIC_LINK
-}
-
-void
-rocm_debug_api_init
-(
- void
-)
-{
- // Fill in call back functions for rocm debug api
- callbacks.allocate_memory = malloc;
- callbacks.deallocate_memory = free;
- callbacks.get_os_pid = hpcrun_self_process;
- callbacks.insert_breakpoint = hpcrun_insert_breakpoint;
- callbacks.remove_breakpoint = hpcrun_remove_breakpoint;
- callbacks.log_message = hpcrun_log_message;
-
- HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_initialize, (&callbacks));
- HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_attach,
- ((amd_dbgapi_client_process_id_t)(&self), &self));
-}
-
-void
-rocm_debug_api_fini
-(
- void
-)
-{
- HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_detach, (self));
-}
-
-void
-rocm_debug_api_query_code_object
-(
- size_t* code_object_count_ptr
-)
-{
- HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_code_object_list,
- (self, code_object_count_ptr, &code_objects_id, NULL));
- PRINT("code object count %u\n", *code_object_count_ptr);
-}
-
-char*
-rocm_debug_api_query_uri
-(
- size_t code_object_index
-)
-{
- char* uri;
- HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_code_object_get_info,
- (code_objects_id[code_object_index],
- AMD_DBGAPI_CODE_OBJECT_INFO_URI_NAME,
- sizeof(char*), (void*)(&uri)));
- return uri;
-}
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
new file mode 100644
index 0000000000..9e3daac034
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c
@@ -0,0 +1,674 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "rocprofiler-api.h"
+#include "rocm-binary-processing.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+#define DEBUG 0
+
+#include "hpcrun/gpu/gpu-print.h"
+//******************************************************************************
+// macros
+//******************************************************************************
+
+
+#define PUBLIC_API __attribute__((visibility("default")))
+
+#define FORALL_ROCPROFILER_ROUTINES(macro) \
+ macro(rocprofiler_open) \
+ macro(rocprofiler_close) \
+ macro(rocprofiler_get_metrics) \
+ macro(rocprofiler_set_queue_callbacks) \
+ macro(rocprofiler_start_queue_callbacks) \
+ macro(rocprofiler_stop_queue_callbacks) \
+ macro(rocprofiler_remove_queue_callbacks) \
+ macro(rocprofiler_iterate_info) \
+ macro(rocprofiler_group_get_data) \
+ macro(rocprofiler_get_group)
+
+
+
+#define ROCPROFILER_FN_NAME(f) DYN_FN_NAME(f)
+
+#define ROCPROFILER_FN(fn, args) \
+ static hsa_status_t (*ROCPROFILER_FN_NAME(fn)) args
+
+#define HPCRUN_ROCPROFILER_CALL(fn, args) \
+do { /* call the late-bound fn; on failure print rocprofiler's message and abort */ \
+  hsa_status_t status = ROCPROFILER_FN_NAME(fn) args; \
+  if (status != HSA_STATUS_SUCCESS) { \
+    const char* error_string = NULL; \
+    rocprofiler_error_string(&error_string); \
+    fprintf(stderr, "ERROR: %s\n", error_string ? error_string : "(no error string)"); \
+    abort(); \
+  } \
+} while (0)
+
+typedef struct {
+ bool valid; // false while the dispatch callback is filling this entry, true once it is complete
+ hsa_agent_t agent; // agent copied from the dispatch callback data
+ rocprofiler_group_t group; // group 0 of the context opened for the dispatch
+ rocprofiler_callback_data_t data; // copy of the data passed to the dispatch callback
+} hpcrun_amd_counter_data_t;
+
+//******************************************************************************
+// local variables
+//******************************************************************************
+
+// Currently we serialize kernel execution when collecting counters.
+// So we have one global correlation id, counter data storage,
+// and one variable indicating whether the processing is finished or not
+static hpcrun_amd_counter_data_t counter_data;
+static uint64_t rocprofiler_correlation_id;
+static volatile int context_callback_finish;
+
+static bool rocprofiler_initialized = false;
+
+// total number of counters supported by rocprofiler,
+// an array of their string names, and an array of their description
+static int total_counters = 0;
+static const char** counter_name = NULL;
+static const char** counter_description = NULL;
+
+// the list of counters specified at the command line
+static int *is_specified_by_user = NULL;
+static int total_requested = 0;
+static rocprofiler_feature_t* rocprofiler_input = NULL;
+static const char** requested_counter_name = NULL;
+static const char** requested_counter_description = NULL;
+
+// A spin lock to serialize GPU kernels
+static spinlock_t kernel_lock;
+
+//----------------------------------------------------------
+// rocprofiler function pointers for late binding
+//----------------------------------------------------------
+
+ROCPROFILER_FN
+(
+ rocprofiler_open,
+ (
+ hsa_agent_t agent, // GPU handle
+ rocprofiler_feature_t* features, // [in/out] profiling feature array
+ uint32_t feature_count, // profiling feature count
+ rocprofiler_t** context, // [out] profiling context handle
+ uint32_t mode, // profiling mode mask
+ rocprofiler_properties_t* properties // profiler properties
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_close,
+ (
+ rocprofiler_t* context // [in] profiling context
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_get_metrics,
+ (
+ rocprofiler_t* context // [in/out] profiling context
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_set_queue_callbacks,
+ (
+ rocprofiler_queue_callbacks_t callbacks, // callbacks
+ void* data
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_start_queue_callbacks,
+ (
+ void
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_stop_queue_callbacks,
+ (
+ void
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_remove_queue_callbacks,
+ (
+ void
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_iterate_info,
+ (
+ const hsa_agent_t* agent, // [in] GPU handle, NULL for all
+ // GPU agents
+ rocprofiler_info_kind_t kind, // kind of iterated info
+ hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback
+ void *data
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_group_get_data,
+ (
+ rocprofiler_group_t* group // [in/out] profiling group
+ )
+);
+
+ROCPROFILER_FN
+(
+ rocprofiler_get_group,
+ (
+ rocprofiler_t* context, // [in/out] profiling context,
+ // will be returned as
+ // a part of the group structure
+ uint32_t index, // [in] group index
+ rocprofiler_group_t* group // [out] profiling group
+ )
+);
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+static const char *
+rocprofiler_path
+(
+  void
+)
+{
+  // Library name only, no directory component; rocprofiler_bind
+  // hands this string to CHK_DLOPEN.
+  return "librocprofiler64.so";
+}
+
+static void
+translate_rocprofiler_output
+(
+  gpu_activity_t* ga
+)
+{
+  // Translate counter results stored in rocprofiler_feature_t
+  // to hpcrun's gpu_activity_t data structure
+  rocprofiler_feature_t** features = counter_data.group.features;
+  unsigned feature_count = counter_data.group.feature_count;
+
+  ga->kind = GPU_ACTIVITY_COUNTER;
+  ga->details.counters.correlation_id = rocprofiler_correlation_id;
+
+  // This function should be called by rocprofiler thread,
+  // which is not monitored. So, this function will not be called
+  // inside a signal handler and we can call malloc.
+  // The memory is freed when we attribute this gpu_activity_t.
+  uint64_t* values = (uint64_t*) malloc(sizeof(uint64_t) * feature_count);
+  if (values == NULL) feature_count = 0;  // allocation failed: report no counters
+  // rocprofiler should pass metric results in the same order
+  // that we pass metrics as input to rocprofiler
+  for (unsigned i = 0; i < feature_count; ++i) {
+    values[i] = features[i]->data.result_int64;
+  }
+  ga->details.counters.total_counters = feature_count;
+  ga->details.counters.values = values;
+}
+
+// Profiling completion handler, installed as properties.handler at open time.
+// Publishes the counter values of the finished dispatch, then closes the
+// context that was opened for it. Always returns false.
+static bool
+rocprofiler_context_handler
+(
+  rocprofiler_group_t group,
+  void* arg
+)
+{
+  hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
+
+  // This wait-loop is taken from rocprofiler example.
+  // It is strange that the rocprofiler thread will have to
+  // wait for subscriber callback to finish.
+  volatile bool valid = counter_data.valid;
+  while (!valid) {
+    sched_yield();
+    valid = counter_data.valid;
+  }
+
+  rocprofiler_t* context = counter_data.group.context;
+  if (context == NULL) {
+    EMSG("error: AMD group->context = NULL");
+  } else if (counter_data.group.feature_count > 0) {
+    HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (&counter_data.group));
+    HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (context));
+  }
+
+  gpu_activity_t ga;
+  memset(&ga, 0, sizeof(gpu_activity_t));
+  cstack_ptr_set(&(ga.next), 0);
+  translate_rocprofiler_output(&ga);
+
+  // Consume the correlation channel for rocprofiler
+  gpu_monitoring_thread_activities_ready_with_idx(ROCPROFILER_CHANNEL_IDX);
+  if (gpu_correlation_id_map_lookup(rocprofiler_correlation_id) == NULL) {
+    gpu_correlation_id_map_insert(rocprofiler_correlation_id, rocprofiler_correlation_id);
+  }
+  gpu_activity_process(&ga);
+  // One context is opened per dispatch; close it here so it is not leaked.
+  if (context != NULL) HPCRUN_ROCPROFILER_CALL(rocprofiler_close, (context));
+  context_callback_finish = 1;  // releases rocprofiler_wait_context_callback
+  return false;
+}
+
+static hsa_status_t
+rocprofiler_dispatch_callback
+(
+  const rocprofiler_callback_data_t* callback_data,
+  void* arg,
+  rocprofiler_group_t* group
+) {
+  // Dispatch hook installed through rocprofiler_set_queue_callbacks.
+  // With no requested counters there is nothing to set up.
+  if (total_requested == 0) return HSA_STATUS_SUCCESS;
+
+  // GPU agent reported in the callback data
+  hsa_agent_t agent = callback_data->agent;
+  // Completion of the context is reported to rocprofiler_context_handler
+  rocprofiler_t* context = NULL;
+  rocprofiler_properties_t properties = {};
+  properties.handler = rocprofiler_context_handler;
+  properties.handler_arg = NULL;
+
+  // Mark the shared entry incomplete until every field below is written;
+  // the context handler spins on this flag.
+  counter_data.valid = false;
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_open, (agent, rocprofiler_input, total_requested,
+                                            &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties));
+
+  // Get group[0]
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group));
+
+  // Fill profiling context entry
+  counter_data.agent = agent;
+  counter_data.group = *group;
+  counter_data.data = *callback_data;
+  counter_data.valid = true;
+  return HSA_STATUS_SUCCESS;
+}
+
+static hsa_status_t
+total_counter_accumulator
+(
+  const rocprofiler_info_data_t info,
+  void *data
+)
+{
+  ++total_counters;  // counting pass only: both arguments are ignored
+  return HSA_STATUS_SUCCESS;
+}
+
+static hsa_status_t
+counter_info_accumulator
+(
+  const rocprofiler_info_data_t info,
+  void *data
+)
+{
+  if (getenv("HPCRUN_PRINT_ROCPROFILER_COUNTER_DETAILS")) {
+    printf("Enter counter_info_accumulator\n"
+           "\tname %s\n"
+           "\tinstances %d\n"
+           "\texpr %s\n"
+           "\tblock name %s\n"
+           "\tblock_counters %d\n", info.metric.name, info.metric.instances,
+           info.metric.expr, info.metric.block_name, info.metric.block_counters);
+  }
+  counter_name[total_counters] = strdup(info.metric.name);
+  counter_description[total_counters++] = strdup(info.metric.description);
+  return HSA_STATUS_SUCCESS;
+}
+
+static void
+initialize_counter_information
+(
+  void
+)
+{
+  // First we iterate over all counters to get the total
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_iterate_info,
+    (NULL, ROCPROFILER_INFO_KIND_METRIC, total_counter_accumulator, NULL));
+
+  // Allocate the name/description tables and the zeroed per-counter request flags
+  counter_name = (const char**) malloc(total_counters * sizeof(const char*));
+  counter_description = (const char**) malloc(total_counters * sizeof(const char*));
+  is_specified_by_user = (int*) calloc(total_counters, sizeof(int));
+  if (total_counters > 0 && (!counter_name || !counter_description || !is_specified_by_user)) {
+    EEMSG("hpcrun: out of memory while recording %d rocprofiler counters", total_counters);
+    monitor_real_exit(-1);
+  }
+  // Fill in name and description string for each counter
+  total_counters = 0;
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_iterate_info,
+    (NULL, ROCPROFILER_INFO_KIND_METRIC, counter_info_accumulator, NULL));
+}
+
+// Code-object event callback, registered in rocprofiler_uri_setup.
+// This function should be implemented in roctracer-api.c, but due to c++ism
+// in AMD software, I can only include rocprofiler header files in one .o
+static void
+roctracer_codeobj_callback
+(
+ uint32_t domain,
+ uint32_t cid,
+ const void* data,
+ void* arg
+)
+{
+ const hsa_evt_data_t* evt_data = (const hsa_evt_data_t*)(data);
+ const char* uri = evt_data->codeobj.uri;
+ rocm_binary_uri_add(uri); // NOTE(review): presumably copies the string, since uri is freed below - confirm
+ PRINT("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n",
+ domain,
+ cid,
+ evt_data->codeobj.load_base,
+ evt_data->codeobj.load_size,
+ evt_data->codeobj.load_delta,
+ uri);
+ free((void*)uri); // the event's uri string is released here; it is not used afterwards
+}
+
+//******************************************************************************
+// AMD hidden interface operations
+//******************************************************************************
+
+// Exported with default visibility. This is necessary for rocprofiler callback to work
+extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){
+ // Enable hsa interception for getting code object URIs
+ settings->hsa_intercepting = 1;
+}
+
+extern PUBLIC_API void OnUnloadTool() {
+ // Intentionally empty. Must be provided. Otherwise rocprofiler
+ // will refuse to work
+}
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+
+void
+rocprofiler_start_kernel
+(
+ uint64_t cor
+)
+{
+ spinlock_lock(&kernel_lock); // stays held until rocprofiler_stop_kernel unlocks it
+ rocprofiler_correlation_id = cor; // file-wide id later read by the context handler
+ // We will only allow the critical section
+ // to finish after we get rocprofiler results
+ context_callback_finish = 0;
+ HPCRUN_ROCPROFILER_CALL(rocprofiler_start_queue_callbacks, ());
+}
+
+
+void rocprofiler_stop_kernel(void){  // (void) matches the prototype in rocprofiler-api.h
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_stop_queue_callbacks, ());
+  spinlock_unlock(&kernel_lock);  // releases the lock taken in rocprofiler_start_kernel
+}
+
+
+void
+rocprofiler_init
+(
+ void
+)
+{
+ if (rocprofiler_initialized) { // one-time setup: later calls return immediately
+ return;
+ }
+ // Ensure librocprofiler64.so is loaded
+ // and initialize all rocprofiler API function pointers
+ rocprofiler_initialized = true; // NOTE(review): plain bool set before the work below, no locking - confirm callers are single-threaded
+
+#ifndef HPCRUN_STATIC_LINK
+ // We usually bind GPU vendor library in finalize_event_list.
+ // But here we must do early binding to query supported list of counters
+ if (rocprofiler_bind() != DYNAMIC_BINDING_STATUS_OK) {
+ EEMSG("hpcrun: unable to bind to AMD rocprofiler library %s\n", dlerror());
+ monitor_real_exit(-1);
+ }
+#endif
+
+ rocprofiler_queue_callbacks_t callbacks_ptrs = {};
+ callbacks_ptrs.dispatch = rocprofiler_dispatch_callback; // only the dispatch hook is set
+ HPCRUN_ROCPROFILER_CALL(rocprofiler_set_queue_callbacks, (callbacks_ptrs, NULL));
+
+ initialize_counter_information(); // fills counter_name / counter_description
+
+ // Initialize the spin lock used to serialize GPU kernel launches
+ spinlock_init(&kernel_lock);
+ return;
+}
+
+
+void
+rocprofiler_fini
+(
+  void *args,
+  int how
+)
+{
+  // args and how are not used here; only the queue callbacks are removed.
+  HPCRUN_ROCPROFILER_CALL(rocprofiler_remove_queue_callbacks, ());
+}
+
+
+
+int
+rocprofiler_bind
+(
+ void
+)
+{
+#ifndef HPCRUN_STATIC_LINK
+ // dynamic libraries only available in non-static case
+ hpcrun_force_dlopen(true);
+ CHK_DLOPEN(rocprofiler, rocprofiler_path(), RTLD_NOW | RTLD_GLOBAL);
+ hpcrun_force_dlopen(false);
+
+#define ROCPROFILER_BIND(fn) \
+ CHK_DLSYM(rocprofiler, fn);
+
+ FORALL_ROCPROFILER_ROUTINES(ROCPROFILER_BIND);
+
+#undef ROCPROFILER_BIND
+
+ hpcrun_force_dlopen(true);
+ // (a guard on getenv("HPCRUN_LIST_EVENT") around the next two lines is disabled)
+ CHK_DLOPEN(hsa, "libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
+ hsa_init();
+ // NOTE(review): the status returned by hsa_init() is not checked - confirm that is intended
+ hpcrun_force_dlopen(false);
+
+ return DYNAMIC_BINDING_STATUS_OK;
+#else
+ return DYNAMIC_BINDING_STATUS_ERROR;
+#endif // ! HPCRUN_STATIC_LINK
+}
+
+void
+rocprofiler_wait_context_callback
+(
+  void
+)
+{
+  // Busy-wait: rocprofiler_context_handler stores 1 into
+  // context_callback_finish (a volatile int) once it has finished
+  // processing the rocprofiler data.
+  while (context_callback_finish == 0) { /* spin */ }
+}
+
+int
+rocprofiler_total_counters
+(
+ void
+)
+{
+ return total_counters; // 0 until initialize_counter_information has run
+}
+
+const char*
+rocprofiler_counter_name
+(
+  int idx
+)
+{
+  bool usable = (counter_name != NULL) && (idx >= 0) && (idx < total_counters);
+  return usable ? counter_name[idx] : NULL;
+}
+
+const char*
+rocprofiler_counter_description
+(
+  int idx
+)
+{
+  bool usable = (counter_description != NULL) && (idx >= 0) && (idx < total_counters);
+  return usable ? counter_description[idx] : NULL;
+}
+
+int
+rocprofiler_match_event
+(
+  const char* ev_str
+)
+{
+  // Exact strcmp match on the counter name; the first hit is flagged as requested.
+  for (int idx = 0; idx < total_counters; ++idx) {
+    if (strcmp(ev_str, counter_name[idx]) != 0) continue;
+    is_specified_by_user[idx] = 1;
+    return 1;
+  }
+  return 0;  // no rocprofiler counter has this name
+}
+
+void
+rocprofiler_finalize_event_list
+(
+  void
+)
+{
+  // Count into a local so a repeated call cannot inflate total_requested
+  int requested = 0;
+  for (int i = 0; i < total_counters; i++) {
+    if (is_specified_by_user[i] == 1) requested += 1;
+  }
+  rocprofiler_input = (rocprofiler_feature_t*) calloc(requested, sizeof(rocprofiler_feature_t));
+  requested_counter_name = (const char**) malloc(sizeof(const char*) * requested);
+  requested_counter_description = (const char**) malloc(sizeof(const char*) * requested);
+  if (requested > 0 && (!rocprofiler_input || !requested_counter_name || !requested_counter_description)) {
+    EEMSG("hpcrun: out of memory while preparing %d rocprofiler counters", requested);
+    monitor_real_exit(-1);
+  }
+  for (int i = 0, cur_id = 0; i < total_counters; i++) {
+    if (is_specified_by_user[i] != 1) continue;
+    rocprofiler_input[cur_id].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+    rocprofiler_input[cur_id].name = counter_name[i];
+    requested_counter_name[cur_id] = counter_name[i];
+    requested_counter_description[cur_id] = counter_description[i];
+    cur_id += 1;
+  }
+  // Publish the count only after the arrays it indexes have been filled in
+  total_requested = requested;
+  gpu_metrics_GPU_CTR_enable(total_requested, requested_counter_name, requested_counter_description);
+}
+
+void
+rocprofiler_uri_setup
+(
+ void
+)
+{
+ // Ask roctracer to set up code object URI callbacks
+ // TODO: this really should be implemented in roctracer-api.c,
+ // however, due to an AMD header file that is not fully C compatible,
+ // I can only include rocprofiler header file in one source file.
+ rocm_binary_uri_list_init();
+ roctracer_enable_op_callback( // NOTE(review): the returned status is ignored - confirm
+ ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, roctracer_codeobj_callback, NULL
+ );
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
new file mode 100644
index 0000000000..267db702c0
--- /dev/null
+++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h
@@ -0,0 +1,136 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2022, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef rocprofiler_api_h
+#define rocprofiler_api_h
+
+#include <stdint.h>  // uint64_t in rocprofiler_start_kernel's prototype
+
+//******************************************************************************
+// macro definitions
+//******************************************************************************
+
+// channel indices handed to the gpu_*_with_idx routines by the AMD code
+#define ROCTRACER_CHANNEL_IDX 0
+#define ROCPROFILER_CHANNEL_IDX 1
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// start/stop bracket one kernel launch: start takes a lock that stop releases
+void
+rocprofiler_start_kernel
+(
+  uint64_t cor  // correlation id recorded for the kernel
+);
+
+void
+rocprofiler_stop_kernel
+(
+  void
+);
+
+// one-time setup; calls after the first return immediately
+void
+rocprofiler_init
+(
+  void
+);
+
+void
+rocprofiler_fini
+(
+  void *args,  // not used by the implementation
+  int how      // not used by the implementation
+);
+
+int
+rocprofiler_bind
+(
+  void
+);
+
+void
+rocprofiler_wait_context_callback
+(
+  void
+);
+
+int
+rocprofiler_total_counters
+(
+  void
+);
+
+const char*
+rocprofiler_counter_name
+(
+  int idx  // NULL is returned when idx is out of range
+);
+
+const char*
+rocprofiler_counter_description
+(
+  int idx  // NULL is returned when idx is out of range
+);
+
+int
+rocprofiler_match_event
+(
+  const char* ev_str  // counter name; returns 1 on an exact match, else 0
+);
+
+void
+rocprofiler_finalize_event_list
+(
+  void
+);
+
+void
+rocprofiler_uri_setup
+(
+  void
+);
+
+#endif // rocprofiler_api_h
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c
index d02a081d27..db45e6e407 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.c
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c
@@ -47,11 +47,15 @@
#include "roctracer-api.h"
#include "roctracer-activity-translate.h"
-#include "rocm-debug-api.h"
+
+#include "hip-api.h"
#include "rocm-binary-processing.h"
+#include "tool_state.h"
#include
+#include
+
#include
#include
#include
@@ -66,11 +70,18 @@
#include
+
+#include "rocprofiler-api.h"
+
//******************************************************************************
// macros
//******************************************************************************
-#define FORALL_ROCTRACER_ROUTINES(macro) \
+#define DEBUG 0
+#include
+
+
+#define FORALL_ROCTRACER_ROUTINES(macro) \
macro(roctracer_open_pool_expl) \
macro(roctracer_flush_activity_expl) \
macro(roctracer_activity_push_external_correlation_id) \
@@ -81,7 +92,6 @@
macro(roctracer_disable_domain_activity) \
macro(roctracer_set_properties)
-
#define ROCTRACER_FN_NAME(f) DYN_FN_NAME(f)
#define ROCTRACER_FN(fn, args) \
@@ -89,10 +99,10 @@
#define HPCRUN_ROCTRACER_CALL(fn, args) \
{ \
- roctracer_status_t status = ROCTRACER_FN_NAME(fn) args; \
- if (status != ROCTRACER_STATUS_SUCCESS) { \
+ roctracer_status_t status = ROCTRACER_FN_NAME(fn) args; \
+ if (status != ROCTRACER_STATUS_SUCCESS) { \
/* use roctracer_error_string() */ \
- } \
+ } \
}
typedef const char* (*hip_kernel_name_fnt)(const hipFunction_t f);
@@ -109,6 +119,11 @@ typedef const char* (*hip_kernel_name_ref_fnt)(const void* hostFunction, hipStre
static hip_kernel_name_fnt hip_kernel_name_fn;
static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn;
+// If we collect counters for GPU kernels,
+// we will serialize kernel executions.
+// Hopefully, AMD tool support will improve this in the future
+static bool collect_counter = false;
+
//----------------------------------------------------------
// roctracer function pointers for late binding
//----------------------------------------------------------
@@ -230,35 +245,35 @@ roctracer_kernel_data_set
{
case HIP_API_ID_hipModuleLaunchKernel:
entry_data->kernel.blockSharedMemory =
- data->args.hipModuleLaunchKernel.sharedMemBytes;
+ data->args.hipModuleLaunchKernel.sharedMemBytes;
entry_data->kernel.blockThreads =
- data->args.hipModuleLaunchKernel.blockDimX *
- data->args.hipModuleLaunchKernel.blockDimY *
- data->args.hipModuleLaunchKernel.blockDimZ;
+ data->args.hipModuleLaunchKernel.blockDimX *
+ data->args.hipModuleLaunchKernel.blockDimY *
+ data->args.hipModuleLaunchKernel.blockDimZ;
break;
case HIP_API_ID_hipLaunchCooperativeKernel:
entry_data->kernel.blockSharedMemory =
- data->args.hipLaunchCooperativeKernel.sharedMemBytes;
+ data->args.hipLaunchCooperativeKernel.sharedMemBytes;
entry_data->kernel.blockThreads =
- data->args.hipLaunchCooperativeKernel.blockDimX.x *
- data->args.hipLaunchCooperativeKernel.blockDimX.y *
- data->args.hipLaunchCooperativeKernel.blockDimX.z;
+ data->args.hipLaunchCooperativeKernel.blockDimX.x *
+ data->args.hipLaunchCooperativeKernel.blockDimX.y *
+ data->args.hipLaunchCooperativeKernel.blockDimX.z;
break;
case HIP_API_ID_hipHccModuleLaunchKernel:
entry_data->kernel.blockSharedMemory =
- data->args.hipHccModuleLaunchKernel.sharedMemBytes;
+ data->args.hipHccModuleLaunchKernel.sharedMemBytes;
entry_data->kernel.blockThreads =
- (data->args.hipHccModuleLaunchKernel.globalWorkSizeX *
- data->args.hipHccModuleLaunchKernel.globalWorkSizeY *
- data->args.hipHccModuleLaunchKernel.globalWorkSizeZ) +
- (data->args.hipHccModuleLaunchKernel.localWorkSizeX *
- data->args.hipHccModuleLaunchKernel.localWorkSizeY *
- data->args.hipHccModuleLaunchKernel.localWorkSizeZ);
+ (data->args.hipHccModuleLaunchKernel.globalWorkSizeX *
+ data->args.hipHccModuleLaunchKernel.globalWorkSizeY *
+ data->args.hipHccModuleLaunchKernel.globalWorkSizeZ) +
+ (data->args.hipHccModuleLaunchKernel.localWorkSizeX *
+ data->args.hipHccModuleLaunchKernel.localWorkSizeY *
+ data->args.hipHccModuleLaunchKernel.localWorkSizeZ);
break;
}
}
@@ -284,7 +299,7 @@ ensure_kernel_ip_present
// is already present
if (hpcrun_cct_children(kernel_ph) == NULL) {
cct_node_t *kernel =
- hpcrun_cct_insert_ip_norm(kernel_ph, kernel_ip);
+ hpcrun_cct_insert_ip_norm(kernel_ph, kernel_ip, true);
hpcrun_cct_retain(kernel);
}
}
@@ -298,11 +313,18 @@ roctracer_subscriber_callback
void* arg
)
{
+ if (is_tool_active()) {
+// TMSG(ROCM, "PAPI correlation callback");
+// gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0);
+ return;
+ }
+
gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0;
bool is_valid_op = false;
bool is_kernel_op = false;
const hip_api_data_t* data = (const hip_api_data_t*)(callback_data);
- const char* kernel_name = NULL;
+ const char* kernel_name = NULL;
+ hipStream_t kernel_stream = 0;
switch (callback_id) {
case HIP_API_ID_hipMemcpy:
@@ -329,7 +351,7 @@ roctracer_subscriber_callback
case HIP_API_ID_hipMemcpyDtoHAsync:
case HIP_API_ID_hipMemcpyParam2D:
gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
- gpu_placeholder_type_copy);
+ gpu_placeholder_type_copy);
is_valid_op = true;
break;
@@ -342,7 +364,7 @@ roctracer_subscriber_callback
case HIP_API_ID_hipMalloc3D:
case HIP_API_ID_hipExtMallocWithFlags:
gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
- gpu_placeholder_type_alloc);
+ gpu_placeholder_type_alloc);
is_valid_op = true;
break;
@@ -355,14 +377,14 @@ roctracer_subscriber_callback
case HIP_API_ID_hipMemsetAsync:
case HIP_API_ID_hipMemsetD32Async:
gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
- gpu_placeholder_type_memset);
+ gpu_placeholder_type_memset);
is_valid_op = true;
break;
case HIP_API_ID_hipFree:
case HIP_API_ID_hipFreeArray:
gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
- gpu_placeholder_type_delete);
+ gpu_placeholder_type_delete);
is_valid_op = true;
break;
@@ -377,6 +399,9 @@ roctracer_subscriber_callback
is_valid_op = true;
is_kernel_op = true;
kernel_name = hip_kernel_name_fn(data->args.hipModuleLaunchKernel.f);
+ if (collect_counter) {
+ kernel_stream = data->args.hipModuleLaunchKernel.stream;
+ }
break;
}
case HIP_API_ID_hipLaunchKernel: {
@@ -386,8 +411,11 @@ roctracer_subscriber_callback
gpu_placeholder_type_trace);
is_valid_op = true;
is_kernel_op = true;
- kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address,
+ kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address,
data->args.hipLaunchKernel.stream);
+ if (collect_counter) {
+ kernel_stream = data->args.hipLaunchKernel.stream;
+ }
break;
}
case HIP_API_ID_hipCtxSynchronize:
@@ -395,7 +423,7 @@ roctracer_subscriber_callback
case HIP_API_ID_hipDeviceSynchronize:
case HIP_API_ID_hipEventSynchronize:
gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags,
- gpu_placeholder_type_sync);
+ gpu_placeholder_type_sync);
is_valid_op = true;
break;
default:
@@ -408,8 +436,9 @@ roctracer_subscriber_callback
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
uint64_t correlation_id = data->correlation_id;
+ uint64_t rocprofiler_correlation_id = 0;
cct_node_t *api_node =
- gpu_application_thread_correlation_callback(correlation_id);
+ gpu_application_thread_correlation_callback(correlation_id);
gpu_op_ccts_t gpu_op_ccts;
hpcrun_safe_enter();
@@ -423,17 +452,39 @@ roctracer_subscriber_callback
cct_node_t *trace_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_trace);
ensure_kernel_ip_present(trace_ph, kernel_ip);
+
+ if (collect_counter) {
+ rocprofiler_correlation_id = correlation_id;
+ rocprofiler_start_kernel(rocprofiler_correlation_id);
+ }
}
hpcrun_safe_exit();
-
- gpu_activity_channel_consume(gpu_metrics_attribute);
+ gpu_activity_channel_consume_with_idx(ROCTRACER_CHANNEL_IDX, gpu_metrics_attribute);
+ if (collect_counter) {
+ gpu_activity_channel_consume_with_idx(ROCPROFILER_CHANNEL_IDX, gpu_metrics_attribute);
+ }
// Generate notification entry
uint64_t cpu_submit_time = hpcrun_nanotime();
- gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time);
+ //gpu_monitors_apply(api_node, gpu_monitor_type_enter);
+
+ gpu_correlation_channel_produce_with_idx(ROCTRACER_CHANNEL_IDX, correlation_id, &gpu_op_ccts, cpu_submit_time);
+ if (collect_counter && is_kernel_op && kernel_name != NULL) {
+ gpu_correlation_channel_produce_with_idx(ROCPROFILER_CHANNEL_IDX, rocprofiler_correlation_id, &gpu_op_ccts, cpu_submit_time);
+ }
+
+ }else if (data->phase == ACTIVITY_API_PHASE_EXIT){
+ if (is_kernel_op && collect_counter) {
+ //gpu_monitors_apply(NULL, gpu_monitor_type_exit);
+ hipStreamSynchronize(kernel_stream);
+ rocprofiler_wait_context_callback();
+ rocprofiler_stop_kernel();
+ }
}
+
+
}
@@ -443,7 +494,7 @@ roctracer_buffer_completion_notify
void
)
{
- gpu_monitoring_thread_activities_ready();
+ gpu_monitoring_thread_activities_ready_with_idx(ROCTRACER_CHANNEL_IDX);
}
@@ -457,7 +508,7 @@ roctracer_activity_process
roctracer_activity_translate(&gpu_activity, roctracer_record);
if (gpu_correlation_id_map_lookup(roctracer_record->correlation_id) == NULL) {
gpu_correlation_id_map_insert(roctracer_record->correlation_id,
- roctracer_record->correlation_id);
+ roctracer_record->correlation_id);
}
gpu_activity_process(&gpu_activity);
}
@@ -493,8 +544,6 @@ roctracer_path
return path;
}
-
-
//******************************************************************************
// interface operations
//******************************************************************************
@@ -509,10 +558,6 @@ roctracer_bind
// More details: https://github.com/ROCm-Developer-Tools/roctracer/issues/22
setenv("HSA_ENABLE_INTERRUPT", "0", 1);
- if (rocm_debug_api_bind() != DYNAMIC_BINDING_STATUS_OK) {
- return DYNAMIC_BINDING_STATUS_ERROR;
- }
-
#ifndef HPCRUN_STATIC_LINK
// dynamic libraries only availabile in non-static case
hpcrun_force_dlopen(true);
@@ -572,6 +617,9 @@ roctracer_init
HPCRUN_ROCTRACER_CALL(roctracer_enable_domain_callback, (ACTIVITY_DOMAIN_KFD_API, roctracer_subscriber_callback, NULL));
// Enable rocTX
HPCRUN_ROCTRACER_CALL(roctracer_enable_domain_callback, (ACTIVITY_DOMAIN_ROCTX, roctracer_subscriber_callback, NULL));
+
+ // Prepare getting URI
+ rocprofiler_uri_setup();
}
void
@@ -604,3 +652,11 @@ roctracer_fini
roctracer_flush(args, how);
}
+// Turn on counter collection: sets the collect_counter flag, which gates the
+// rocprofiler work done for kernel operations in the subscriber callback.
+void
+roctracer_enable_counter_collection
+(
+ void
+)
+{
+ collect_counter = true;
+}
diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.h b/src/tool/hpcrun/gpu/amd/roctracer-api.h
index af3a381849..db8462e205 100644
--- a/src/tool/hpcrun/gpu/amd/roctracer-api.h
+++ b/src/tool/hpcrun/gpu/amd/roctracer-api.h
@@ -79,6 +79,10 @@ roctracer_bind
void
);
-
+// request counter collection for GPU kernel operations
+void
+roctracer_enable_counter_collection
+(
+ void
+);
#endif
diff --git a/src/tool/hpcrun/gpu/gpu-activity-channel.c b/src/tool/hpcrun/gpu/gpu-activity-channel.c
index 12d1e0a755..b1386b6b01 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-channel.c
@@ -50,6 +50,7 @@
#include "gpu-activity.h"
#include "gpu-activity-channel.h"
#include "gpu-channel-item-allocator.h"
+#include "gpu-channel-common.h"
//******************************************************************************
@@ -96,7 +97,7 @@ typedef struct gpu_activity_channel_t {
// local data
//******************************************************************************
-static __thread gpu_activity_channel_t *gpu_activity_channel = NULL;
+static __thread gpu_activity_channel_t *gpu_activity_channels[GPU_CHANNEL_TOTAL];
@@ -134,11 +135,20 @@ gpu_activity_channel_get
void
)
{
- if (gpu_activity_channel == NULL) {
- gpu_activity_channel = gpu_activity_channel_alloc();
+ return gpu_activity_channel_get_with_idx(0);
+}
+
+gpu_activity_channel_t *
+gpu_activity_channel_get_with_idx
+(
+ int idx
+)
+{
+ if (gpu_activity_channels[idx] == NULL) {
+ gpu_activity_channels[idx] = gpu_activity_channel_alloc();
}
- return gpu_activity_channel;
+ return gpu_activity_channels[idx];
}
@@ -164,7 +174,17 @@ gpu_activity_channel_consume
gpu_activity_attribute_fn_t aa_fn
)
{
- gpu_activity_channel_t *channel = gpu_activity_channel_get();
+ return gpu_activity_channel_consume_with_idx(0, aa_fn);
+}
+
+void
+gpu_activity_channel_consume_with_idx
+(
+ int idx,
+ gpu_activity_attribute_fn_t aa_fn
+)
+{
+ gpu_activity_channel_t *channel = gpu_activity_channel_get_with_idx(idx);
// steal elements previously enqueued by the producer
channel_steal(channel, bichannel_direction_forward);
diff --git a/src/tool/hpcrun/gpu/gpu-activity-channel.h b/src/tool/hpcrun/gpu/gpu-activity-channel.h
index 4565b797a6..e9a994c0e6 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-activity-channel.h
@@ -51,6 +51,7 @@
#include
#include "gpu-activity.h"
+#include "gpu-channel-common.h"
//******************************************************************************
@@ -74,6 +75,13 @@ gpu_activity_channel_get
);
+gpu_activity_channel_t *
+gpu_activity_channel_get_with_idx
+(
+ int
+);
+
+
void
gpu_activity_channel_produce
(
@@ -89,5 +97,12 @@ gpu_activity_channel_consume
);
+void
+gpu_activity_channel_consume_with_idx
+(
+ int idx,
+ gpu_activity_attribute_fn_t aa_fn
+);
+
#endif
diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c
index 47a6dc4288..29947bfd1c 100644
--- a/src/tool/hpcrun/gpu/gpu-activity-process.c
+++ b/src/tool/hpcrun/gpu/gpu-activity-process.c
@@ -57,6 +57,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -140,6 +141,11 @@ attribute_activity
gpu_activity_channel_t *channel =
gpu_host_correlation_map_entry_channel_get(hc);
activity->cct_node = cct_node;
+
+ PRINT("attributing activity to %p time = [%lu,%lu)\n",
+ cct_node, activity->details.interval.start,
+ activity->details.interval.end);
+
gpu_activity_channel_produce(channel, activity);
}
@@ -150,7 +156,7 @@ gpu_memcpy_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.memcpy.correlation_id;
+ uint64_t correlation_id = activity->details.memcpy.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
if (cid_map_entry != NULL) {
@@ -194,9 +200,9 @@ gpu_memcpy_process
}
gpu_correlation_id_map_delete(correlation_id);
} else {
- PRINT("Memcpy copy correlation_id %u cannot be found\n", correlation_id);
+ PRINT("Memcpy copy correlation_id 0x%lx cannot be found\n", correlation_id);
}
- PRINT("Memcpy copy CorrelationId %u\n", correlation_id);
+ PRINT("Memcpy copy correlation_id 0x%lx\n", correlation_id);
PRINT("Memcpy copy kind %u\n", activity->details.memcpy.copyKind);
PRINT("Memcpy copy bytes %lu\n", activity->details.memcpy.bytes);
}
@@ -217,7 +223,7 @@ gpu_sample_process
gpu_activity_t* sample
)
{
- uint32_t correlation_id = sample->details.pc_sampling.correlation_id;
+ uint64_t correlation_id = sample->details.pc_sampling.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
@@ -254,7 +260,7 @@ gpu_sample_process
PRINT("host_map_entry %lu not found\n", external_id);
}
} else {
- PRINT("correlation_id_map_entry %u not found\n", correlation_id);
+ PRINT("correlation_id_map_entry %lu not found\n", correlation_id);
}
}
@@ -265,7 +271,7 @@ gpu_sampling_info_process
gpu_activity_t *sri
)
{
- uint32_t correlation_id = sri->details.pc_sampling_info.correlation_id;
+ uint64_t correlation_id = sri->details.pc_sampling_info.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
if (cid_map_entry != NULL) {
@@ -305,7 +311,8 @@ gpu_correlation_process
if (gpu_correlation_id_map_lookup(gpu_correlation_id) == NULL) {
gpu_correlation_id_map_insert(gpu_correlation_id, host_correlation_id);
} else {
- gpu_correlation_id_map_external_id_replace(gpu_correlation_id, host_correlation_id);
+ gpu_correlation_id_map_external_id_replace(gpu_correlation_id,
+ host_correlation_id);
}
PRINT("Correlation: native_correlation %u --> host_correlation %lu\n",
gpu_correlation_id, host_correlation_id);
@@ -318,7 +325,7 @@ gpu_memset_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.memset.correlation_id;
+ uint64_t correlation_id = activity->details.memset.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
if (cid_map_entry != NULL) {
@@ -344,7 +351,7 @@ gpu_memset_process
}
gpu_correlation_id_map_delete(correlation_id);
}
- PRINT("Memset CorrelationId %u\n", correlation_id);
+ PRINT("Memset correlation_id 0x%lx\n", correlation_id);
PRINT("Memset kind %u\n", activity->details.memset.memKind);
PRINT("Memset bytes %lu\n", activity->details.memset.bytes);
}
@@ -356,7 +363,8 @@ gpu_function_process
gpu_activity_t *activity
)
{
- gpu_function_id_map_insert(activity->details.function.function_id, activity->details.function.pc);
+ gpu_function_id_map_insert(activity->details.function.function_id,
+ activity->details.function.pc);
PRINT("Function id %u\n", activity->details.function.function_id);
}
@@ -367,7 +375,7 @@ gpu_kernel_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.kernel.correlation_id;
+ uint64_t correlation_id = activity->details.kernel.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
@@ -417,11 +425,12 @@ gpu_kernel_process
attribute_activity(host_op_entry, activity, kernel_node);
}
} else {
- PRINT("Kernel execution correlation_id %u cannot be found\n", correlation_id);
+ PRINT("Kernel execution correlation_id 0x%lx cannot be found\n",
+ correlation_id);
}
PRINT("Kernel execution deviceId %u\n", activity->details.kernel.device_id);
- PRINT("Kernel execution CorrelationId %u\n", correlation_id);
+ PRINT("Kernel execution correlation_id 0x%lx\n", correlation_id);
}
@@ -461,15 +470,17 @@ gpu_synchronization_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.synchronization.correlation_id;
+ uint64_t correlation_id = activity->details.synchronization.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
if (cid_map_entry != NULL) {
uint64_t external_id =
gpu_correlation_id_map_entry_external_id_get(cid_map_entry);
+
gpu_host_correlation_map_entry_t *host_op_entry =
gpu_host_correlation_map_lookup(external_id);
- if (host_op_entry != NULL) {
+ if (host_op_entry != NULL && external_id != IGNORE_CORR_ID) {
+
cct_node_t *host_op_node =
gpu_host_correlation_map_entry_op_cct_get(host_op_entry,
gpu_placeholder_type_sync);
@@ -500,7 +511,8 @@ gpu_synchronization_process
case GPU_SYNC_EVENT:
{
// Find the corresponding stream that records the event
- gpu_event_id_map_entry_t *event_id_entry = gpu_event_id_map_lookup(event_id);
+ gpu_event_id_map_entry_t *event_id_entry =
+ gpu_event_id_map_lookup(event_id);
if (event_id_entry != NULL) {
context_id = gpu_event_id_map_entry_context_id_get(event_id_entry);
stream_id = gpu_event_id_map_entry_stream_id_get(event_id_entry);
@@ -511,7 +523,8 @@ gpu_synchronization_process
}
default:
// invalid
- PRINT("Invalid synchronization %u\n", correlation_id);
+ PRINT("Synchronization correlation_id 0x%lx cannot be found\n",
+ correlation_id);
}
}
// TODO(Keren): handle event synchronization
@@ -520,7 +533,7 @@ gpu_synchronization_process
}
gpu_correlation_id_map_delete(correlation_id);
}
- PRINT("Synchronization CorrelationId %u\n", correlation_id);
+ PRINT("Synchronization correlation_id 0x%lx\n", correlation_id);
}
@@ -530,7 +543,7 @@ gpu_cdpkernel_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.cdpkernel.correlation_id;
+ uint64_t correlation_id = activity->details.cdpkernel.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
if (cid_map_entry != NULL) {
@@ -557,7 +570,7 @@ gpu_cdpkernel_process
}
gpu_correlation_id_map_delete(correlation_id);
}
- PRINT("Cdp Kernel CorrelationId %u\n", correlation_id);
+ PRINT("Cdp Kernel correlation_id 0x%lx\n", correlation_id);
}
@@ -575,6 +588,22 @@ gpu_event_process
PRINT("GPU event %u\n", event_id);
}
+// Map the memory operation recorded in a memory activity to the
+// placeholder type under which the activity is attributed.
+static gpu_placeholder_type_t
+gpu_memory_placeholder
+(
+ gpu_activity_t *activity
+)
+{
+  // allocation is both the explicit mapping for GPU_MEM_OP_ALLOC and the
+  // fallback returned when the assert below is compiled out
+  gpu_placeholder_type_t placeholder = gpu_placeholder_type_alloc;
+
+  switch (activity->details.memory.mem_op) {
+  case GPU_MEM_OP_ALLOC:
+    break;
+  case GPU_MEM_OP_DELETE:
+    placeholder = gpu_placeholder_type_delete;
+    break;
+  default:
+    // any other operation kind is unexpected here
+    assert(0);
+    break;
+  }
+
+  return placeholder;
+}
+
static void
gpu_memory_process
@@ -582,7 +611,7 @@ gpu_memory_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.memory.correlation_id;
+ uint64_t correlation_id = activity->details.memory.correlation_id;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
if (cid_map_entry != NULL) {
@@ -591,19 +620,29 @@ gpu_memory_process
gpu_host_correlation_map_entry_t *host_op_entry =
gpu_host_correlation_map_lookup(external_id);
if (host_op_entry != NULL) {
- gpu_placeholder_type_t ph = gpu_placeholder_type_alloc;
+ gpu_placeholder_type_t ph = gpu_memory_placeholder(activity);
cct_node_t *host_op_node =
gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph);
assert(host_op_node != NULL);
// Memory allocation does not always happen on the device
// Do not send it to trace channels
+
+ gpu_trace_item_t entry_trace;
+ trace_item_set(&entry_trace, activity, host_op_entry, host_op_node);
+
+ gpu_context_stream_trace
+ (activity->details.memory.device_id,
+ activity->details.memory.context_id,
+ activity->details.memory.stream_id,
+ &entry_trace);
+
attribute_activity(host_op_entry, activity, host_op_node);
}
gpu_correlation_id_map_delete(correlation_id);
} else {
- PRINT("Memory correlation_id %u cannot be found\n", correlation_id);
+ PRINT("Memory correlation_id 0x%lx cannot be found\n", correlation_id);
}
- PRINT("Memory CorrelationId %u\n", correlation_id);
+ PRINT("Memory correlation_id 0x%lx\n", correlation_id);
PRINT("Memory kind %u\n", activity->details.memory.memKind);
PRINT("Memory bytes %lu\n", activity->details.memory.bytes);
}
@@ -615,7 +654,7 @@ gpu_instruction_process
gpu_activity_t *activity
)
{
- uint32_t correlation_id = activity->details.instruction.correlation_id;
+ uint64_t correlation_id = activity->details.instruction.correlation_id;
ip_normalized_t pc = activity->details.instruction.pc;
gpu_correlation_id_map_entry_t *cid_map_entry =
gpu_correlation_id_map_lookup(correlation_id);
@@ -633,7 +672,49 @@ gpu_instruction_process
attribute_activity(host_op_entry, activity, func_ins);
}
}
- PRINT("Instruction correlation_id %u\n", correlation_id);
+ PRINT("Instruction correlation_id 0x%lx\n", correlation_id);
+}
+
+// Process a GPU_ACTIVITY_COUNTER record: look up the host operation that
+// corresponds to the record's correlation id and attribute the record to
+// the kernel node below that operation's kernel placeholder.
+static void
+gpu_counter_process
+(
+ gpu_activity_t *activity
+)
+{
+  // 64 bits wide, matching the correlation id map interface
+  uint64_t correlation_id = activity->details.counters.correlation_id;
+  gpu_correlation_id_map_entry_t *cid_map_entry =
+    gpu_correlation_id_map_lookup(correlation_id);
+  if (cid_map_entry != NULL) {
+    uint64_t external_id =
+      gpu_correlation_id_map_entry_external_id_get(cid_map_entry);
+    gpu_host_correlation_map_entry_t *host_op_entry =
+      gpu_host_correlation_map_lookup(external_id);
+    if (host_op_entry != NULL) {
+      gpu_placeholder_type_t ph = gpu_placeholder_type_kernel;
+      cct_node_t *host_op_node =
+        gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph);
+      assert(host_op_node != NULL);
+
+      cct_node_t *func_node = hpcrun_cct_children(host_op_node); // only child
+      cct_node_t *kernel_node;
+      if (func_node == NULL) {
+        // no child recorded: attribute directly to the placeholder node
+        kernel_node = host_op_node;
+      } else {
+        cct_addr_t *addr = hpcrun_cct_addr(func_node);
+        kernel_node = hpcrun_cct_insert_ip_norm(host_op_node, addr->ip_norm, true);
+      }
+      // counter records are only attributed; no trace item is emitted
+      attribute_activity(host_op_entry, activity, kernel_node);
+    }
+    gpu_correlation_id_map_delete(correlation_id);
+  } else {
+    PRINT("Counter correlation_id 0x%lx cannot be found\n", correlation_id);
+  }
+  // Only scalar members of gpu_counter_t are printed: the record has no
+  // cycles / l2_cache_hit / l2_cache_miss members, and the values array is
+  // not dereferenced here.
+  PRINT("Counter correlation_id 0x%lx\n", correlation_id);
+  PRINT("Counter total counters %d\n",
+        activity->details.counters.total_counters);
}
@@ -713,6 +794,10 @@ gpu_activity_process
gpu_event_process(ga);
break;
+ case GPU_ACTIVITY_COUNTER:
+ gpu_counter_process(ga);
+ break;
+
case GPU_ACTIVITY_MEMCPY2:
default:
gpu_unknown_process(ga);
diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c
index 0a640c8636..ba595e187f 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.c
+++ b/src/tool/hpcrun/gpu/gpu-activity.c
@@ -56,6 +56,9 @@
#include "gpu-activity.h"
#include "gpu-channel-item-allocator.h"
+
+#define DEBUG 0
+
#include "gpu-print.h"
@@ -66,9 +69,6 @@
#define UNIT_TEST 0
-#define DEBUG 0
-
-
#define FORALL_OPENCL_KINDS(macro) \
macro(GPU_ACTIVITY_UNKNOWN) \
macro(GPU_ACTIVITY_KERNEL) \
@@ -160,6 +160,8 @@ gpu_interval_set
{
interval->start = start;
interval->end = end;
+ PRINT("gpu interval: [%lu, %lu) delta = %ld\n", interval->start,
+ interval->end, interval->end - interval->start);
}
diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h
index 786c4da451..7cf388de90 100644
--- a/src/tool/hpcrun/gpu/gpu-activity.h
+++ b/src/tool/hpcrun/gpu/gpu-activity.h
@@ -95,7 +95,8 @@ typedef enum {
GPU_ACTIVITY_EXTERNAL_CORRELATION = 14,
GPU_ACTIVITY_EVENT = 15,
GPU_ACTIVITY_FUNCTION = 16,
- GPU_ACTIVITY_FLUSH = 17
+ GPU_ACTIVITY_FLUSH = 17,
+ GPU_ACTIVITY_COUNTER = 18
} gpu_activity_kind_t;
@@ -179,9 +180,16 @@ typedef enum {
} gpu_mem_type_t;
+// kind of memory operation carried in the mem_op field of gpu_memory_t
+typedef enum {
+ GPU_MEM_OP_ALLOC = 0,
+ GPU_MEM_OP_DELETE = 1,
+ GPU_MEM_OP_UNKNOWN = 2
+} gpu_mem_op_t;
+
+
// pc sampling
typedef struct gpu_pc_sampling_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
ip_normalized_t pc;
uint32_t samples;
uint32_t latencySamples;
@@ -190,7 +198,7 @@ typedef struct gpu_pc_sampling_t {
typedef struct gpu_pc_sampling_info_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint64_t droppedSamples;
uint64_t samplingPeriodInCycles;
uint64_t totalSamples;
@@ -217,7 +225,7 @@ typedef struct gpu_memcpy_t {
uint64_t end;
uint64_t bytes;
uint64_t submit_time;
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint32_t device_id;
uint32_t context_id;
uint32_t stream_id;
@@ -231,10 +239,12 @@ typedef struct gpu_memory_t {
uint64_t start;
uint64_t end;
uint64_t bytes;
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint32_t device_id;
uint32_t context_id;
+ uint32_t stream_id;
gpu_mem_type_t memKind;
+ gpu_mem_op_t mem_op;
} gpu_memory_t;
@@ -243,7 +253,7 @@ typedef struct gpu_memset_t {
uint64_t start;
uint64_t end;
uint64_t bytes;
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint32_t device_id;
uint32_t context_id;
uint32_t stream_id;
@@ -256,7 +266,7 @@ typedef struct gpu_kernel_t {
uint64_t start;
uint64_t end;
uint64_t submit_time;
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint32_t device_id;
uint32_t context_id;
uint32_t stream_id;
@@ -282,7 +292,7 @@ typedef struct gpu_kernel_block_t {
typedef struct gpu_cdpkernel_t {
uint64_t start;
uint64_t end;
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint32_t device_id;
uint32_t context_id;
uint32_t stream_id;
@@ -303,7 +313,7 @@ typedef struct gpu_event_t {
typedef struct gpu_global_access_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
ip_normalized_t pc;
uint64_t l2_transactions;
uint64_t theoreticalL2Transactions;
@@ -313,7 +323,7 @@ typedef struct gpu_global_access_t {
typedef struct gpu_local_access_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
ip_normalized_t pc;
uint64_t sharedTransactions;
uint64_t theoreticalSharedTransactions;
@@ -323,7 +333,7 @@ typedef struct gpu_local_access_t {
typedef struct gpu_branch_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
ip_normalized_t pc;
uint32_t diverged;
uint32_t executed;
@@ -333,7 +343,7 @@ typedef struct gpu_branch_t {
typedef struct gpu_synchronization_t {
uint64_t start;
uint64_t end;
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint32_t context_id;
uint32_t stream_id;
uint32_t event_id;
@@ -342,10 +352,19 @@ typedef struct gpu_synchronization_t {
typedef struct gpu_host_correlation_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
uint64_t host_correlation_id;
} gpu_host_correlation_t;
+// payload of a GPU_ACTIVITY_COUNTER activity record
+typedef struct gpu_counter_t {
+  // 64 bits wide, like the correlation ids of the other activity records
+  uint64_t correlation_id;
+  // presumably the number of entries in values -- TODO confirm with producer
+  int total_counters;
+  // The function that creates the structure should
+  // be responsible for allocating memory.
+  // The function that attributes the structure should
+  // be responsible for deallocating the memory.
+  uint64_t* values;
+} gpu_counter_t;
// a type that can be used to access start and end times
// for a subset of activity kinds including kernel execution,
@@ -357,7 +376,7 @@ typedef struct gpu_interval_t {
typedef struct gpu_instruction_t {
- uint32_t correlation_id;
+ uint64_t correlation_id;
ip_normalized_t pc;
} gpu_instruction_t;
@@ -383,6 +402,7 @@ typedef struct gpu_activity_details_t {
gpu_synchronization_t synchronization;
gpu_host_correlation_t correlation;
gpu_flush_t flush;
+ gpu_counter_t counters;
/* Access short cut for activitiy fields shared by multiple kinds */
diff --git a/src/tool/hpcrun/gpu/gpu-application-thread-api.c b/src/tool/hpcrun/gpu/gpu-application-thread-api.c
index 7d3e6b3f9f..1bc5b20c70 100644
--- a/src/tool/hpcrun/gpu/gpu-application-thread-api.c
+++ b/src/tool/hpcrun/gpu/gpu-application-thread-api.c
@@ -118,7 +118,6 @@ gpu_application_thread_correlation_callback
}
}
-
// skip procedure frames in libhpcrun
while (libhpcrun_id != 0 && node_addr->ip_norm.lm_id == libhpcrun_id) {
node = hpcrun_cct_parent(node);
diff --git a/src/tool/hpcrun/gpu/gpu-channel-common.h b/src/tool/hpcrun/gpu/gpu-channel-common.h
new file mode 100644
index 0000000000..b2396dd1a7
--- /dev/null
+++ b/src/tool/hpcrun/gpu/gpu-channel-common.h
@@ -0,0 +1,24 @@
+#ifndef GPU_CHANNEL_COMMON_H
+#define GPU_CHANNEL_COMMON_H
+
+// GPU_CHANNEL_TOTAL specifies the total number
+// of correlation and activity channels an application
+// thread will create.
+// This is created for supporting AMD GPUs,
+// where roctracer and rocprofiler will each create
+// one monitoring thread.
+// As the implementation of the channel is one-producer-one-consumer,
+// we need an array of correlation and
+// activity channel for each application thread.
+// For platforms where there is just one monitoring
+// thread, such as NVIDIA, the implementation maintains
+// backward compatibility, where we will just use
+// the first channel pair.
+// Implementation-wise, channel operations without the _with_idx suffix
+// are the original operations and always use channel 0.
+// Channel operations with the _with_idx suffix require a channel
+// index that specifies which channel to operate on.
+
+#define GPU_CHANNEL_TOTAL 2
+
+#endif
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c
index f7f2d95a93..5557818cac 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c
@@ -51,6 +51,7 @@
#include "gpu-correlation-channel.h"
#include "gpu-correlation-channel-set.h"
+#include "gpu-channel-common.h"
@@ -99,7 +100,7 @@ typed_stack_declare_type(gpu_correlation_channel_ptr_t);
static
typed_stack_elem_ptr(gpu_correlation_channel_ptr_t)
-gpu_correlation_channel_stack;
+gpu_correlation_channel_stacks[GPU_CHANNEL_TOTAL];
@@ -128,12 +129,13 @@ channel_forone
static void
-gpu_correlation_channel_set_forall
+gpu_correlation_channel_set_forall_with_idx
(
+ int idx,
gpu_correlation_channel_fn_t channel_fn
)
{
- channel_stack_forall(&gpu_correlation_channel_stack, channel_forone,
+ channel_stack_forall(&gpu_correlation_channel_stacks[idx], channel_forone,
channel_fn);
}
@@ -143,8 +145,9 @@ gpu_correlation_channel_set_forall
//******************************************************************************
void
-gpu_correlation_channel_set_insert
+gpu_correlation_channel_set_insert_with_idx
(
+ int idx,
gpu_correlation_channel_t *channel
)
{
@@ -157,15 +160,15 @@ gpu_correlation_channel_set_insert
channel_stack_elem_ptr_set(e, 0); // clear the entry's next ptr
// add the entry to the channel stack
- channel_stack_push(&gpu_correlation_channel_stack, e);
+ channel_stack_push(&gpu_correlation_channel_stacks[idx], e);
}
void
-gpu_correlation_channel_set_consume
+gpu_correlation_channel_set_consume_with_idx
(
- void
+ int idx
)
{
- gpu_correlation_channel_set_forall(gpu_correlation_channel_consume);
+ gpu_correlation_channel_set_forall_with_idx(idx, gpu_correlation_channel_consume);
}
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h
index 5eac5a7d5a..091ba7394c 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h
@@ -70,16 +70,17 @@ typedef void (*gpu_correlation_channel_fn_t)
//******************************************************************************
void
-gpu_correlation_channel_set_insert
+gpu_correlation_channel_set_insert_with_idx
(
+ int idx,
gpu_correlation_channel_t *channel
);
void
-gpu_correlation_channel_set_consume
+gpu_correlation_channel_set_consume_with_idx
(
- void
+ int idx
);
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel.c b/src/tool/hpcrun/gpu/gpu-correlation-channel.c
index 47a8345554..cf855b1c54 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel.c
@@ -71,7 +71,7 @@
#define typed_bichannel(x) gpu_correlation_channel_t
#define typed_stack_elem(x) gpu_correlation_t
-// define macros that simplify use of correlation channel API
+// define macros that simplify use of correlation channel API
#define channel_init \
typed_bichannel_init(gpu_correlation_t)
@@ -100,7 +100,7 @@ typedef struct gpu_correlation_channel_t {
// local data
//******************************************************************************
-static __thread gpu_correlation_channel_t *gpu_correlation_channel = NULL;
+static __thread gpu_correlation_channel_t *gpu_correlation_channels[GPU_CHANNEL_TOTAL];
@@ -113,37 +113,35 @@ typed_bichannel_impl(gpu_correlation_t)
static gpu_correlation_channel_t *
-gpu_correlation_channel_alloc
+gpu_correlation_channel_alloc_with_idx
(
- void
+ int idx
)
{
- gpu_correlation_channel_t *c =
+ gpu_correlation_channel_t *c =
hpcrun_malloc_safe(sizeof(gpu_correlation_channel_t));
channel_init(c);
- gpu_correlation_channel_set_insert(c);
+ gpu_correlation_channel_set_insert_with_idx(idx, c);
return c;
}
static gpu_correlation_channel_t *
-gpu_correlation_channel_get
+gpu_correlation_channel_get_with_idx
(
- void
+ int idx
)
{
- if (gpu_correlation_channel == NULL) {
- gpu_correlation_channel = gpu_correlation_channel_alloc();
+ if (gpu_correlation_channels[idx] == NULL) {
+ gpu_correlation_channels[idx] = gpu_correlation_channel_alloc_with_idx(idx);
}
- return gpu_correlation_channel;
+ return gpu_correlation_channels[idx];
}
-
-
//******************************************************************************
// interface functions
//******************************************************************************
@@ -156,8 +154,21 @@ gpu_correlation_channel_produce
uint64_t cpu_submit_time
)
{
- gpu_correlation_channel_t *corr_channel = gpu_correlation_channel_get();
- gpu_activity_channel_t *activity_channel = gpu_activity_channel_get();
+ // Relaying parameters with index 0
+ gpu_correlation_channel_produce_with_idx(0, host_correlation_id, gpu_op_ccts, cpu_submit_time);
+}
+
+void
+gpu_correlation_channel_produce_with_idx
+(
+ int idx,
+ uint64_t host_correlation_id,
+ gpu_op_ccts_t *gpu_op_ccts,
+ uint64_t cpu_submit_time
+)
+{
+ gpu_correlation_channel_t *corr_channel = gpu_correlation_channel_get_with_idx(idx);
+ gpu_activity_channel_t *activity_channel = gpu_activity_channel_get_with_idx(idx);
gpu_correlation_t *c = gpu_correlation_alloc(corr_channel);
@@ -167,7 +178,6 @@ gpu_correlation_channel_produce
channel_push(corr_channel, bichannel_direction_forward, c);
}
-
void
gpu_correlation_channel_consume
(
@@ -204,7 +214,7 @@ gpu_correlation_channel_consume
void *hpcrun_malloc_safe
(
size_t s
-)
+)
{
return malloc(s);
}
@@ -214,7 +224,7 @@ gpu_activity_channel_t *
gpu_activity_channel_get
(
void
-)
+)
{
return (gpu_activity_channel_t *) 0x5000;
}
@@ -223,7 +233,7 @@ gpu_activity_channel_get
int
main
(
- int argc,
+ int argc,
char **argv
)
{
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel.h b/src/tool/hpcrun/gpu/gpu-correlation-channel.h
index 33fcc0185e..5e321d6730 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-channel.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-channel.h
@@ -50,8 +50,7 @@
//******************************************************************************
#include "gpu-correlation.h"
-
-
+#include "gpu-channel-common.h"
//******************************************************************************
// type declarations
@@ -67,7 +66,7 @@ typedef struct gpu_op_ccts_t gpu_op_ccts_t;
// interface operations
//******************************************************************************
-// produce into a channel that my thread created
+// produce into the first channel that my thread created
void
gpu_correlation_channel_produce
(
@@ -76,6 +75,16 @@ gpu_correlation_channel_produce
uint64_t cpu_submit_time
);
+// produce into a specified channel (with idx) that my thread created
+// when idx == 0, this function is equivalent to gpu_correlation_channel_produce
+void
+gpu_correlation_channel_produce_with_idx
+(
+ int idx,
+ uint64_t host_correlation_id,
+ gpu_op_ccts_t *gpu_ccts,
+ uint64_t cpu_submit_time
+);
// consume from a channel that another thread created
void
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
index 90ba4a0470..473640b811 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c
@@ -109,7 +109,7 @@ typedef struct typed_splay_node(correlation_id) {
uint32_t device_id;
uint64_t start;
uint64_t end;
-} typed_splay_node(correlation_id);
+} typed_splay_node(correlation_id);
@@ -119,9 +119,9 @@ typedef struct typed_splay_node(correlation_id) {
// local data
//******************************************************************************
-static gpu_correlation_id_map_entry_t *map_root = NULL;
+static __thread gpu_correlation_id_map_entry_t *map_root = NULL;
-static gpu_correlation_id_map_entry_t *free_list = NULL;
+static __thread gpu_correlation_id_map_entry_t *free_list = NULL;
@@ -142,13 +142,13 @@ gpu_correlation_id_map_entry_alloc()
static gpu_correlation_id_map_entry_t *
gpu_correlation_id_map_entry_new
(
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
uint64_t host_correlation_id
)
{
gpu_correlation_id_map_entry_t *e = gpu_correlation_id_map_entry_alloc();
- memset(e, 0, sizeof(gpu_correlation_id_map_entry_t));
+ memset(e, 0, sizeof(gpu_correlation_id_map_entry_t));
e->gpu_correlation_id = gpu_correlation_id;
e->host_correlation_id = host_correlation_id;
@@ -165,13 +165,13 @@ gpu_correlation_id_map_entry_new
gpu_correlation_id_map_entry_t *
gpu_correlation_id_map_lookup
(
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
)
{
uint64_t correlation_id = gpu_correlation_id;
gpu_correlation_id_map_entry_t *result = st_lookup(&map_root, correlation_id);
- PRINT("correlation_id map lookup: id=0x%lx (record %p)\n",
+ PRINT("correlation_id map lookup: id=0x%lx (record %p)\n",
correlation_id, result);
return result;
@@ -181,21 +181,21 @@ gpu_correlation_id_map_lookup
void
gpu_correlation_id_map_insert
(
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
uint64_t host_correlation_id
)
{
- if (st_lookup(&map_root, gpu_correlation_id)) {
+ if (st_lookup(&map_root, gpu_correlation_id)) {
// fatal error: correlation_id already present; a
// correlation should be inserted only once.
assert(0);
} else {
- gpu_correlation_id_map_entry_t *entry =
+ gpu_correlation_id_map_entry_t *entry =
gpu_correlation_id_map_entry_new(gpu_correlation_id, host_correlation_id);
st_insert(&map_root, entry);
- PRINT("correlation_id_map insert: correlation_id=0x%lx external_id=%ld (entry=%p)\n",
+ PRINT("correlation_id_map insert: correlation_id=0x%lx external_id=%ld (entry=%p)\n",
gpu_correlation_id, host_correlation_id, entry);
}
}
@@ -205,7 +205,7 @@ gpu_correlation_id_map_insert
void
gpu_correlation_id_map_external_id_replace
(
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
uint64_t host_correlation_id
)
{
@@ -221,7 +221,7 @@ gpu_correlation_id_map_external_id_replace
void
gpu_correlation_id_map_delete
(
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
)
{
gpu_correlation_id_map_entry_t *node = st_delete(&map_root, gpu_correlation_id);
@@ -232,7 +232,7 @@ gpu_correlation_id_map_delete
void
gpu_correlation_id_map_kernel_update
(
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
uint32_t device_id,
uint64_t start,
uint64_t end
@@ -280,7 +280,7 @@ gpu_correlation_id_map_entry_end_get
}
-uint32_t
+uint64_t
gpu_correlation_id_map_entry_device_id_get
(
gpu_correlation_id_map_entry_t *entry
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.h b/src/tool/hpcrun/gpu/gpu-correlation-id-map.h
index 1ba5b2a5b0..8a0340ed81 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.h
@@ -71,14 +71,14 @@ typedef struct cct_node_t cct_node_t;
gpu_correlation_id_map_entry_t *
gpu_correlation_id_map_lookup
(
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
);
void
gpu_correlation_id_map_insert
(
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
uint64_t host_correlation_id
);
@@ -86,14 +86,14 @@ gpu_correlation_id_map_insert
void
gpu_correlation_id_map_delete
(
- uint32_t gpu_correlation_id
+ uint64_t gpu_correlation_id
);
void
gpu_correlation_id_map_external_id_replace
(
- uint32_t gpu_correlation_id,
+ uint64_t gpu_correlation_id,
uint64_t host_correlation_id
);
@@ -101,7 +101,7 @@ gpu_correlation_id_map_external_id_replace
void
gpu_correlation_id_map_kernel_update
(
- uint32_t correlation_id,
+ uint64_t correlation_id,
uint32_t device_id,
uint64_t start,
uint64_t end
@@ -129,7 +129,7 @@ gpu_correlation_id_map_entry_end_get
);
-uint32_t
+uint64_t
gpu_correlation_id_map_entry_device_id_get
(
gpu_correlation_id_map_entry_t *entry
diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id.h b/src/tool/hpcrun/gpu/gpu-correlation-id.h
index 59a138e2b9..0f24428696 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation-id.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation-id.h
@@ -52,7 +52,8 @@
#include
-
+// correlation id reserved for the tool's own activity, which should be ignored
+#define IGNORE_CORR_ID (~0ULL)
//******************************************************************************
// interface operations
diff --git a/src/tool/hpcrun/gpu/gpu-correlation.c b/src/tool/hpcrun/gpu/gpu-correlation.c
index bed6e41d76..ebc91cab25 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation.c
+++ b/src/tool/hpcrun/gpu/gpu-correlation.c
@@ -103,7 +103,7 @@ gpu_correlation_produce
{
PRINT("Produce correlation id 0x%lx\n", host_correlation_id);
c->host_correlation_id = host_correlation_id;
- c->gpu_op_ccts = *gpu_op_ccts;
+ if (gpu_op_ccts) c->gpu_op_ccts = *gpu_op_ccts;
c->activity_channel = activity_channel;
c->cpu_submit_time = cpu_submit_time;
}
diff --git a/src/tool/hpcrun/gpu/gpu-correlation.h b/src/tool/hpcrun/gpu/gpu-correlation.h
index d9ecce8262..7b83680add 100644
--- a/src/tool/hpcrun/gpu/gpu-correlation.h
+++ b/src/tool/hpcrun/gpu/gpu-correlation.h
@@ -61,7 +61,6 @@
#define UNIT_TEST_CORRELATION_HEADER 0
-
//******************************************************************************
// forward type declarations
//******************************************************************************
diff --git a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
index fdd8edb583..f2a3c28cca 100644
--- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
+++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c
@@ -125,11 +125,11 @@ typedef struct typed_splay_node(host_correlation) {
// local data
//******************************************************************************
-static gpu_host_correlation_map_entry_t *map_root = NULL;
+static __thread gpu_host_correlation_map_entry_t *map_root = NULL;
-static gpu_host_correlation_map_entry_t *free_list = NULL;
+static __thread gpu_host_correlation_map_entry_t *free_list = NULL;
-static bool allow_replace = false;
+static __thread bool allow_replace = false;
//******************************************************************************
// private operations
@@ -200,7 +200,8 @@ gpu_host_correlation_map_lookup
{
gpu_host_correlation_map_entry_t *result = st_lookup(&map_root, host_correlation_id);
- PRINT("host_correlation_map lookup: id=0x%lx (entry %p)\n", host_correlation_id, result);
+ // %llu expects unsigned long long; uint64_t is unsigned long on LP64 targets,
+ // so cast the thread id to the type the format specifier actually names.
+ PRINT("host_correlation_map lookup: id=0x%lx (entry %p) (&map_root=%p) tid=%llu\n",
+ host_correlation_id, result, &map_root, (unsigned long long) pthread_self());
return result;
}
@@ -234,8 +235,9 @@ gpu_host_correlation_map_insert
st_insert(&map_root, entry);
PRINT("host_correlation_map insert: correlation_id=0x%lx "
- "activity_channel=%p (entry=%p)\n",
- host_correlation_id, activity_channel, entry);
+ "activity_channel=%p (entry=%p) (&map_root=%p) tid=%llu\n",
+ host_correlation_id, activity_channel, entry, &map_root,
+ (unsigned long long) pthread_self());
}
}
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c
index 84e81657fa..139bfddcb8 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.c
+++ b/src/tool/hpcrun/gpu/gpu-metrics.c
@@ -84,7 +84,9 @@
macro(GPU_INST, 9) \
macro(GTIMES, 10) \
macro(KINFO, 12) \
- macro(GSAMP, 13)
+ macro(GSAMP, 13) \
+ macro(GXFER, 14) \
+ macro(CTR, 3)
#define FORALL_METRIC_KINDS(macro) \
@@ -203,11 +205,14 @@ name ## _metric_kind
// local variables
//*****************************************************************************
-FORALL_METRIC_KINDS(INITIALIZE_METRIC_KINDS)
+FORALL_METRIC_KINDS(INITIALIZE_METRIC_KINDS);
-FORALL_INDEXED_METRIC_KINDS(INITIALIZE_INDEXED_METRIC)
+FORALL_INDEXED_METRIC_KINDS(INITIALIZE_INDEXED_METRIC);
-FORALL_SCALAR_METRIC_KINDS(INITIALIZE_SCALAR_METRIC_KIND)
+FORALL_SCALAR_METRIC_KINDS(INITIALIZE_SCALAR_METRIC_KIND);
+
+static kind_info_t* GPU_COUNTER_METRIC_KIND_INFO = NULL;
+static int* gpu_counter_hpcrun_metric_id_array = NULL;
static const unsigned int MAX_CHAR_FORMULA = 32;
@@ -592,6 +597,59 @@ gpu_metrics_attribute_branch
b->executed);
}
+// Attribute the hardware counter values carried by a counter activity to
+// the activity's calling context node, then free the activity's value
+// buffer (c->values).
+//
+// activity: counter activity; details.counters supplies total_counters
+//           and the values array, cct_node the attribution target.
+static void
+gpu_metrics_attribute_counter
+(
+ gpu_activity_t *activity
+)
+{
+  gpu_counter_t *c = &(activity->details.counters);
+
+  // The metric id table only exists after gpu_metrics_GPU_CTR_enable has
+  // run; without it (or without any values) indexing the table would
+  // dereference NULL, so skip attribution in that case.
+  if (gpu_counter_hpcrun_metric_id_array != NULL &&
+      c->values != NULL && c->total_counters > 0) {
+    cct_node_t *cct_node = activity->cct_node;
+
+    // reify the metric set once, using the first counter metric id
+    metric_data_list_t *metrics =
+      hpcrun_reify_metric_set(cct_node, gpu_counter_hpcrun_metric_id_array[0]);
+
+    for (int i = 0; i < c->total_counters; ++i) {
+      gpu_metrics_attribute_metric_int(metrics,
+        gpu_counter_hpcrun_metric_id_array[i], c->values[i]);
+    }
+  }
+
+  // the value buffer is released here; clear the pointer so a stale
+  // reference cannot be used or freed again
+  free(c->values);
+  c->values = NULL;
+}
+
+// Placeholder for attributing GPU link (NVLINK) transfer metrics.
+// Currently it only prints a "not implemented" message to stdout; the
+// activity argument is not examined.
+static void
+gpu_metrics_attribute_link
+(
+gpu_activity_t *activity
+)
+{
+
+ printf("Attrubute NVLINK not implemented\n\n");
+// NOTE(review): the disabled sketch below reads kernel-info style fields
+// (staticSharedMemory, activeWarpsPerSM, ...) and GPU_KINFO_STMEM_ACUMU;
+// it looks copied from the kernel-info attribution code -- confirm the
+// intended link fields before enabling it.
+// gpu_link_t *m = &(activity->details.memcpy);
+// cct_node_t *cct_node = activity->cct_node;
+
+// metric_data_list_t *metrics =
+// hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_KINFO_STMEM_ACUMU));
+//
+// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT),
+// m->staticSharedMemory);
+//
+// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_RCV),
+// m->dynamicSharedMemory);
+//
+// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT_TP),
+// m->localMemoryTotal);
+//
+// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XRCV_TP),
+// m->activeWarpsPerSM);
+//
+// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT_COUNT),
+// m->activeWarpsPerSM);
+//
+// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XRCV_COUNT),
+// m->activeWarpsPerSM);
+
+
+}
//******************************************************************************
// interface operations
@@ -652,6 +710,9 @@ gpu_metrics_attribute
gpu_metrics_attribute_branch(activity);
break;
+ case GPU_ACTIVITY_COUNTER:
+ gpu_metrics_attribute_counter(activity);
+ break;
default:
break;
}
@@ -898,3 +959,42 @@ gpu_metrics_GPU_INST_STALL_enable
FINALIZE_METRIC_KIND();
}
+
+// Register one integer-valued hpcrun metric per requested GPU hardware
+// counter, all under a single new metric kind, and remember the metric
+// ids in gpu_counter_hpcrun_metric_id_array (indexed by counter position).
+//
+// total:        number of counters to register
+// counter_name: array of 'total' metric names
+// counter_desc: array of 'total' metric descriptions
+//
+// Nothing is registered when total is not positive, when either array is
+// NULL, or when the id table cannot be allocated.
+void
+gpu_metrics_GPU_CTR_enable
+(
+ int total,
+ const char** counter_name,
+ const char** counter_desc
+)
+{
+  if (total <= 0 || counter_name == NULL || counter_desc == NULL) {
+    return;
+  }
+
+  int *metric_ids = (int*) malloc(sizeof(int) * (size_t) total);
+  if (metric_ids == NULL) {
+    // without the id table the loop below would write through NULL
+    fprintf(stderr, "hpcrun: unable to allocate GPU counter metric ids\n");
+    return;
+  }
+
+  gpu_counter_hpcrun_metric_id_array = metric_ids;
+
+  GPU_COUNTER_METRIC_KIND_INFO = hpcrun_metrics_new_kind();
+
+  for (int i = 0; i < total; ++i) {
+    metric_ids[i] = hpcrun_set_new_metric_desc_and_period(
+      GPU_COUNTER_METRIC_KIND_INFO, counter_name[i], counter_desc[i],
+      MetricFlags_ValFmt_Int, 1, metric_property_none
+    );
+  }
+
+  hpcrun_close_kind(GPU_COUNTER_METRIC_KIND_INFO);
+}
+
+
+// Enable the GXFER (GPU transfer) metric kind.
+// Currently a no-op: the registration sequence below is commented out,
+// so calling this function registers no metrics.
+void
+gpu_metrics_GXFER_enable
+(
+void
+)
+{
+//#undef CURRENT_METRIC
+//#define CURRENT_METRIC GXFER
+
+ //INITIALIZE_METRIC_KIND();
+
+ //FORALL_GXFER(INITIALIZE_SCALAR_METRIC_INT)
+
+ //FINALIZE_METRIC_KIND();
+}
diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h
index 6e05548fb1..b997487198 100644
--- a/src/tool/hpcrun/gpu/gpu-metrics.h
+++ b/src/tool/hpcrun/gpu/gpu-metrics.h
@@ -89,6 +89,15 @@ typedef enum {
} gpu_lmem_ops_t;
+// Indices of the GPU link transfer (GXFER) metrics; the per-metric names
+// and descriptions are listed in FORALL_GXFER.
+typedef enum {
+GPU_XFER_XMIT = 0, // "GXFER:XMIT (B)"
+GPU_XFER_XRCV = 1, // "GXFER:XRCV (B)"
+GPU_XFER_XMIT_TP = 2, // "GXFER:XMIT_TP (GB)"
+GPU_XFER_XRCV_TP = 3, // "GXFER:XRCV_TP (GB)"
+GPU_XFER_XMIT_COUNT = 4, // "GXFER:XMIT_COUNT"
+GPU_XFER_XRCV_COUNT = 5 // "GXFER:XRCV_COUNT"
+} gpu_xfer_ops_t;
+
//--------------------------------------------------------------------------
// indexed metrics
@@ -329,7 +338,6 @@ typedef enum {
"GPU kernel: launch count") \
macro("GKER:OCC_THR", GPU_KINFO_OCCUPANCY_THR, \
"GPU kernel: theoretical occupancy (FGP_ACT / FGP_MAX)") \
-
// gpu implicit copy
#define FORALL_GICOPY(macro) \
@@ -387,6 +395,21 @@ typedef enum {
FORALL_GSAMP_REAL(macro)
+// gpu transfer information
+//
+// Applies 'macro' to each GPU link transfer metric as
+// (metric name, gpu_xfer_ops_t index, description).
+// The XRCV_COUNT description used to read "GPU kernel: launch count
+// received", a leftover from the kernel metric list; it now names the
+// GPU link like its siblings.
+#define FORALL_GXFER(macro)					\
+  macro("GXFER:XMIT (B)", GPU_XFER_XMIT,			\
+	"GPU link total data transmited")			\
+  macro("GXFER:XRCV (B)", GPU_XFER_XRCV,			\
+	"GPU link total data received")				\
+  macro("GXFER:XMIT_TP (GB)", GPU_XFER_XMIT_TP,			\
+	"GPU link total transmit throughput")			\
+  macro("GXFER:XRCV_TP (GB)", GPU_XFER_XRCV_TP,			\
+	"GPU link total received throughput")			\
+  macro("GXFER:XMIT_COUNT", GPU_XFER_XMIT_COUNT,		\
+	"GPU link launch count transmited")			\
+  macro("GXFER:XRCV_COUNT", GPU_XFER_XRCV_COUNT,		\
+	"GPU link launch count received")
+
//******************************************************************************
// interface operations
@@ -452,6 +475,11 @@ gpu_metrics_GSAMP_enable
void
);
+void
+gpu_metrics_GXFER_enable
+(
+void
+);
//--------------------------------------------------
// record global memory access statistics
@@ -486,6 +514,24 @@ gpu_metrics_GBR_enable
);
+//--------------------------------------------------
+// record GPU hardware counters
+//--------------------------------------------------
+
+// Unlike other GPU metric types, which may have up to a dozen metrics,
+// GPU hardware counters may have a few hundred metrics.
+// So, we should only create counter metrics for the ones that are
+// requested at the command line.
+//
+// Arguments, in order: the number of counters, an array of counter
+// names, and an array of counter descriptions.
+void
+gpu_metrics_GPU_CTR_enable
+(
+ int,
+ const char**,
+ const char**
+);
+
+
+
//--------------------------------------------------
// attribute GPU measurements to an application
// thread's calling context tree
diff --git a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c
index 1c4a937374..361262069b 100644
--- a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c
+++ b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c
@@ -60,6 +60,15 @@ gpu_monitoring_thread_activities_ready
void
)
{
- gpu_correlation_channel_set_consume();
+ gpu_correlation_channel_set_consume_with_idx(0);
+}
+
+// Indexed variant of gpu_monitoring_thread_activities_ready: forwards
+// idx unchanged to gpu_correlation_channel_set_consume_with_idx.
+// NOTE(review): idx presumably selects a correlation channel set --
+// confirm against gpu_correlation_channel_set_consume_with_idx.
+void
+gpu_monitoring_thread_activities_ready_with_idx
+(
+ int idx
+)
+{
+ gpu_correlation_channel_set_consume_with_idx(idx);
}
diff --git a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h
index 881667601e..c3d02d4c82 100644
--- a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h
+++ b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h
@@ -57,5 +57,12 @@ gpu_monitoring_thread_activities_ready
);
+// Same as gpu_monitoring_thread_activities_ready, but the caller
+// supplies the index passed on to the correlation channel consumer.
+void
+gpu_monitoring_thread_activities_ready_with_idx
+(
+ int idx
+);
+
+
#endif
diff --git a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
index 9254575f5f..cba3cef38b 100644
--- a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c
@@ -93,6 +93,7 @@ gpu_init_operation_channel(){
}
+// OpenCL Monitoring thread
static void *
gpu_operation_record
(
@@ -133,9 +134,11 @@ gpu_operation_multiplexer_create
gpu_operation_channel_set_alloc(max_completion_cb_threads);
- // You are the first to create monitor thread
+ monitor_disable_new_threads();
+ // Create monitor thread
pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_operation_record,
NULL);
+ monitor_enable_new_threads();
}
diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
index ea6827159f..33a9079000 100644
--- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
+++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c
@@ -54,6 +54,8 @@
#include "gpu-trace-demultiplexer.h"
#include "gpu-print.h"
+#include
+
//******************************************************************************
// type declarations
@@ -96,8 +98,11 @@ gpu_trace_channel_set_create
new_channel_set->channel_set_ptr = gpu_trace_channel_set_alloc(streams_per_thread);
atomic_store(&new_channel_set->channel_index, 0);
+ monitor_disable_new_threads();
+ // Create tracing thread
pthread_create(&new_channel_set->thread, NULL, (pthread_start_routine_t) gpu_trace_record,
new_channel_set);
+ monitor_enable_new_threads();
return new_channel_set;
}
diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c
index bdf00173b7..d548dae439 100644
--- a/src/tool/hpcrun/gpu/gpu-trace.c
+++ b/src/tool/hpcrun/gpu/gpu-trace.c
@@ -195,19 +195,6 @@ gpu_trace_cct_insert_context
}
-static uint64_t
-gpu_trace_time
-(
- uint64_t gpu_time
-)
-{
- // return time in ns
- uint64_t time = gpu_time;
-
- return time;
-}
-
-
static void
gpu_trace_stream_append
(
@@ -371,6 +358,7 @@ gpu_trace_fini
}
+// Tracing thread
void *
gpu_trace_record
(
@@ -380,6 +368,7 @@ gpu_trace_record
gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args;
hpcrun_thread_init_mem_pool_once(0, NULL, false, true);
+ atomic_fetch_add(&active_streams_counter, 1);
while (!atomic_load(&stop_trace_flag)) {
//getting data from a trace channel
@@ -408,9 +397,6 @@ gpu_trace_create
monitor_disable_new_threads();
trace->thread = gpu_trace_demultiplexer_push(trace->trace_channel);
- atomic_fetch_add(&active_streams_counter, 1);
-
+ // Keep the enable that pairs with the disable above: when the
+ // demultiplexer reuses an existing channel set no thread is created,
+ // and nothing else would turn thread monitoring back on for the caller.
monitor_enable_new_threads();
return trace;
}
@@ -450,8 +436,8 @@ consume_one_trace_item
cct_node_t *leaf = gpu_trace_cct_insert_context(td, call_path);
- uint64_t start = gpu_trace_time(start_time);
- uint64_t end = gpu_trace_time(end_time);
+ uint64_t start = start_time;
+ uint64_t end = end_time;
stream_start_set(start_time);
@@ -480,8 +466,11 @@ consume_one_trace_item
if (append) {
gpu_trace_stream_append(td, leaf, start);
- gpu_trace_stream_append(td, no_activity, end + 1);
- PRINT("%p Append trace activity [%lu, %lu]\n", td, start, end);
+ // note: adding 1 to end makes sense. however, with AMD OMPT, this
+ // causes adjacent events to share a timestamp. so, don't add 1.
+ gpu_trace_stream_append(td, no_activity, end);
+
+ PRINT("%p Append trace activity [%lu, %lu)\n", td, start, end);
}
}
diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
index a4e93c46ac..0b9ddea71a 100644
--- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
+++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c
@@ -243,7 +243,6 @@ writeBinary
}
}
-
static size_t
computeHash
(
@@ -264,7 +263,6 @@ computeHash
return used;
}
-
static void
computeBinaryHash
(
diff --git a/src/tool/hpcrun/gpu/nvidia/cuda-api.c b/src/tool/hpcrun/gpu/nvidia/cuda-api.c
index 8240f9cab5..30a5dc035a 100644
--- a/src/tool/hpcrun/gpu/nvidia/cuda-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cuda-api.c
@@ -185,6 +185,7 @@ CUDA_RUNTIME_FN
// private operations
//******************************************************************************
+
int
cuda_bind
(
@@ -291,6 +292,7 @@ cuda_runtime_version
// interface operations
//******************************************************************************
+
int
cuda_context
(
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c b/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c
index 170f35d20a..406b043747 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c
@@ -587,7 +587,7 @@ cupti_activity_translate
case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO:
convert_pcsampling_record_info
- (ga, (CUpti_ActivityPCSamplingRecordInfo *)activity);
+ (ga, (CUpti_ActivityPCSamplingRecordInfo *)activity);
break;
case CUPTI_ACTIVITY_KIND_MEMCPY2:
diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
index 8358ab161b..0cccbe4122 100644
--- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c
+++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c
@@ -162,6 +162,7 @@ flush_alarm_handler(int sig, siginfo_t* siginfo, void* context)
#include
#include // hpcrun_force_dlopen
#include
+#include
#include
#include
@@ -177,20 +178,31 @@ flush_alarm_handler(int sig, siginfo_t* siginfo, void* context)
#include
+#include
+
#include "cuda-api.h"
#include "cupti-api.h"
#include "cupti-gpu-api.h"
#include "cubin-hash-map.h"
#include "cubin-id-map.h"
+#include "tool_state.h"
+
+//#include "sample_sources_all.h"
//******************************************************************************
// macros
//******************************************************************************
-#define CUPTI_LIBRARY_LOCATION "lib64/libcupti.so"
-#define CUPTI_PATH_FROM_CUDA "extras/CUPTI/"
+
+#define DEBUG 0
+#include
+
+
+#define CUPTI_LIBRARY_LOCATION "/lib64/libcupti.so"
+#define CUPTI_PATH_FROM_CUDA "extras/CUPTI"
+
#define HPCRUN_CUPTI_ACTIVITY_BUFFER_SIZE (16 * 1024 * 1024)
#define HPCRUN_CUPTI_ACTIVITY_BUFFER_ALIGNMENT (8)
@@ -853,6 +865,23 @@ ensure_kernel_ip_present
}
+// Run the registered GPU monitors for an API-enter event on cct_node.
+// The call is bracketed by a push/pop of IGNORE_CORR_ID on the CUPTI
+// correlation id stack.
+// NOTE(review): presumably this marks any GPU activity triggered by the
+// monitors themselves as ignorable -- confirm with the activity consumer.
+static void
+cupti_gpu_monitors_apply_enter(cct_node_t *cct_node)
+{
+ cupti_correlation_id_push(IGNORE_CORR_ID);
+ gpu_monitors_apply( cct_node, gpu_monitor_type_enter);
+ cupti_correlation_id_pop();
+}
+
+
+// Run the registered GPU monitors for an API-exit event (no cct node is
+// passed). The call is bracketed by a push/pop of IGNORE_CORR_ID on the
+// CUPTI correlation id stack, mirroring cupti_gpu_monitors_apply_enter.
+static void
+cupti_gpu_monitors_apply_exit(void)  // (void): a real prototype, so stray arguments are diagnosed
+{
+  cupti_correlation_id_push(IGNORE_CORR_ID);
+  gpu_monitors_apply(NULL, gpu_monitor_type_exit);
+  cupti_correlation_id_pop();
+}
+
static void
cupti_subscriber_callback
(
@@ -862,6 +891,11 @@ cupti_subscriber_callback
const void *cb_info
)
{
+
+ if (is_tool_active()) {
+ return;
+ }
+
if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
const CUpti_ResourceData *rd = (const CUpti_ResourceData *) cb_info;
if (cb_id == CUPTI_CBID_RESOURCE_MODULE_LOADED) {
@@ -889,6 +923,7 @@ cupti_subscriber_callback
cupti_stop_flag_set();
const CUpti_CallbackData *cd = (const CUpti_CallbackData *) cb_info;
+ PRINT("\nDriver API: -----------------%s\n", cd->functionName );
bool ompt_runtime_api_flag = ompt_runtime_status_get();
@@ -1043,11 +1078,15 @@ cupti_subscriber_callback
default:
break;
}
- bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags,
- gpu_placeholder_type_kernel);
+
+ bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags,gpu_placeholder_type_kernel);
+
+// PRINT("DRIVER: is_valid_op = %d \t is_kernel = %d \t cupti_runtime_api_flag = %d \t ompt_runtime_api_flag = %d | callback_site = %d\n",
+// is_valid_op, is_kernel_op, cupti_runtime_api_flag, ompt_runtime_api_flag, cd->callbackSite);
+
// If we have a valid operation and is not in the interval of a cuda/ompt runtime api
if (is_valid_op && !cupti_runtime_api_flag && !ompt_runtime_api_flag) {
- if (cd->callbackSite == CUPTI_API_ENTER) {
+ if (cd->callbackSite == CUPTI_API_ENTER) {
// A driver API cannot be implemented by other driver APIs, so we get an id
// and unwind when the API is entered
@@ -1073,19 +1112,26 @@ cupti_subscriber_callback
ensure_kernel_ip_present(trace_ph, kernel_ip);
}
+
hpcrun_safe_exit();
// Generate notification entry
uint64_t cpu_submit_time = hpcrun_nanotime();
- gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
+
+
+ cupti_gpu_monitors_apply_enter(api_node);
+
+ gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
cpu_submit_time);
TMSG(CUPTI_TRACE, "Driver push externalId %lu (cb_id = %u)", correlation_id, cb_id);
} else if (cd->callbackSite == CUPTI_API_EXIT) {
+ cupti_gpu_monitors_apply_exit();
+
uint64_t correlation_id __attribute__((unused)); // not used if PRINT omitted
correlation_id = cupti_correlation_id_pop();
TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id);
- }
+ }
} else if (is_kernel_op && cupti_runtime_api_flag && cd->callbackSite ==
CUPTI_API_ENTER) {
if (cupti_kernel_ph != NULL) {
@@ -1106,6 +1152,7 @@ cupti_subscriber_callback
cupti_stop_flag_set();
const CUpti_CallbackData *cd = (const CUpti_CallbackData *)cb_info;
+ PRINT("\nRuntime API: -----------------%s\n", cd->functionName );
bool is_valid_op = false;
bool is_kernel_op __attribute__((unused)) = false; // used only by PRINT when debugging
@@ -1200,12 +1247,17 @@ cupti_subscriber_callback
default:
break;
}
+
+// PRINT("RUNTIME: is_valid_op = %d \t is_kernel = %d \t cupti_runtime_api_flag = %d \t ompt_runtime_api_flag = %d | callback_site = %d\n",
+// is_valid_op, is_kernel_op, cupti_runtime_api_flag, ompt_runtime_status_get(), cd->callbackSite);
+
if (is_valid_op) {
if (cd->callbackSite == CUPTI_API_ENTER) {
// Enter a CUDA runtime api
cupti_runtime_api_flag_set();
uint64_t correlation_id = gpu_correlation_id();
cupti_correlation_id_push(correlation_id);
+
// We should make notification records in the api enter callback.
// A runtime API must be implemented by driver APIs.
// Though unlikely in most cases,
@@ -1226,11 +1278,16 @@ cupti_subscriber_callback
// Generate notification entry
uint64_t cpu_submit_time = hpcrun_nanotime();
- gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
+
+ cupti_gpu_monitors_apply_enter(cupti_kernel_ph);
+
+ gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts,
cpu_submit_time);
TMSG(CUPTI_TRACE, "Runtime push externalId %lu (cb_id = %u)", correlation_id, cb_id);
} else if (cd->callbackSite == CUPTI_API_EXIT) {
+
+ cupti_gpu_monitors_apply_exit();
// Exit an CUDA runtime api
cupti_runtime_api_flag_unset();
@@ -1348,7 +1405,7 @@ cupti_buffer_completion_callback
do {
status = cupti_buffer_cursor_advance(buffer, validSize, &cupti_activity);
if (status) {
- cupti_activity_process(cupti_activity);
+ cupti_activity_process(cupti_activity);
++processed;
}
} while (status);
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
new file mode 100644
index 0000000000..4f89e2c37c
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c
@@ -0,0 +1,309 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+
+//******************************************************************************
+// Description:
+// Read fields from an ompt_record_ompt_t and assign them to a
+// GPU-independent gpu_activity_t.
+//
+// This interface is only used by the CUPTI GPU monitoring thread.
+// It is thread-safe as long as it does not access details structures
+// shared by worker threads.
+//******************************************************************************
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include "ompt-activity-translate.h"
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+// Fallback translation for records with no GPU-independent equivalent:
+// report correlation id 0 and tag the activity as unknown.
+// The record argument is not examined.
+static void
+convert_unknown
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  *cid_ptr = 0;
+  ga->kind = GPU_ACTIVITY_UNKNOWN;
+}
+
+
+// Translation for pointer associate/disassociate data operations.
+// No GPU activity is produced for them: the activity is tagged as
+// unknown and the reported correlation id is 0. The record argument is
+// not examined.
+static void
+convert_ptrop
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  *cid_ptr = 0;
+  ga->kind = GPU_ACTIVITY_UNKNOWN;
+}
+
+
+// Translate a target-region record. No fields of the record are copied
+// at present: the activity is tagged as unknown and the correlation id
+// reported through cid_ptr is 0.
+static void
+convert_target
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+ // view of the target payload; currently unused (hence the attribute)
+ ompt_record_target_t *t __attribute__((unused)) = &r->record.target;
+
+ ga->kind = GPU_ACTIVITY_UNKNOWN;
+ *cid_ptr = 0;
+}
+
+
+// Fill in a GPU_ACTIVITY_MEMORY activity from a target data-op record.
+//
+// ga:      activity to populate
+// r:       OMPT record whose target_data_op payload is read
+// mem_op:  memory operation to record (supplied by the caller)
+// cid_ptr: receives the record's host_op_id as the correlation id
+//
+// The memory kind is always recorded as GPU_MEM_UNKNOWN.
+static void
+convert_memory
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ gpu_mem_op_t mem_op,
+ uint64_t *cid_ptr
+)
+{
+  ompt_record_target_data_op_t *data_op = &r->record.target_data_op;
+
+  // the host op id is both the out-parameter and the stored correlation id
+  *cid_ptr = data_op->host_op_id;
+
+  ga->kind = GPU_ACTIVITY_MEMORY;
+  ga->details.memory.mem_op = mem_op;
+  ga->details.memory.bytes = data_op->bytes;
+  ga->details.memory.correlation_id = data_op->host_op_id;
+  ga->details.memory.memKind = GPU_MEM_UNKNOWN;
+}
+
+
+// Translate a device allocation record: a memory activity whose
+// operation is GPU_MEM_OP_ALLOC (see convert_memory).
+static void
+convert_alloc
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+ convert_memory(ga, r, GPU_MEM_OP_ALLOC, cid_ptr);
+}
+
+
+// Translate a device deallocation record: a memory activity whose
+// operation is GPU_MEM_OP_DELETE (see convert_memory).
+static void
+convert_delete
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+ convert_memory(ga, r, GPU_MEM_OP_DELETE, cid_ptr);
+}
+
+
+// Map an OMPT data-transfer operation to a GPU-independent copy kind:
+// transfers to the device (sync or async) are host-to-device, transfers
+// from the device (sync or async) are device-to-host, and every other
+// operation maps to GPU_MEMCPY_UNK.
+static gpu_memcpy_type_t
+convert_memcpy_type
+(
+ ompt_target_data_op_t kind
+)
+{
+  gpu_memcpy_type_t copy_kind = GPU_MEMCPY_UNK;
+
+  if (kind == ompt_target_data_transfer_to_device ||
+      kind == ompt_target_data_transfer_to_device_async) {
+    copy_kind = GPU_MEMCPY_H2D;
+  } else if (kind == ompt_target_data_transfer_from_device ||
+             kind == ompt_target_data_transfer_from_device_async) {
+    copy_kind = GPU_MEMCPY_D2H;
+  }
+
+  return copy_kind;
+}
+
+
+// Fill in a GPU_ACTIVITY_MEMCPY activity from a target data-op record.
+//
+// ga:      activity to populate
+// r:       OMPT record whose target_data_op payload is read
+// cid_ptr: receives the record's host_op_id as the correlation id
+static void
+convert_memcpy
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ompt_record_target_data_op_t *data_op = &r->record.target_data_op;
+
+  // the host op id is both the out-parameter and the stored correlation id
+  *cid_ptr = data_op->host_op_id;
+
+  ga->kind = GPU_ACTIVITY_MEMCPY;
+  ga->details.memcpy.copyKind = convert_memcpy_type(data_op->optype);
+  ga->details.memcpy.bytes = data_op->bytes;
+  ga->details.memcpy.correlation_id = data_op->host_op_id;
+}
+
+
+// Translate a target data-op record by dispatching on its operation type:
+// transfers become memcpy activities, allocations and deletions become
+// memory activities, pointer (dis)association and anything unrecognized
+// become unknown activities. The activity interval is then set from the
+// record's time and the data op's end_time.
+//
+// ga:      activity to populate
+// r:       OMPT record whose target_data_op payload is read
+// cid_ptr: receives the correlation id chosen by the selected converter
+static void
+convert_target_data_op
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+  ompt_record_target_data_op_t *d = &r->record.target_data_op;
+
+  switch(d->optype) {
+
+  // the async transfer variants are dispatched like the sync ones, as is
+  // already done for alloc/delete; convert_memcpy_type maps them to
+  // H2D / D2H, so they must not fall through to the unknown case
+  case ompt_target_data_transfer_to_device_async:
+  case ompt_target_data_transfer_to_device:
+  case ompt_target_data_transfer_from_device_async:
+  case ompt_target_data_transfer_from_device:
+    convert_memcpy(ga, r, cid_ptr);
+    break;
+
+  case ompt_target_data_alloc_async:
+  case ompt_target_data_alloc:
+    convert_alloc(ga, r, cid_ptr);
+    break;
+
+  case ompt_target_data_delete_async:
+  case ompt_target_data_delete:
+    convert_delete(ga, r, cid_ptr);
+    break;
+
+  case ompt_target_data_associate:
+  case ompt_target_data_disassociate:
+    convert_ptrop(ga, r, cid_ptr);
+    break;
+
+  default:
+    convert_unknown(ga, r, cid_ptr);
+    break;
+  }
+
+  // the interval is recorded for every operation type, including unknown
+  gpu_interval_set(&ga->details.interval, r->time, d->end_time);
+}
+
+
+// Translate a target-submit (kernel launch) record into a
+// GPU_ACTIVITY_KERNEL activity. The record's host_op_id is stored as the
+// kernel correlation id and returned through cid_ptr; the activity
+// interval spans the record's time to the kernel's end_time.
+// NOTE(review): unlike the other converters in this file this function
+// is not static, and ompt-activity-translate.h does not declare it --
+// confirm whether external linkage is intended.
+void
+convert_target_submit
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+ ompt_record_target_kernel_t *k = &r->record.target_kernel;
+
+ ga->kind = GPU_ACTIVITY_KERNEL;
+ ga->details.kernel.correlation_id = k->host_op_id;
+ *cid_ptr = k->host_op_id;
+
+ gpu_interval_set(&ga->details.interval, r->time, k->end_time);
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Translate an OMPT trace record into a GPU-independent activity.
+//
+// ga:      activity to fill in; it is zeroed first, so fields that the
+//          selected converter does not set remain 0
+// r:       OMPT record to translate; r->type selects the converter
+// cid_ptr: receives the correlation id of the record (0 for record
+//          types that are translated to an unknown activity)
+void
+ompt_activity_translate
+(
+ gpu_activity_t *ga,
+ ompt_record_ompt_t *r,
+ uint64_t *cid_ptr
+)
+{
+ memset(ga, 0, sizeof(gpu_activity_t));
+ // each callback type is handled together with its _emi variant
+ switch (r->type) {
+
+ case ompt_callback_target:
+ case ompt_callback_target_emi:
+
+ convert_target(ga,r, cid_ptr);
+ break;
+
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi:
+
+ convert_target_data_op(ga,r, cid_ptr);
+ break;
+
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi:
+
+ convert_target_submit(ga,r, cid_ptr);
+ break;
+
+ default:
+ convert_unknown(ga, r, cid_ptr);
+ break;
+ }
+
+
+ // reset the activity's 'next' link
+ cstack_ptr_set(&(ga->next), 0);
+}
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h
new file mode 100644
index 0000000000..30dedb5c01
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h
@@ -0,0 +1,79 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+#ifndef ompt_activity_translate_h
+#define ompt_activity_translate_h
+
+
+//******************************************************************************
+// OpenMP includes
+//******************************************************************************
+
+#include
+
+
+
+//******************************************************************************
+// type declarations
+//******************************************************************************
+
+typedef struct gpu_activity_t gpu_activity_t;
+typedef struct cct_node_t cct_node_t;
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+// Translate an OMPT trace record into a GPU-independent activity.
+//
+// entry:   activity to fill in
+// record:  OMPT record to translate
+// cid_ptr: receives the correlation id associated with the record
+void
+ompt_activity_translate
+(
+ gpu_activity_t *entry,
+ ompt_record_ompt_t *record,
+ uint64_t *cid_ptr
+);
+
+
+
+#endif
diff --git a/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c
new file mode 100644
index 0000000000..e1dae062ec
--- /dev/null
+++ b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c
@@ -0,0 +1,86 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2021, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include
+
+#include
+#include
+#include
+
+#include "ompt-gpu-api.h"
+#include "ompt-activity-translate.h"
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+// Notification hook: forwards to gpu_monitoring_thread_activities_ready() (presumably signals the GPU monitoring thread that activities can be consumed -- confirm).
+void
+ompt_buffer_completion_notify
+(
+ void
+)
+{
+ gpu_monitoring_thread_activities_ready();
+}
+
+
+void
+ompt_activity_process
+(
+ ompt_record_ompt_t *record
+)
+{ // Translate one OMPT record into a gpu_activity_t, make sure its correlation id has a map entry, then process the activity.
+  gpu_activity_t gpu_activity = {0}; // zero-init so no field is indeterminate if the translator skips it (presumably it can for unknown record kinds -- confirm)
+  uint64_t correlation_id = 0;       // defined value for the map lookup below even if the translator does not write the id
+  ompt_activity_translate(&gpu_activity, record, &correlation_id);
+  if (gpu_correlation_id_map_lookup(correlation_id) == NULL) {
+    gpu_correlation_id_map_insert(correlation_id, correlation_id);
+  }
+  gpu_activity_process(&gpu_activity);
+}
diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.h b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h
similarity index 85%
rename from src/tool/hpcrun/gpu/amd/rocm-debug-api.h
rename to src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h
index 9ffacea2a8..cca8cee7af 100644
--- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.h
+++ b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h
@@ -9,7 +9,7 @@
// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
// --------------------------------------------------------------------------
//
-// Copyright ((c)) 2002-2022, Rice University
+// Copyright ((c)) 2002-2021, Rice University
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@@ -41,41 +41,36 @@
//
// ******************************************************* EndRiceCopyright *
-#ifndef rocm_debug_api_h
-#define rocm_debug_api_h
+#ifndef ompt_gpu_api_h
+#define ompt_gpu_api_h
+
+
//******************************************************************************
-// interface operations
+// OpenMP includes
//******************************************************************************
-int
-rocm_debug_api_bind
-(
- void
-);
+#include
+
-void
-rocm_debug_api_init
-(
- void
-);
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
void
-rocm_debug_api_fini
+ompt_buffer_completion_notify
(
- void
+ void
);
+
void
-rocm_debug_api_query_code_object
+ompt_activity_process
(
- size_t* code_obejct_count_ptr
+ ompt_record_ompt_t *record
);
-char*
-rocm_debug_api_query_uri
-(
- size_t code_object_index
-);
+
#endif
diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c
index 91df87455a..98689be8e7 100644
--- a/src/tool/hpcrun/gpu/opencl/opencl-api.c
+++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c
@@ -476,10 +476,10 @@ opencl_operation_multiplexer_push
gpu_activity.details.correlation.host_correlation_id = correlation_id;
gpu_operation_multiplexer_push(obj->details.initiator_channel,
NULL, &gpu_activity);
-
+
// The actual entry
opencl_activity_translate(&gpu_activity, obj, interval);
- gpu_operation_multiplexer_push(obj->details.initiator_channel,
+ gpu_operation_multiplexer_push(obj->details.initiator_channel,
obj->pending_operations, &gpu_activity);
}
@@ -1250,6 +1250,441 @@ opencl_api_thread_finalize
}
+cl_program
+clCreateProgramWithSource
+(
+ cl_context context,
+ cl_uint count,
+ const char** strings,
+ const size_t* lengths,
+ cl_int* errcode_ret
+)
+{
+ ETMSG(OPENCL, "inside clCreateProgramWithSource_wrapper");
+ // NOTE(review): the #if 0 block is disabled code that would dump each source string to a file; as written it never explicitly NUL-terminates 'filename' and calls fclose() once, after the loop.
+#if 0
+ if (strings != NULL && lengths != NULL) {
+ FILE *f_ptr;
+ for (int i = 0; i < (int)count; i++) {
+ // what if a single file has multiple kernels?
+ // we need to add logic to get filenames by reading the strings contents
+ char fileno = '0' + (i + 1); // right now we are naming the files as index numbers
+
+ // TO-DO: AARON using malloc instead of hpcrun_malloc gives extra garbage characters in file name
+ char *filename = (char *)hpcrun_malloc(sizeof(fileno) + 1);
+ *filename = fileno + '\0';
+ f_ptr = fopen(filename, "w");
+ fwrite(strings[i], lengths[i], 1, f_ptr);
+ }
+ fclose(f_ptr);
+ }
+#endif
+ // Pass-through: forward all arguments unchanged to the real clCreateProgramWithSource and return its result.
+ return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret));
+}
+
+#ifdef OPT_ENABLE_IGC
+// Wrapper that appends LINE_TABLE_FLAG to the build options. One downside of this approach is that we override the callback provided by the user (pfn_notify is not forwarded).
+cl_int
+clBuildProgram
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id* device_list,
+ const char* options,
+ void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+ void* user_data
+)
+{
+  ETMSG(OPENCL, "inside clBuildProgram_wrapper");
+  // XXX(Aaron): Caution, what's the maximum length of options?
+  size_t len_options = options == NULL ? 0 : strlen(options);
+  char *options_with_debug_flags = (char *)calloc(len_options + strlen(LINE_TABLE_FLAG) + 1, sizeof(char));
+  if (options_with_debug_flags != NULL) {
+    if (len_options != 0) strcat(options_with_debug_flags, options);
+    strcat(options_with_debug_flags, LINE_TABLE_FLAG);
+  }
+  // if the allocation failed, build with the caller's options instead of dereferencing NULL
+  const char *build_options = options_with_debug_flags != NULL ? options_with_debug_flags : options;
+  cl_int ret = HPCRUN_OPENCL_CALL(clBuildProgram, (program, num_devices, device_list, build_options, clBuildProgramCallback, user_data));
+  free(options_with_debug_flags);
+  return ret;
+}
+#endif // OPT_ENABLE_IGC
+
+
+cl_command_queue
+clCreateCommandQueue
+(
+ cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret
+)
+{
+ // Wrapper: OR CL_QUEUE_PROFILING_ENABLE into the caller's properties before creating the queue
+ properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE;
+
+ cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device,
+ properties,errcode_ret));
+ // record the context and associate the new queue with its id; NOTE(review): this also runs when queue creation failed -- confirm that is intended
+ uint32_t context_id = opencl_cl_context_map_update((uint64_t)context);
+ opencl_cl_queue_map_update((uint64_t)queue, context_id);
+
+ return queue;
+}
+
+
+cl_command_queue
+clCreateCommandQueueWithProperties
+(
+ cl_context context,
+ cl_device_id device,
+ const cl_queue_properties* properties,
+ cl_int* errcode_ret
+)
+{
+  // Wrapper for clCreateCommandQueueWithProperties that forces
+  // CL_QUEUE_PROFILING_ENABLE on the new queue. The caller's list is never
+  // modified: a private copy is built, passed to the real entry point and
+  // freed. The queue is then recorded in the context/queue maps.
+
+  // 'properties' is NULL or a 0-terminated list of <name, value> pairs.
+  // Walk it pair by pair so that a value (which may legitimately be 0 or
+  // equal to some property name) is never mistaken for a name/terminator.
+  int queue_props_id = -1;
+  int props_count = 0;
+  if (properties != NULL) {
+    while (properties[props_count] != 0) {
+      if (properties[props_count] == CL_QUEUE_PROPERTIES) {
+        queue_props_id = props_count;
+      }
+      props_count += 2;
+    }
+  }
+
+  // Room for the caller's pairs, one appended <CL_QUEUE_PROPERTIES, value>
+  // pair, and the terminator.
+  const cl_queue_properties *call_properties = properties;
+  cl_queue_properties *queue_properties =
+    (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * (props_count + 3));
+  if (queue_properties != NULL) {
+    for (int i = 0; i < props_count; ++i) {
+      queue_properties[i] = properties[i];
+    }
+    if (queue_props_id >= 0) {
+      // We do have a queue property entry, just enable profiling
+      queue_properties[queue_props_id + 1] |= CL_QUEUE_PROFILING_ENABLE;
+      queue_properties[props_count] = 0;
+    } else {
+      // We do not have a queue property entry, append one
+      queue_properties[props_count] = CL_QUEUE_PROPERTIES;
+      queue_properties[props_count + 1] = CL_QUEUE_PROFILING_ENABLE;
+      queue_properties[props_count + 2] = 0;
+    }
+    call_properties = queue_properties;
+  }
+  // If malloc failed we fall back to the caller's list (profiling may then be off).
+  cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueueWithProperties, (context, device, call_properties, errcode_ret));
+  free(queue_properties); // free(NULL) is a no-op
+
+  uint32_t context_id = opencl_cl_context_map_update((uint64_t)context);
+  opencl_cl_queue_map_update((uint64_t)queue, context_id);
+  return queue;
+}
+
+// Wrapper for clEnqueueNDRangeKernel: creates a GPU_ACTIVITY_KERNEL tracking object, forwards the launch, and registers a completion callback on the launch's event.
+cl_int
+clEnqueueNDRangeKernel
+(
+ cl_command_queue command_queue,
+ cl_kernel ocl_kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
+)
+{
+ opencl_object_t *kernel_info = opencl_malloc_kind(GPU_ACTIVITY_KERNEL);
+ INITIALIZE_CALLBACK_INFO(initializeKernelCallBackInfo, kernel_info, (kernel_info, command_queue))
+
+ opencl_subscriber_callback(kernel_info);
+ // eventp is set by SET_EVENT_POINTER from 'event' and kernel_info (presumably the caller's event if given, else an internal one -- confirm against the macro)
+ cl_event *eventp = NULL;
+ SET_EVENT_POINTER(eventp, event, kernel_info)
+
+ cl_int return_status =
+ HPCRUN_OPENCL_CALL(clEnqueueNDRangeKernel, (command_queue, ocl_kernel, work_dim,
+ global_work_offset, global_work_size, local_work_size,
+ num_events_in_wait_list, event_wait_list, eventp));
+
+ ETMSG(OPENCL, "Registering callback for kind: Kernel. "
+ "Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id);
+ // NOTE(review): the callback is registered on *eventp even if return_status is not CL_SUCCESS -- confirm *eventp is valid in that case
+ clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+ &opencl_activity_completion_callback, kernel_info);
+ return return_status;
+}
+
+// Wrapper for clEnqueueTask: creates a GPU_ACTIVITY_KERNEL tracking object, forwards the call, and registers a completion callback on the resulting event.
+// this is a simplified version of clEnqueueNDRangeKernel, TODO: check if code duplication can be avoided
+cl_int
+clEnqueueTask
+(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event
+)
+{
+ opencl_object_t *kernel_info = opencl_malloc_kind(GPU_ACTIVITY_KERNEL);
+ INITIALIZE_CALLBACK_INFO(initializeKernelCallBackInfo, kernel_info, (kernel_info, command_queue))
+
+ opencl_subscriber_callback(kernel_info);
+
+ cl_event *eventp = NULL;
+ SET_EVENT_POINTER(eventp, event, kernel_info);
+
+ cl_int return_status =
+ HPCRUN_OPENCL_CALL(clEnqueueTask, (command_queue, kernel,
+ num_events_in_wait_list, event_wait_list, eventp));
+
+ ETMSG(OPENCL, "Registering callback for kind: Kernel. "
+ "Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id);
+ // NOTE(review): the callback is registered on *eventp even if return_status is not CL_SUCCESS -- confirm *eventp is valid in that case
+ clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+ &opencl_activity_completion_callback, kernel_info);
+ return return_status;
+}
+
+// Wrapper for clEnqueueReadBuffer: records a device-to-host copy of 'cb' bytes, forwards the call, and registers a completion callback on the resulting event.
+cl_int
+clEnqueueReadBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
+)
+{
+  ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper");
+
+  opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY);
+  INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_D2H, cb, command_queue))
+
+  opencl_subscriber_callback(cpy_info);
+
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, cpy_info);
+
+  cl_int return_status =
+    HPCRUN_OPENCL_CALL(clEnqueueReadBuffer,
+                       (command_queue, buffer, blocking_read, offset,
+                        cb, ptr, num_events_in_wait_list, event_wait_list, eventp));
+  // the byte count below is cast to long, so it is printed with %ld (not %d)
+  ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
+        "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
+  ETMSG(OPENCL, "%ld(bytes) of data being transferred from device to host",
+        (long)cb);
+
+  // NOTE(review): the callback is registered on *eventp even if return_status is not CL_SUCCESS -- confirm *eventp is valid in that case
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                             &opencl_activity_completion_callback, cpy_info);
+
+  return return_status;
+}
+
+// Wrapper for clEnqueueWriteBuffer: records a host-to-device copy of 'cb' bytes, forwards the call, and registers a completion callback on the resulting event.
+cl_int
+clEnqueueWriteBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t cb,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event
+)
+{
+  ETMSG(OPENCL, "inside clEnqueueWriteBuffer wrapper. cl_mem buffer: %p", buffer);
+  opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY);
+  INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_H2D, cb, command_queue))
+
+  opencl_subscriber_callback(cpy_info);
+
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, cpy_info);
+
+  cl_int return_status =
+    HPCRUN_OPENCL_CALL(clEnqueueWriteBuffer,
+                       (command_queue, buffer, blocking_write, offset, cb, ptr,
+                        num_events_in_wait_list, event_wait_list, eventp));
+  // the byte count below is cast to long, so it is printed with %ld (not %d)
+  ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
+        "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
+  ETMSG(OPENCL, "%ld(bytes) of data being transferred from host to device",
+        (long)cb);
+  // NOTE(review): the callback is registered on *eventp even if return_status is not CL_SUCCESS -- confirm *eventp is valid in that case
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                             &opencl_activity_completion_callback,
+                             (void*) cpy_info);
+
+  return return_status;
+}
+
+// Wrapper for clEnqueueMapBuffer: records the mapping as a memcpy activity of 'size' bytes, forwards the call, and registers a completion callback on the resulting event.
+void*
+clEnqueueMapBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list,
+ cl_event* event,
+ cl_int* errcode_ret
+)
+{
+  ETMSG(OPENCL, "inside clEnqueueMapBuffer wrapper");
+
+  opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY);
+  if (map_flags == CL_MAP_READ) {
+    INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_D2H, size, command_queue));
+  } else {
+    //map_flags == CL_MAP_WRITE || map_flags == CL_MAP_WRITE_INVALIDATE_REGION (NOTE(review): any other value, e.g. a combination of flags, also lands here because the test above is an equality)
+    INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_H2D, size, command_queue));
+  }
+
+  opencl_subscriber_callback(cpy_info);
+
+  cl_event *eventp = NULL;
+  SET_EVENT_POINTER(eventp, event, cpy_info);
+
+  void *map_ptr =
+    HPCRUN_OPENCL_CALL(clEnqueueMapBuffer,
+                       (command_queue, buffer, blocking_map, map_flags, offset,
+                        size, num_events_in_wait_list, event_wait_list, eventp, errcode_ret));
+  // the byte counts below are cast to long, so they are printed with %ld (not %d)
+  if (map_flags == CL_MAP_READ) {
+    ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. "
+          "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
+    ETMSG(OPENCL, "%ld(bytes) of data being transferred from device to host",
+          (long)size);
+  } else {
+    ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. "
+          "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id);
+    ETMSG(OPENCL, "%ld(bytes) of data being transferred from host to device",
+          (long)size);
+  }
+  // NOTE(review): the callback is registered on *eventp even if the map call failed -- confirm *eventp is valid in that case
+  clSetEventCallback_wrapper(*eventp, CL_COMPLETE,
+                             &opencl_activity_completion_callback, cpy_info);
+
+  return map_ptr;
+}
+
+// Wrapper for clCreateBuffer: times the real call on the CPU clock and reports it as a GPU_ACTIVITY_MEMORY activity.
+cl_mem
+clCreateBuffer
+(
+ cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void* host_ptr,
+ cl_int* errcode_ret
+)
+{
+  ETMSG(OPENCL, "clCreateBuffer flags: %"PRIu64 ", size: %"PRIu64 "", (uint64_t)flags, (uint64_t)size);
+  // (both values are cast to uint64_t so that each argument matches its PRIu64 conversion)
+  opencl_object_t *mem_info = opencl_malloc_kind(GPU_ACTIVITY_MEMORY);
+  INITIALIZE_CALLBACK_INFO(initializeMemoryCallBackInfo, mem_info, (mem_info, flags, size))
+
+  opencl_subscriber_callback(mem_info);
+  // bracket the real call with CPU_NANOTIME() to form the activity interval
+  gpu_interval_t interval;
+  interval.start = CPU_NANOTIME();
+  cl_mem buffer =
+    HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret));
+  interval.end = CPU_NANOTIME();
+  // report the interval for this allocation, then release the tracking object
+  opencl_operation_multiplexer_push(interval, mem_info, mem_info->details.mem_cb.correlation_id);
+
+  opencl_free(mem_info);
+
+  return buffer;
+}
+
+// Pass-through wrapper: forwards all arguments unchanged to the real clSetKernelArg and returns its status.
+cl_int
+clSetKernelArg
+(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void* arg_value
+)
+{
+ return HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value));
+}
+
+// Sets the 'instrumentation' flag (declared outside this hunk) to true.
+void
+opencl_instrumentation_enable
+(
+ void
+)
+{
+ instrumentation = true;
+}
+
+// Per-thread finalization: if this thread set opencl_api_flag, flush its operations, wait for its pending ones, then attribute its activities.
+void
+opencl_api_thread_finalize
+(
+ void *args
+)
+{
+ if (opencl_api_flag) {
+ // If I have invoked any opencl api, I have to attribute all my activities to my ccts
+ opencl_api_flag = false;
+ // Build a GPU_ACTIVITY_FLUSH activity that carries a pointer to the local 'wait' flag
+ atomic_bool wait;
+ atomic_store(&wait, true);
+ gpu_activity_t gpu_activity;
+ memset(&gpu_activity, 0, sizeof(gpu_activity_t));
+
+ gpu_activity.kind = GPU_ACTIVITY_FLUSH;
+ gpu_activity.details.flush.wait = &wait;
+ gpu_operation_multiplexer_push(gpu_activity_channel_get(), NULL, &gpu_activity);
+ // NOTE(review): 'args' is unused; the loop below spins until something else stores false into 'wait' (presumably the consumer of the flush activity -- confirm)
+ // Wait until operations are drained
+ // Operation channel is FIFO
+ while (atomic_load(&wait)) {}
+
+ // Wait until my activities are drained
+ opencl_wait_for_self_pending_operations();
+
+ // Now I can attribute activities
+ gpu_application_thread_process_activities();
+ }
+}
+
+
void
opencl_api_process_finalize
(
diff --git a/src/tool/hpcrun/loadmap.c b/src/tool/hpcrun/loadmap.c
index 6491d70029..a3b2d463f6 100644
--- a/src/tool/hpcrun/loadmap.c
+++ b/src/tool/hpcrun/loadmap.c
@@ -145,7 +145,7 @@ hpcrun_dso_make(const char* name, void** table,
TMSG(DSO," hpcrun_dso_make for module %s", name);
int namelen = strlen(name) + 1;
- x->name = (char*) hpcrun_malloc(namelen);
+ x->name = (char*) malloc(namelen);
strcpy(x->name, name);
x->table = table;
diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c
index ef3f54720a..b229e51531 100644
--- a/src/tool/hpcrun/main.c
+++ b/src/tool/hpcrun/main.c
@@ -78,6 +78,7 @@
#include
#include
+#include
#include "main.h"
@@ -221,7 +222,6 @@ bool hpcrun_no_unwind = false;
*****************************************************************************/
static __thread bool hpcrun_thread_suppress_sample = true;
-
//***************************************************************************
// local variables
//***************************************************************************
@@ -244,6 +244,9 @@ static hpcrun_aux_cleanup_t * hpcrun_aux_cleanup_free_list_head = NULL;
static char execname[PATH_MAX] = {'\0'};
static int monitor_fini_process_how = 0;
+static atomic_int ms_init_started = ATOMIC_VAR_INIT(0);
+static atomic_int ms_init_completed = ATOMIC_VAR_INIT(0);
+
//***************************************************************************
// Interface functions for suppressing samples
@@ -422,7 +425,7 @@ abort_timeout_handler(int sig, siginfo_t* siginfo, void* context)
static void
hpcrun_set_abort_timeout()
{
- static process_index = 0;
+ static int process_index = 0;
char *abort_timeout = getenv("HPCRUN_ABORT_TIMEOUT");
@@ -799,12 +802,13 @@ hpcrun_thread_init(int id, local_thread_data_t* local_thread_data, bool has_trac
epoch_t* epoch = TD_GET(core_profile_trace_data.epoch);
- if (! hpcrun_thread_suppress_sample) {
- // handle event sets for sample sources
- SAMPLE_SOURCES(gen_event_set,lush_metrics);
+ if (! hpcrun_thread_suppress_sample ) {
// sample sources take thread specific action prior to start (often is a 'registration' action);
SAMPLE_SOURCES(thread_init_action);
+ // handle event sets for sample sources
+ SAMPLE_SOURCES(gen_event_set,lush_metrics);
+
// start the sample sources
SAMPLE_SOURCES(start);
@@ -889,19 +893,19 @@ hpcrun_wait()
//***************************************************************************
-// process control (via libmonitor)
+// hpcrun initialization (process control via libmonitor)
//***************************************************************************
+static
+void hpcrun_prepare_measurement_subsystem(bool is_child);
+
void*
monitor_init_process(int *argc, char **argv, void* data)
{
- char* process_name;
+ const char* process_name;
hpcrun_thread_suppress_sample = false;
- fork_data_t* fork_data = (fork_data_t*) data;
- bool is_child = data && fork_data->is_child;
-
hpcrun_wait();
#ifndef HPCRUN_STATIC_LINK
@@ -925,6 +929,8 @@ monitor_init_process(int *argc, char **argv, void* data)
copy_execname(process_name);
hpcrun_files_set_executable(process_name);
+ TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name);
+
// We initialize the load map and fnbounds before registering sample source.
// This is because sample source init (such as PAPI) may dlopen other libraries,
// which will trigger our library monitoring code and fnbound queries
@@ -938,6 +944,10 @@ monitor_init_process(int *argc, char **argv, void* data)
// We need to initialize messages related functions and set up measurement directory,
// so that we can write vdso and prevent fnbounds print messages to the terminal.
messages_init();
+
+ fork_data_t* fork_data = (fork_data_t*) data;
+ bool is_child = data && fork_data->is_child;
+
if (!hpcrun_get_disabled()) {
hpcrun_files_set_directory();
messages_logfile_create();
@@ -960,53 +970,73 @@ monitor_init_process(int *argc, char **argv, void* data)
auditor_exports->mainlib_connected(get_saved_vdso_path());
#endif
}
+
+ if (is_child){
+ hpcrun_prepare_measurement_subsystem(is_child);
+ }
- hpcrun_registered_sources_init();
+ return data;
+}
- hpcrun_do_custom_init();
+void
+monitor_at_main() // presumably a libmonitor callback invoked when main() is reached -- confirm against libmonitor
+{
+ bool is_child = false; // this entry point always prepares the subsystem as a non-child process
+ hpcrun_prepare_measurement_subsystem(is_child);
+}
- // for debugging, limit the life of the execution with an alarm.
- char* life = getenv("HPCRUN_LIFETIME");
- if (life != NULL){
- int seconds = atoi(life);
- if (seconds > 0) alarm((unsigned int) seconds);
- }
- // see if unwinding has been turned off
- // the same setting governs whether or not fnbounds is needed or used.
- hpcrun_no_unwind = hpcrun_get_env_bool("HPCRUN_NO_UNWIND");
+static
+void hpcrun_prepare_measurement_subsystem(bool is_child)
+{ // One-time initialization of the measurement subsystem: the first caller runs it, every later caller waits until it has completed.
+ if (atomic_fetch_add(&ms_init_started, 1) == 0){
+ hpcrun_registered_sources_init();
- char* s = getenv(HPCRUN_EVENT_LIST);
+ hpcrun_do_custom_init();
- if (! is_child) {
- hpcrun_sample_sources_from_eventlist(s);
- }
+ // for debugging, limit the life of the execution with an alarm.
+ char* life = getenv("HPCRUN_LIFETIME");
+ if (life != NULL){
+ int seconds = atoi(life);
+ if (seconds > 0) alarm((unsigned int) seconds);
+ }
- hpcrun_set_abort_timeout();
+ // see if unwinding has been turned off
+ // the same setting governs whether or not fnbounds is needed or used.
+ hpcrun_no_unwind = hpcrun_get_env_bool("HPCRUN_NO_UNWIND");
- hpcrun_process_sample_source_none();
+ char* s = getenv(HPCRUN_EVENT_LIST);
- TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name);
+ if (! is_child) {
+ hpcrun_sample_sources_from_eventlist(s);
+ }
- TMSG(PROCESS,"init");
+ hpcrun_set_abort_timeout();
+ hpcrun_process_sample_source_none();
- hpcrun_sample_prob_mesg();
+ TMSG(PROCESS,"hpcrun outer initialization");
- TMSG(PROCESS, "I am a %s process", is_child ? "child" : "parent");
+ hpcrun_sample_prob_mesg();
- hpcrun_init_internal(is_child);
+ TMSG(PROCESS, "I am a %s process", is_child ? "child" : "parent");
- if (ENABLED(TST)){
- EEMSG("TST debug ctl is active!");
- STDERR_MSG("Std Err message appears");
- }
+ hpcrun_init_internal(is_child);
+ if (ENABLED(TST)){
+ EEMSG("TST debug ctl is active!");
+ STDERR_MSG("Std Err message appears");
+ }
- hpcrun_safe_exit();
+ hpcrun_safe_exit();
- return data;
+ atomic_store(&ms_init_completed, 1);
+ // the store above releases every caller spinning in the else branch below
+ }else{
+ while(! atomic_load(&ms_init_completed)); // NOTE(review): spins forever if the initializing thread re-enters this function before completion -- confirm that cannot happen
+ }
+
+}
@@ -1166,6 +1196,7 @@ monitor_init_thread_support(void)
hpcrun_safe_exit();
}
+
void*
monitor_thread_pre_create(void)
{
@@ -1182,7 +1213,11 @@ monitor_thread_pre_create(void)
if (module_ignore_map_inrange_lookup(thread_pre_create_address)) {
return MONITOR_IGNORE_NEW_THREAD;
}
-
+ bool is_child = false;
+ // outer initialization
+ hpcrun_prepare_measurement_subsystem(is_child);
+
+
hpcrun_safe_enter();
local_thread_data_t* rv = hpcrun_malloc(sizeof(local_thread_data_t));
diff --git a/src/tool/hpcrun/memory/mem.c b/src/tool/hpcrun/memory/mem.c
index e996453277..7af6897a3c 100644
--- a/src/tool/hpcrun/memory/mem.c
+++ b/src/tool/hpcrun/memory/mem.c
@@ -94,6 +94,15 @@ static long total_non_freeable = 0;
static int out_of_mem_mesg = 0;
+
+// ---------------------------------------------------
+// hpcrun_malloc() memory thread local data structures
+// ---------------------------------------------------
+__thread hpcrun_meminfo_t memstore;
+__thread int mem_low;
+
+
+
//------------------------------------------------------------------
// Internal functions
//------------------------------------------------------------------
@@ -222,22 +231,12 @@ hpcrun_memory_reinit(void)
// Allocate space and init a thread's memstore.
// If failure, shutdown sampling and leave old memstore in place.
void
-hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child)
+hpcrun_make_memstore(hpcrun_meminfo_t *mi)
{
void *addr;
hpcrun_mem_init();
- // If in the child after fork(), then continue to use the parent's
- // memstore if it looks ok, else mmap a new one. Note: we can't
- // reset the memstore to empty unless we delete everything that was
- // created via hpcrun_malloc() (cct, uw_recipe_map, ...).
- if (is_child && mi->mi_start != NULL
- && mi->mi_start <= mi->mi_low && mi->mi_low <= mi->mi_high
- && mi->mi_high <= mi->mi_start + mi->mi_size) {
- return;
- }
-
addr = hpcrun_mmap_anon(memsize);
if (addr == NULL) {
if (! out_of_mem_mesg) {
@@ -260,10 +259,10 @@ hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child)
void
hpcrun_reclaim_freeable_mem(void)
{
- hpcrun_meminfo_t *mi = &TD_GET(memstore);
+ hpcrun_meminfo_t *mi = &memstore;
mi->mi_low = mi->mi_start;
- TD_GET(mem_low) = 0;
+ mem_low = 0;
num_reclaims++;
TMSG(MALLOC, "%s: %d", __func__, num_reclaims);
}
@@ -283,7 +282,7 @@ hpcrun_malloc(size_t size)
return NULL;
}
- mi = &TD_GET(memstore);
+ mi = &memstore;
size = round_up(size);
// For a large request that doesn't fit within the existing
@@ -310,7 +309,7 @@ hpcrun_malloc(size_t size)
|| mi->mi_high - mi->mi_low < low_memsize
|| mi->mi_high - mi->mi_low < size) {
if (allow_extra_mmap) {
- hpcrun_make_memstore(mi, 0);
+ hpcrun_make_memstore(mi);
} else {
if (! out_of_mem_mesg) {
EMSG("%s: out of memory, shutting down sampling", __func__);
@@ -412,3 +411,11 @@ hpcrun_memory_summary(void)
"malloc failures: %ld",
total_freeable/meg, total_non_freeable/meg, num_failures);
}
+// Accessor: returns the calling thread's 'mem_low' value (mem_low is declared __thread in this file).
+int
+get_mem_low(
+ void
+)
+{
+ return mem_low;
+}
diff --git a/src/tool/hpcrun/memory/newmem.h b/src/tool/hpcrun/memory/newmem.h
index 7fb1ed1211..90695c7322 100644
--- a/src/tool/hpcrun/memory/newmem.h
+++ b/src/tool/hpcrun/memory/newmem.h
@@ -63,6 +63,7 @@ struct hpcrun_meminfo {
typedef struct hpcrun_meminfo hpcrun_meminfo_t;
-void hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child);
+void hpcrun_make_memstore(hpcrun_meminfo_t *mi);
+int get_mem_low(void);
#endif
diff --git a/src/tool/hpcrun/messages/debug-flag.c b/src/tool/hpcrun/messages/debug-flag.c
index 2e9c60a15f..66397b2dae 100644
--- a/src/tool/hpcrun/messages/debug-flag.c
+++ b/src/tool/hpcrun/messages/debug-flag.c
@@ -159,7 +159,7 @@ static pmsg_category all_list_entries [] = {
// E(CSP_MALLOC),
// E(MEM__ALLOC),
E(NORM_IP),
- E(PARTIAL_UNW)
+ E(PARTIAL_UNW)
};
diff --git a/src/tool/hpcrun/messages/messages.flag-defns b/src/tool/hpcrun/messages/messages.flag-defns
index 8379ee5f71..8b2d321418 100644
--- a/src/tool/hpcrun/messages/messages.flag-defns
+++ b/src/tool/hpcrun/messages/messages.flag-defns
@@ -156,6 +156,7 @@
E(CUPTI_TRACE),
E(CUDA_CUBIN),
E(CUPTI_ACTIVITY),
+ E(ROCM),
E(DATACENTRIC),
E(IDLE),
E(MAIN_BOUNDS),
diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c
index d9aa9d7a0a..d6388b58a7 100644
--- a/src/tool/hpcrun/metrics.c
+++ b/src/tool/hpcrun/metrics.c
@@ -366,6 +366,7 @@ hpcrun_set_new_metric_desc(kind_info_t *kind, const char* name,
metric_desc_list_t* n = NULL;
// if there are pre-allocated metrics, use them
+ // (default metrics - not alloc, added metrics - prealloc)
if (pre_alloc) {
n = pre_alloc;
pre_alloc = pre_alloc->next;
@@ -373,6 +374,7 @@ hpcrun_set_new_metric_desc(kind_info_t *kind, const char* name,
else {
n = (metric_desc_list_t*) hpcrun_malloc(sizeof(metric_desc_list_t));
}
+ // Add n into the list of metric description - kind->metric_data
n->next = kind->metric_data;
kind->metric_data = n;
n->proc = upd_fn;
diff --git a/src/tool/hpcrun/module-ignore-map.c b/src/tool/hpcrun/module-ignore-map.c
index 56b9db6e7c..98a60630aa 100644
--- a/src/tool/hpcrun/module-ignore-map.c
+++ b/src/tool/hpcrun/module-ignore-map.c
@@ -108,7 +108,7 @@
// where any GPU can indicate that its functions should be added to
// the module ignore map when that type of GPU is being monitored.
-#define NUM_FNS 8
+#define NUM_FNS 9
@@ -127,16 +127,19 @@ typedef struct module_ignore_entry {
// static data
//***************************************************************************
-static const char *IGNORE_FNS[NUM_FNS] = {
+
+static const char *IGNORE_FNS[] = {
"cuLaunchKernel",
"cudaLaunchKernel",
"cuptiActivityEnable",
+ "rocprofiler_iterate_info",
"roctracer_set_properties", // amd roctracer library
"amd_dbgapi_initialize", // amd debug library
"hipKernelNameRefByPtr", // amd hip runtime
- "hsa_queue_create", // amd hsa runtime
+ "hsa_init", // amd hsa runtime
"hpcrun_malloc" // hpcrun library
};
+
static module_ignore_entry_t modules[NUM_FNS];
static pfq_rwlock_t modules_lock;
@@ -250,7 +253,7 @@ module_ignore_map_lookup
}
int
-serach_functions_in_module(Elf *e, GElf_Shdr* secHead, Elf_Scn *section)
+search_functions_in_module(Elf *e, GElf_Shdr* secHead, Elf_Scn *section)
{
Elf_Data *data;
char *symName;
@@ -287,6 +290,8 @@ module_ignore_map_ignore
load_module_t* lm
)
{
+ if (lm == NULL) return false;
+
// Update path
// Only one thread could update the flag,
// Guarantee dlopen modules before notification are updated.
@@ -332,7 +337,7 @@ module_ignore_map_ignore
gelf_getshdr(scn, &secHead);
// Only search .dynsym section
if (secHead.sh_type != SHT_DYNSYM) continue;
- int module_ignore_index = serach_functions_in_module(elf, &secHead, scn);
+ int module_ignore_index = search_functions_in_module(elf, &secHead, scn);
if (module_ignore_index != -1) {
modules[module_ignore_index].module = module;
modules[module_ignore_index].empty = false;
diff --git a/src/tool/hpcrun/ompt/omp-tools.h b/src/tool/hpcrun/ompt/omp-tools.h
index 43788206d2..ffa406ab86 100644
--- a/src/tool/hpcrun/ompt/omp-tools.h
+++ b/src/tool/hpcrun/ompt/omp-tools.h
@@ -1,5 +1,5 @@
/*
- * include/50/omp-tools.h.var
+ * include/omp-tools.h.var
*/
//===----------------------------------------------------------------------===//
@@ -20,6 +20,16 @@
#include
#include
+#ifdef DEPRECATION_WARNINGS /* DEPRECATED_51 expands to a deprecation attribute only when DEPRECATION_WARNINGS is defined */
+# ifdef __cplusplus
+# define DEPRECATED_51 [[deprecated("as of 5.1")]]
+# else
+# define DEPRECATED_51 __attribute__((deprecated("as of 5.1")))
+#endif /* __cplusplus */
+#else /* !DEPRECATION_WARNINGS: DEPRECATED_51 expands to nothing */
+#define DEPRECATED_51
+#endif /* DEPRECATION_WARNINGS */
+
/*****************************************************************************
* iteration macros
*****************************************************************************/
@@ -133,7 +143,7 @@
\
macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \
\
- macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \
+ macro (ompt_callback_masked, ompt_callback_masked_t, 21) /* task at masked begin or end */ \
\
macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \
\
@@ -153,7 +163,26 @@
\
macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \
\
- macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */
+ macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ \
+ macro (ompt_callback_target_emi, ompt_callback_target_emi_t, 33) /* target */ \
+ macro (ompt_callback_target_data_op_emi,ompt_callback_target_data_op_emi_t,34) /* target data op */ \
+ macro (ompt_callback_target_submit_emi, ompt_callback_target_submit_emi_t, 35) /* target submit */ \
+ macro (ompt_callback_target_map_emi, ompt_callback_target_map_emi_t, 36) /* target map */ \
+ macro (ompt_callback_error, ompt_callback_error_t, 37) /* error */
+
+/*
+ * Iteration macro over the device/target related callbacks.
+ * Unlike the event list above, the supplied macro is invoked with a single
+ * argument (the callback name only); both the original and the _emi
+ * variants of the target callbacks are listed.
+ */
+#define FOREACH_OMPT_TARGET_CALLBACK(macro) \
+ macro(ompt_callback_device_initialize) \
+ macro(ompt_callback_device_finalize) \
+ macro(ompt_callback_device_load) \
+ macro(ompt_callback_device_unload) \
+ macro(ompt_callback_target) \
+ macro(ompt_callback_target_map) \
+ macro(ompt_callback_target_data_op) \
+ macro(ompt_callback_target_submit) \
+ macro(ompt_callback_target_data_op_emi) \
+ macro(ompt_callback_target_emi) \
+ macro(ompt_callback_target_map_emi) \
+ macro(ompt_callback_target_submit_emi)
/*****************************************************************************
* implementation specific types
@@ -190,7 +219,8 @@ typedef enum ompt_callbacks_t {
ompt_callback_dependences = 18,
ompt_callback_task_dependence = 19,
ompt_callback_work = 20,
- ompt_callback_master = 21,
+ ompt_callback_master DEPRECATED_51 = 21,
+ ompt_callback_masked = 21,
ompt_callback_target_map = 22,
ompt_callback_sync_region = 23,
ompt_callback_lock_init = 24,
@@ -201,7 +231,12 @@ typedef enum ompt_callbacks_t {
ompt_callback_flush = 29,
ompt_callback_cancel = 30,
ompt_callback_reduction = 31,
- ompt_callback_dispatch = 32
+ ompt_callback_dispatch = 32,
+ ompt_callback_target_emi = 33,
+ ompt_callback_target_data_op_emi = 34,
+ ompt_callback_target_submit_emi = 35,
+ ompt_callback_target_map_emi = 36,
+ ompt_callback_error = 37
} ompt_callbacks_t;
typedef enum ompt_record_t {
@@ -239,7 +274,8 @@ typedef enum ompt_thread_t {
typedef enum ompt_scope_endpoint_t {
ompt_scope_begin = 1,
- ompt_scope_end = 2
+ ompt_scope_end = 2,
+ ompt_scope_beginend = 3
} ompt_scope_endpoint_t;
typedef enum ompt_dispatch_t {
@@ -248,22 +284,29 @@ typedef enum ompt_dispatch_t {
} ompt_dispatch_t;
typedef enum ompt_sync_region_t {
- ompt_sync_region_barrier = 1,
- ompt_sync_region_barrier_implicit = 2,
+ ompt_sync_region_barrier DEPRECATED_51 = 1,
+ ompt_sync_region_barrier_implicit DEPRECATED_51 = 2,
ompt_sync_region_barrier_explicit = 3,
ompt_sync_region_barrier_implementation = 4,
ompt_sync_region_taskwait = 5,
ompt_sync_region_taskgroup = 6,
- ompt_sync_region_reduction = 7
+ ompt_sync_region_reduction = 7,
+ ompt_sync_region_barrier_implicit_workshare = 8,
+ ompt_sync_region_barrier_implicit_parallel = 9,
+ ompt_sync_region_barrier_teams = 10
} ompt_sync_region_t;
typedef enum ompt_target_data_op_t {
- ompt_target_data_alloc = 1,
- ompt_target_data_transfer_to_device = 2,
- ompt_target_data_transfer_from_device = 3,
- ompt_target_data_delete = 4,
- ompt_target_data_associate = 5,
- ompt_target_data_disassociate = 6
+ ompt_target_data_alloc = 1,
+ ompt_target_data_transfer_to_device = 2,
+ ompt_target_data_transfer_from_device = 3,
+ ompt_target_data_delete = 4,
+ ompt_target_data_associate = 5,
+ ompt_target_data_disassociate = 6,
+ ompt_target_data_alloc_async = 17,
+ ompt_target_data_transfer_to_device_async = 18,
+ ompt_target_data_transfer_from_device_async = 19,
+ ompt_target_data_delete_async = 20
} ompt_target_data_op_t;
typedef enum ompt_work_t {
@@ -273,7 +316,8 @@ typedef enum ompt_work_t {
ompt_work_single_other = 4,
ompt_work_workshare = 5,
ompt_work_distribute = 6,
- ompt_work_taskloop = 7
+ ompt_work_taskloop = 7,
+ ompt_work_scope = 8
} ompt_work_t;
typedef enum ompt_mutex_t {
@@ -302,6 +346,7 @@ typedef enum ompt_task_flag_t {
ompt_task_implicit = 0x00000002,
ompt_task_explicit = 0x00000004,
ompt_task_target = 0x00000008,
+ ompt_task_taskwait = 0x00000010,
ompt_task_undeferred = 0x08000000,
ompt_task_untied = 0x10000000,
ompt_task_final = 0x20000000,
@@ -316,14 +361,19 @@ typedef enum ompt_task_status_t {
ompt_task_detach = 4,
ompt_task_early_fulfill = 5,
ompt_task_late_fulfill = 6,
- ompt_task_switch = 7
+ ompt_task_switch = 7,
+ ompt_taskwait_complete = 8
} ompt_task_status_t;
typedef enum ompt_target_t {
ompt_target = 1,
ompt_target_enter_data = 2,
ompt_target_exit_data = 3,
- ompt_target_update = 4
+ ompt_target_update = 4,
+ ompt_target_nowait = 9,
+ ompt_target_enter_data_nowait = 10,
+ ompt_target_exit_data_nowait = 11,
+ ompt_target_update_nowait = 12
} ompt_target_t;
typedef enum ompt_parallel_flag_t {
@@ -348,9 +398,15 @@ typedef enum ompt_dependence_type_t {
ompt_dependence_type_inout = 3,
ompt_dependence_type_mutexinoutset = 4,
ompt_dependence_type_source = 5,
- ompt_dependence_type_sink = 6
+ ompt_dependence_type_sink = 6,
+ ompt_dependence_type_inoutset = 7
} ompt_dependence_type_t;
+/* Severity level carried by ompt_callback_error_t and ompt_record_error_t. */
+typedef enum ompt_severity_t {
+ ompt_warning = 1,
+ ompt_fatal = 2
+} ompt_severity_t;
+
typedef enum ompt_cancel_flag_t {
ompt_cancel_parallel = 0x01,
ompt_cancel_sections = 0x02,
@@ -371,18 +427,20 @@ typedef enum ompt_frame_flag_t {
ompt_frame_cfa = 0x10,
ompt_frame_framepointer = 0x20,
ompt_frame_stackaddress = 0x30
-} ompt_frame_flag_t;
+} ompt_frame_flag_t;
typedef enum ompt_state_t {
ompt_state_work_serial = 0x000,
ompt_state_work_parallel = 0x001,
ompt_state_work_reduction = 0x002,
- ompt_state_wait_barrier = 0x010,
+ ompt_state_wait_barrier DEPRECATED_51 = 0x010,
ompt_state_wait_barrier_implicit_parallel = 0x011,
ompt_state_wait_barrier_implicit_workshare = 0x012,
- ompt_state_wait_barrier_implicit = 0x013,
+ ompt_state_wait_barrier_implicit DEPRECATED_51 = 0x013,
ompt_state_wait_barrier_explicit = 0x014,
+ ompt_state_wait_barrier_implementation = 0x015,
+ ompt_state_wait_barrier_teams = 0x016,
ompt_state_wait_taskwait = 0x020,
ompt_state_wait_taskgroup = 0x021,
@@ -439,6 +497,8 @@ typedef enum ompd_rc_t {
ompd_rc_device_read_error = 8,
ompd_rc_device_write_error = 9,
ompd_rc_nomem = 10,
+ ompd_rc_incomplete = 11,
+ ompd_rc_callback_error = 12
} ompd_rc_t;
typedef void (*ompt_interface_fn_t) (void);
@@ -720,14 +780,14 @@ typedef void (*ompt_callback_dispatch_t) (
ompt_data_t *parallel_data,
ompt_data_t *task_data,
ompt_dispatch_t kind,
- ompt_data_t instance
+ ompt_data_t instance
);
typedef struct ompt_record_dispatch_t {
ompt_id_t parallel_id;
ompt_id_t task_id;
ompt_dispatch_t kind;
- ompt_data_t instance;
+ ompt_data_t instance;
} ompt_record_dispatch_t;
typedef void (*ompt_callback_task_create_t) (
@@ -799,19 +859,21 @@ typedef struct ompt_record_implicit_task_t {
int flags;
} ompt_record_implicit_task_t;
-typedef void (*ompt_callback_master_t) (
+typedef void (*ompt_callback_masked_t) (
ompt_scope_endpoint_t endpoint,
ompt_data_t *parallel_data,
ompt_data_t *task_data,
const void *codeptr_ra
);
-typedef struct ompt_record_master_t {
+typedef ompt_callback_masked_t ompt_callback_master_t DEPRECATED_51;
+
+typedef struct ompt_record_masked_t {
ompt_scope_endpoint_t endpoint;
ompt_id_t parallel_id;
ompt_id_t task_id;
const void *codeptr_ra;
-} ompt_record_master_t;
+} ompt_record_masked_t;
typedef void (*ompt_callback_sync_region_t) (
ompt_sync_region_t kind,
@@ -918,6 +980,20 @@ typedef void (*ompt_callback_device_unload_t) (
uint64_t module_id
);
+/*
+ * EMI form of the target data operation callback.
+ * Compared with ompt_callback_target_data_op_t, the target is identified
+ * through ompt_data_t pointers instead of an ompt_id_t, and host_op_id is
+ * passed by pointer so the callee can store an id into it.
+ * src/dest address and device number describe the two ends of the
+ * operation; bytes is its size.
+ * NOTE(review): exact endpoint semantics are defined by the OpenMP
+ * specification, not by this header -- confirm against the spec.
+ */
+typedef void (*ompt_callback_target_data_op_emi_t) (
+ ompt_scope_endpoint_t endpoint,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype,
+ void *src_addr,
+ int src_device_num,
+ void *dest_addr,
+ int dest_device_num,
+ size_t bytes,
+ const void *codeptr_ra
+);
+
typedef void (*ompt_callback_target_data_op_t) (
ompt_scope_endpoint_t endpoint,
ompt_id_t target_id,
@@ -943,6 +1019,16 @@ typedef struct ompt_record_target_data_op_t {
const void *codeptr_ra;
} ompt_record_target_data_op_t;
+/*
+ * EMI form of the target region callback.
+ * Takes the same kind/endpoint/device_num/task_data/codeptr_ra arguments as
+ * ompt_callback_target_t, but replaces the ompt_id_t target id with two
+ * ompt_data_t pointers (target_task_data, target_data).
+ */
+typedef void (*ompt_callback_target_emi_t) (
+ ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num,
+ ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra
+);
+
typedef void (*ompt_callback_target_t) (
ompt_target_t kind,
ompt_scope_endpoint_t endpoint,
@@ -961,6 +1047,16 @@ typedef struct ompt_record_target_t {
const void *codeptr_ra;
} ompt_record_target_t;
+/*
+ * EMI form of the target map callback; the target is identified by an
+ * ompt_data_t pointer rather than an ompt_id_t.
+ * NOTE(review): host_addr, device_addr, bytes and mapping_flags are
+ * presumably parallel arrays of nitems entries -- confirm against the
+ * OpenMP specification.
+ */
+typedef void (*ompt_callback_target_map_emi_t) (
+ ompt_data_t *target_data,
+ unsigned int nitems,
+ void **host_addr,
+ void **device_addr,
+ size_t *bytes,
+ unsigned int *mapping_flags,
+ const void *codeptr_ra
+);
+
typedef void (*ompt_callback_target_map_t) (
ompt_id_t target_id,
unsigned int nitems,
@@ -981,6 +1077,13 @@ typedef struct ompt_record_target_map_t {
const void *codeptr_ra;
} ompt_record_target_map_t;
+/*
+ * EMI form of the target submit callback.
+ * The target is identified by an ompt_data_t pointer and host_op_id is
+ * passed by pointer rather than by value.
+ */
+typedef void (*ompt_callback_target_submit_emi_t) (
+ ompt_scope_endpoint_t endpoint,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
+ unsigned int requested_num_teams
+);
+
typedef void (*ompt_callback_target_submit_t) (
ompt_scope_endpoint_t endpoint,
ompt_id_t target_id,
@@ -1008,6 +1111,19 @@ typedef struct ompt_record_control_tool_t {
const void *codeptr_ra;
} ompt_record_control_tool_t;
+/*
+ * Callback type registered for ompt_callback_error (event 37).
+ * message is passed together with an explicit length.
+ * NOTE(review): do not assume message is NUL-terminated -- confirm
+ * against the OpenMP specification.
+ */
+typedef void (*ompt_callback_error_t) (
+ ompt_severity_t severity,
+ const char *message, size_t length,
+ const void *codeptr_ra
+);
+
+/* Record counterpart of ompt_callback_error_t: one field per argument. */
+typedef struct ompt_record_error_t {
+ ompt_severity_t severity;
+ const char *message;
+ size_t length;
+ const void *codeptr_ra;
+} ompt_record_error_t;
+
typedef struct ompd_address_t {
ompd_seg_t segment;
ompd_addr_t address;
@@ -1035,6 +1151,198 @@ typedef struct ompd_device_type_sizes_t {
uint8_t sizeof_pointer;
} ompd_device_type_sizes_t;
+void ompd_dll_locations_valid(void);
+
+typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)(ompd_size_t nbytes,
+ void **ptr);
+
+typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)(void *ptr);
+
+typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)(
+ ompd_address_space_context_t *address_space_context, ompd_thread_id_t kind,
+ ompd_size_t sizeof_thread_id, const void *thread_id,
+ ompd_thread_context_t **thread_context);
+
+typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_device_type_sizes_t *sizes);
+
+typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_thread_context_t *thread_context, const char *symbol_name,
+ ompd_address_t *symbol_addr, const char *file_name);
+
+typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_thread_context_t *thread_context, const ompd_address_t *addr,
+ ompd_size_t nbytes, void *buffer);
+
+typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_thread_context_t *thread_context, const ompd_address_t *addr,
+ ompd_size_t nbytes, const void *buffer);
+
+typedef ompd_rc_t (*ompd_callback_device_host_fn_t)(
+ ompd_address_space_context_t *address_space_context, const void *input,
+ ompd_size_t unit_size, ompd_size_t count, void *output);
+
+typedef ompd_rc_t (*ompd_callback_print_string_fn_t)(const char *string,
+ int category);
+
+/*
+ * Table of callbacks handed to ompd_initialize().
+ * read_string shares its function type with read_memory, and
+ * device_to_host / host_to_device share ompd_callback_device_host_fn_t.
+ */
+typedef struct ompd_callbacks_t {
+ ompd_callback_memory_alloc_fn_t alloc_memory;
+ ompd_callback_memory_free_fn_t free_memory;
+ ompd_callback_print_string_fn_t print_string;
+ ompd_callback_sizeof_fn_t sizeof_type;
+ ompd_callback_symbol_addr_fn_t symbol_addr_lookup;
+ ompd_callback_memory_read_fn_t read_memory;
+ ompd_callback_memory_write_fn_t write_memory;
+ ompd_callback_memory_read_fn_t read_string;
+ ompd_callback_device_host_fn_t device_to_host;
+ ompd_callback_device_host_fn_t host_to_device;
+ ompd_callback_get_thread_context_for_thread_id_fn_t
+ get_thread_context_for_thread_id;
+} ompd_callbacks_t;
+
+void ompd_bp_parallel_begin(void);
+
+void ompd_bp_parallel_end(void);
+
+void ompd_bp_task_begin(void);
+
+void ompd_bp_task_end(void);
+
+void ompd_bp_thread_begin(void);
+
+void ompd_bp_thread_end(void);
+
+void ompd_bp_device_begin(void);
+
+void ompd_bp_device_end(void);
+
+ompd_rc_t ompd_initialize(ompd_word_t api_version,
+ const ompd_callbacks_t *callbacks);
+
+ompd_rc_t ompd_get_api_version(ompd_word_t *version);
+
+ompd_rc_t ompd_get_version_string(const char **string);
+
+ompd_rc_t ompd_finalize(void);
+
+ompd_rc_t ompd_process_initialize(ompd_address_space_context_t *context,
+ ompd_address_space_handle_t **handle);
+
+ompd_rc_t ompd_device_initialize(ompd_address_space_handle_t *process_handle,
+ ompd_address_space_context_t *device_context,
+ ompd_device_t kind, ompd_size_t sizeof_id,
+ void *id,
+ ompd_address_space_handle_t **device_handle);
+
+ompd_rc_t ompd_rel_address_space_handle(ompd_address_space_handle_t *handle);
+
+ompd_rc_t ompd_get_omp_version(ompd_address_space_handle_t *address_space,
+ ompd_word_t *omp_version);
+
+ompd_rc_t
+ompd_get_omp_version_string(ompd_address_space_handle_t *address_space,
+ const char **string);
+
+ompd_rc_t ompd_get_thread_in_parallel(ompd_parallel_handle_t *parallel_handle,
+ int thread_num,
+ ompd_thread_handle_t **thread_handle);
+
+ompd_rc_t ompd_get_thread_handle(ompd_address_space_handle_t *handle,
+ ompd_thread_id_t kind,
+ ompd_size_t sizeof_thread_id,
+ const void *thread_id,
+ ompd_thread_handle_t **thread_handle);
+
+ompd_rc_t ompd_rel_thread_handle(ompd_thread_handle_t *thread_handle);
+
+ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1,
+ ompd_thread_handle_t *thread_handle_2,
+ int *cmp_value);
+
+ompd_rc_t ompd_get_thread_id(ompd_thread_handle_t *thread_handle,
+ ompd_thread_id_t kind,
+ ompd_size_t sizeof_thread_id, void *thread_id);
+
+ompd_rc_t
+ompd_get_curr_parallel_handle(ompd_thread_handle_t *thread_handle,
+ ompd_parallel_handle_t **parallel_handle);
+
+ompd_rc_t ompd_get_enclosing_parallel_handle(
+ ompd_parallel_handle_t *parallel_handle,
+ ompd_parallel_handle_t **enclosing_parallel_handle);
+
+ompd_rc_t
+ompd_get_task_parallel_handle(ompd_task_handle_t *task_handle,
+ ompd_parallel_handle_t **task_parallel_handle);
+
+ompd_rc_t ompd_rel_parallel_handle(ompd_parallel_handle_t *parallel_handle);
+
+ompd_rc_t
+ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1,
+ ompd_parallel_handle_t *parallel_handle_2,
+ int *cmp_value);
+
+ompd_rc_t ompd_get_curr_task_handle(ompd_thread_handle_t *thread_handle,
+ ompd_task_handle_t **task_handle);
+
+ompd_rc_t
+ompd_get_generating_task_handle(ompd_task_handle_t *task_handle,
+ ompd_task_handle_t **generating_task_handle);
+
+ompd_rc_t
+ompd_get_scheduling_task_handle(ompd_task_handle_t *task_handle,
+ ompd_task_handle_t **scheduling_task_handle);
+
+ompd_rc_t ompd_get_task_in_parallel(ompd_parallel_handle_t *parallel_handle,
+ int thread_num,
+ ompd_task_handle_t **task_handle);
+
+ompd_rc_t ompd_rel_task_handle(ompd_task_handle_t *task_handle);
+
+ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1,
+ ompd_task_handle_t *task_handle_2,
+ int *cmp_value);
+
+ompd_rc_t ompd_get_task_function(ompd_task_handle_t *task_handle,
+ ompd_address_t *entry_point);
+
+ompd_rc_t ompd_get_task_frame(ompd_task_handle_t *task_handle,
+ ompd_frame_info_t *exit_frame,
+ ompd_frame_info_t *enter_frame);
+
+ompd_rc_t
+ompd_enumerate_states(ompd_address_space_handle_t *address_space_handle,
+ ompd_word_t current_state, ompd_word_t *next_state,
+ const char **next_state_name, ompd_word_t *more_enums);
+
+ompd_rc_t ompd_get_state(ompd_thread_handle_t *thread_handle,
+ ompd_word_t *state, ompt_wait_id_t *wait_id);
+
+ompd_rc_t
+ompd_get_display_control_vars(ompd_address_space_handle_t *address_space_handle,
+ const char *const **control_vars);
+
+ompd_rc_t ompd_rel_display_control_vars(const char *const **control_vars);
+
+ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle,
+ ompd_icv_id_t current, ompd_icv_id_t *next_id,
+ const char **next_icv_name,
+ ompd_scope_t *next_scope, int *more);
+
+ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope,
+ ompd_icv_id_t icv_id, ompd_word_t *icv_value);
+
+ompd_rc_t ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope,
+ ompd_icv_id_t icv_id,
+ const char **icv_string);
+
+ompd_rc_t ompd_get_tool_data(void *handle, ompd_scope_t scope,
+ ompd_word_t *value, ompd_address_t *ptr);
+
typedef struct ompt_record_ompt_t {
ompt_callbacks_t type;
ompt_device_time_t time;
@@ -1051,7 +1359,7 @@ typedef struct ompt_record_ompt_t {
ompt_record_task_dependence_t task_dependence;
ompt_record_task_schedule_t task_schedule;
ompt_record_implicit_task_t implicit_task;
- ompt_record_master_t master;
+ ompt_record_masked_t masked;
ompt_record_sync_region_t sync_region;
ompt_record_mutex_acquire_t mutex_acquire;
ompt_record_mutex_t mutex;
diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c
index ff134e5eaa..b27b968f71 100644
--- a/src/tool/hpcrun/ompt/ompt-device.c
+++ b/src/tool/hpcrun/ompt/ompt-device.c
@@ -2,7 +2,7 @@
// * BeginRiceCopyright *****************************************************
//
-// $HeadURL$
+// $HeadURL$
// $Id$
//
// --------------------------------------------------------------------------
@@ -45,9 +45,6 @@
// ******************************************************* EndRiceCopyright *
-#include "ompt-device.h"
-
-#if HAVE_CUPTI_H
/******************************************************************************
* global include files
@@ -71,14 +68,19 @@
#include "ompt-interface.h"
#include "ompt-device-map.h"
+#include "ompt-device.h"
#include "gpu/gpu-op-placeholders.h"
+#include "gpu/gpu-application-thread-api.h"
#include "gpu/gpu-correlation-channel.h"
#include "gpu/gpu-correlation-channel-set.h"
+#include "gpu/gpu-correlation-id.h"
+#include "gpu/gpu-metrics.h"
#include "gpu/gpu-monitoring.h"
+#include "gpu/gpu-monitoring-thread-api.h"
+#include "gpu/gpu-trace.h"
-#include "gpu/nvidia/cupti-api.h"
-#include "sample-sources/nvidia.h"
+#include "gpu/ompt/ompt-gpu-api.h"
@@ -86,6 +88,15 @@
// macros
//*****************************************************************************
+// X-macro table mapping OMPT target data operations to hpcrun placeholders.
+// Each row is (variable, ompt_target_data_op_t value, placeholder suffix);
+// ompt_data_op_callback_emi expands each row into a switch case that sets
+// the variable to hpcrun_placeholder_<suffix>.
+#define FOREACH_OMPT_DATA_OP(macro) \
+ macro(op, ompt_target_data_alloc, ompt_tgt_alloc) \
+ macro(op, ompt_target_data_delete, ompt_tgt_delete) \
+ macro(op, ompt_target_data_transfer_to_device, ompt_tgt_copyin) \
+ macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout)
+
+// with OMPT support turned on, callpath pruning should not be necessary
+#define PRUNE_CALLPATH 0
+
#define OMPT_ACTIVITY_DEBUG 0
#if OMPT_ACTIVITY_DEBUG
@@ -100,50 +111,79 @@
typedef return_type (*OMPT_API_FNTYPE(fn)) args
#define OMPT_TARGET_API_FUNCTION(return_type, fn, args) \
- OMPT_API_FUNCTION(return_type, fn, args)
+ OMPT_API_FUNCTION(return_type, fn, args)
#define FOREACH_OMPT_TARGET_FN(macro) \
macro(ompt_get_device_time) \
macro(ompt_translate_time) \
- macro(ompt_set_trace_native) \
+ macro(ompt_set_trace_ompt) \
macro(ompt_start_trace) \
macro(ompt_pause_trace) \
macro(ompt_stop_trace) \
+ macro(ompt_flush_trace) \
macro(ompt_get_record_type) \
- macro(ompt_get_record_native) \
+ macro(ompt_get_record_ompt) \
macro(ompt_get_record_abstract) \
- macro(ompt_advance_buffer_cursor) \
- macro(ompt_set_pc_sampling) \
- macro(ompt_set_external_subscriber)
+ macro(ompt_advance_buffer_cursor)
//*****************************************************************************
-// types
+// type declarations
//*****************************************************************************
-OMPT_TARGET_API_FUNCTION(void, ompt_set_external_subscriber,
-(
- int enable
-));
+// Node of the singly linked list (device_list) that associates a device
+// number with the OMPT device handle received at device initialization.
+typedef struct ompt_device_entry_t {
+ int device_id;
+ ompt_device_t *device;
+ struct ompt_device_entry_t *next;
+} ompt_device_entry_t;
-OMPT_TARGET_API_FUNCTION(void, ompt_set_pc_sampling,
-(
- ompt_device_t *device,
- int enable,
- int pc_sampling_frequency
-));
+
+//*****************************************************************************
+// forward declarations
+//*****************************************************************************
+
+static void ompt_dump(ompt_record_ompt_t *r) __attribute__((unused));
+
//*****************************************************************************
// static variables
//*****************************************************************************
-static bool ompt_pc_sampling_enabled = false;
+static device_finalizer_fn_entry_t device_finalizer_flush;
+static device_finalizer_fn_entry_t device_finalizer_trace;
+static device_finalizer_fn_entry_t device_finalizer_shutdown;
-static device_finalizer_fn_entry_t device_finalizer;
+static int ompt_shutdown_complete = 0;
+static ompt_device_entry_t *device_list = 0;
+
+static __thread bool ompt_need_flush = false;
+
+
+
+//*****************************************************************************
+// private operations
+//*****************************************************************************
+
+// Record a (device_id, device) pair at the head of device_list so that
+// ompt_get_device can later map the id back to its device handle.
+//
+// device_id: device number passed to ompt_device_initialize
+// device:    OMPT device handle associated with device_id
+//
+// If the list node cannot be allocated the device is left unregistered
+// (and the failure is reported via PRINT) instead of dereferencing NULL.
+static void
+device_list_insert
+(
+ int device_id,
+ ompt_device_t *device
+)
+{
+  // FIXME: replace with splay-uint64
+  ompt_device_entry_t *e = (ompt_device_entry_t *)
+    malloc(sizeof(ompt_device_entry_t));
+  if (e == NULL) {
+    PRINT("device_list_insert allocation failed id=%d device=%p\n",
+          device_id, device);
+    return;
+  }
+  e->device_id = device_id;
+  e->device = device;
+  // push on the front; a lookup returns the most recent entry for an id
+  e->next = device_list;
+  device_list = e;
+  PRINT("device_list_insert id=%d device=%p\n", device_id, device);
+}
//------------------------------------------------
// declare function pointers for target functions
@@ -180,7 +220,12 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint,
// Enter a ompt runtime api
PRINT("enter ompt runtime op %lu\n", host_op_id);
ompt_runtime_api_flag = true;
- cupti_correlation_id_push(host_op_id);
+
+ gpu_application_thread_process_activities();
+
+#if 0
+ ompt_correlation_id_push(host_op_id);
+#endif
gpu_op_ccts_t gpu_op_ccts;
memset(&gpu_op_ccts, 0, sizeof(gpu_op_ccts_t));
@@ -200,13 +245,16 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint,
// Inform the worker about the placeholders
uint64_t cpu_submit_time = hpcrun_nanotime();
+ PRINT("producing correlation %lu\n", host_op_id);
gpu_correlation_channel_produce(host_op_id, &gpu_op_ccts, cpu_submit_time);
} else {
PRINT("exit ompt runtime op %lu\n", host_op_id);
// Enter a runtime api
ompt_runtime_api_flag = false;
+#if 0
// Pop the id and make a notification
- cupti_correlation_id_pop();
+ ompt_correlation_id_pop();
+#endif
// Clear kernel status
trace_node = NULL;
}
@@ -215,11 +263,12 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint,
}
-void
+void
ompt_bind_names(ompt_function_lookup_t lookup)
{
#define ompt_bind_name(fn) \
- fn = (fn ## _t ) lookup(#fn);
+ fn = (fn ## _t ) lookup(#fn); \
+ PRINT("look up function %s, got %p\n", #fn, fn);
FOREACH_OMPT_TARGET_FN(ompt_bind_name)
@@ -229,8 +278,8 @@ ompt_bind_names(ompt_function_lookup_t lookup)
#define BUFFER_SIZE (1024 * 1024 * 8)
-void
-ompt_callback_buffer_request
+static void
+ompt_buffer_request
(
int device_id,
ompt_buffer_t **buffer,
@@ -243,100 +292,228 @@ ompt_callback_buffer_request
}
-void
-ompt_callback_buffer_complete
+static void
+ompt_buffer_release
(
- int device_id,
- ompt_buffer_t *buffer,
- size_t bytes,
- ompt_buffer_cursor_t begin,
- int buffer_owned
+ ompt_buffer_t *buffer
)
{
- // handle notifications
- gpu_correlation_channel_set_consume();
-
- // signal advance to return pointer to first record
- ompt_buffer_cursor_t next = begin;
- int status = 0;
- do {
- // TODO(keren): replace cupti_activity_handle with device_activity handle
- CUpti_Activity *activity = (CUpti_Activity *)next;
- cupti_activity_process(activity);
- status = cupti_buffer_cursor_advance(buffer, bytes, (CUpti_Activity **)&next);
- } while(status);
+ free(buffer);
}
-void
-ompt_pc_sampling_enable()
+static void
+ompt_dump
+(
+ ompt_record_ompt_t *r
+)
{
- ompt_pc_sampling_enabled = true;
+ if (r) {
+ printf("r=%p type=%d time=%lu thread_id=%lu target_id=0x%lx\n",
+ r, r->type, r->time, r->thread_id, r->target_id);
+
+ switch (r->type) {
+ case ompt_callback_target:
+ // case ompt_callback_target_emi:
+ {
+ ompt_record_target_t target_rec = r->record.target;
+ printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=0x%lx codeptr=%p\n",
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.target_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ // case ompt_callback_target_data_op_emi:
+ {
+ ompt_record_target_data_op_t target_data_op_rec =
+ r->record.target_data_op;
+ printf("\tTarget data op: host_op_id=%lu optype=%d src_addr=%p "
+ "src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%luus codeptr=%p\n",
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - r->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ // case ompt_callback_target_submit_emi:
+ {
+ ompt_record_target_kernel_t target_kernel_rec = r->record.target_kernel;
+ printf("\tTarget kernel: host_op_id=%lu requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%luus\n",
+ target_kernel_rec.host_op_id,
+ target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - r->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+ }
}
-void
-ompt_pc_sampling_disable()
+static ompt_device_t *
+ompt_get_device
+(
+ int device_id
+)
{
- ompt_pc_sampling_enabled = false;
+ ompt_device_entry_t *e = device_list;
+ while (e) {
+ if (e->device_id == device_id) return e->device;
+ e = e->next;
+ }
+ return 0;
}
-void
-ompt_trace_configure(ompt_device_t *device)
+static void
+ompt_finalize_flush
+(
+ void *arg,
+ int how
+)
{
- int flags = 0;
+ PRINT("ompt_finalize_flush enter\n");
+
+ ompt_device_entry_t *e = device_list;
+ while (e) {
+ PRINT("ompt_finalize_flush flush id=%d device=%p\n",
+ e->device_id, e->device);
+ if (ompt_need_flush) ompt_flush_trace(e->device);
+ e = e->next;
+ }
- // specify desired monitoring
- flags |= ompt_native_driver;
+ gpu_application_thread_process_activities();
- flags |= ompt_native_runtime;
+ PRINT("ompt_finalize_flush exit\n");
+}
- flags |= ompt_native_kernel_invocation;
- flags |= ompt_native_kernel_execution;
+// Shutdown finalizer: stop tracing on every device in device_list, then set
+// ompt_shutdown_complete so that ompt_buffer_complete no longer processes
+// records from buffers delivered afterwards, and finally let the
+// application thread process any activities that are still pending.
+//
+// arg, how: unused here.
+// NOTE(review): the (void *, int) signature presumably matches the device
+// finalizer callback type -- confirm at the registration site.
+static void
+ompt_finalize_shutdown
+(
+ void *arg,
+ int how
+)
+{
+  PRINT("ompt_finalize_shutdown enter\n");
+
+  ompt_device_entry_t *e = device_list;
+  while (e) {
+    PRINT("ompt_finalize_shutdown stop id=%d device=%p\n",
+          e->device_id, e->device);
+    ompt_stop_trace(e->device);
+    e = e->next;
+  }
+  // set only after all traces are stopped, so buffers delivered while
+  // stopping are still processed
+  ompt_shutdown_complete = 1;
+  gpu_application_thread_process_activities();
+  PRINT("ompt_finalize_shutdown exit\n");
+}
- flags |= ompt_native_data_motion_explicit;
- // indicate desired monitoring
- ompt_set_trace_native(device, 1, flags);
-
- // set pc sampling after other traces
- if (ompt_pc_sampling_enabled) {
- int freq_bits = gpu_monitoring_instruction_sample_frequency_get();
- ompt_set_pc_sampling(device, true, freq_bits);
+// Trace finalizer: forwards both arguments unchanged to gpu_trace_fini,
+// bracketed by debug output.
+static void
+ompt_finalize_trace
+(
+ void *arg,
+ int how
+)
+{
+ PRINT("ompt_finalize_trace enter\n");
+ gpu_trace_fini(arg, how);
+ PRINT("ompt_finalize_trace exit\n");
+}
+
+
+
+// Buffer-completion callback handed to ompt_start_trace.
+//
+// While shutdown has not completed, walks the records in the buffer starting
+// at cursor `begin`, passing each one to ompt_activity_process. The buffer is
+// released when buffer_owned is non-zero, whether or not records were
+// processed.
+//
+// device_id:    device number the buffer belongs to
+// buffer:       trace buffer being returned
+// bytes:        passed through to ompt_advance_buffer_cursor
+// begin:        cursor of the first record
+// buffer_owned: non-zero when this callback must free the buffer
+static void
+ompt_buffer_complete
+(
+ int device_id,
+ ompt_buffer_t *buffer,
+ size_t bytes,
+ ompt_buffer_cursor_t begin,
+ int buffer_owned
+)
+{
+  PRINT("ompt_callback_buffer_complete enter device=%d\n", device_id);
+  if (ompt_shutdown_complete == 0) {
+
+    gpu_monitoring_thread_activities_ready();
+
+    // NOTE(review): ompt_get_device returns 0 for an unknown id, and
+    // ompt_device_initialize starts tracing before it registers the device;
+    // confirm a buffer cannot complete in that window.
+    ompt_device_t *device = ompt_get_device(device_id);
+
+    // signal advance to return pointer to first record
+    ompt_buffer_cursor_t current = begin;
+    int status = 1;
+    while (status) {
+      // extract the next record from the buffer
+      ompt_record_ompt_t *record = ompt_get_record_ompt(buffer, current);
+
+      // a buffer may be empty, so the first record may be NULL
+      if (record == NULL) break;
+
+      // process the record
+      ompt_activity_process(record);
+
+      // advance the cursor to the next record
+      // status will be 0 if there is no next record
+      status = ompt_advance_buffer_cursor(device, buffer, bytes, current,
+                                          &current);
+    }
   }
+  if (buffer_owned) ompt_buffer_release(buffer);
+
+  PRINT("ompt_callback_buffer_complete exit device=%d\n", device_id);
+}
+
+
+void
+ompt_trace_configure(ompt_device_t *device)
+{
+ // indicate desired monitoring
+ ompt_set_trace_ompt(device, 1, 0);
+
// turn on monitoring previously indicated
- ompt_start_trace(device, ompt_callback_buffer_request, ompt_callback_buffer_complete);
+ ompt_start_trace(device, ompt_buffer_request,
+ ompt_buffer_complete);
}
void
-ompt_device_initialize(uint64_t device_num,
+ompt_device_initialize(int device_num,
const char *type,
ompt_device_t *device,
ompt_function_lookup_t lookup,
const char *documentation)
{
- PRINT("ompt_device_initialize->%s, %" PRIu64 "\n", type, device_num);
+ PRINT("ompt_device_initialize->%s, %d\n", type, device_num);
ompt_bind_names(lookup);
- //ompt_trace_configure(device);
+ ompt_trace_configure(device);
+ device_list_insert(device_num, device);
ompt_device_map_insert(device_num, device, type);
}
-void
-ompt_device_finalize(uint64_t device_num)
+void
+ompt_device_finalize(int device_num)
{
+ PRINT("ompt_device_finalize id=%d\n", device_num);
}
-void
-ompt_device_load(uint64_t device_num,
+void
+ompt_device_load(int device_num,
const char *filename,
int64_t file_offset,
const void *file_addr,
@@ -345,49 +522,58 @@ ompt_device_load(uint64_t device_num,
const void *device_addr,
uint64_t module_id)
{
- PRINT("ompt_device_load->%s, %" PRIu64 "\n", filename, device_num);
+ PRINT("ompt_device_load->%s, %d\n", filename, device_num);
+
+#if 0 // FIXME
cupti_load_callback_cuda(module_id, host_addr, bytes);
+#endif
}
-void
-ompt_device_unload(uint64_t device_num,
+void
+ompt_device_unload(int device_num,
uint64_t module_id)
{
//cubin_id_map_delete(module_id);
}
-static int
+#if PRUNE_CALLPATH
+static int
get_load_module
(
cct_node_t *node
)
{
- cct_addr_t *addr = hpcrun_cct_addr(target_node);
+ cct_addr_t *addr = hpcrun_cct_addr(target_node);
ip_normalized_t ip = addr->ip_norm;
return ip.lm_id;
}
+#endif
-void
-ompt_target_callback
+void
+ompt_target_callback_emi
(
ompt_target_t kind,
ompt_scope_endpoint_t endpoint,
- uint64_t device_num,
+ int device_num,
ompt_data_t *task_data,
- ompt_id_t target_id,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
const void *codeptr_ra
)
{
- PRINT("ompt_target_callback->target_id %" PRIu64 "\n", target_id);
-
if (endpoint == ompt_scope_end) {
target_node = NULL;
return;
}
+ ompt_need_flush = true;
+
+ target_data->value = gpu_correlation_id();
+ PRINT("ompt_target_callback->target_id 0x%lx\n", target_data->value);
+
// XXX(Keren): Do not use openmp callbacks to consume and produce records
// HPCToolkit always subscribes its own cupti callback
//
@@ -406,54 +592,58 @@ ompt_target_callback
td->overhead++;
// NOTE(keren): hpcrun_safe_enter prevent self interruption
hpcrun_safe_enter();
-
+
int skip_this_frame = 1; // omit this procedure frame on the call path
- target_node =
- hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr,
- skip_this_frame, 1, NULL).sample_node;
+ target_node =
+ hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr,
+ skip_this_frame, 1, NULL).sample_node;
+#if PRUNE_CALLPATH
// the load module for the runtime library that supports offloading
- int lm = get_load_module(target_node);
+ int lm = get_load_module(target_node);
- // drop nodes on the call chain until we find one that is not in the load
+ // drop nodes on the call chain until we find one that is not in the load
// module for runtime library that supports offloading
- for (;;) {
+ for (;;) {
target_node = hpcrun_cct_parent(target_node);
if (get_load_module(target_node) != lm) break;
}
+#endif
hpcrun_safe_exit();
td->overhead--;
}
-#define FOREACH_OMPT_DATA_OP(macro) \
- macro(ph, ompt_target_data_alloc, ompt_tgt_alloc) \
- macro(ph, ompt_target_data_delete, ompt_tgt_delete) \
- macro(ph, ompt_target_data_transfer_to_device, ompt_tgt_copyin) \
- macro(ph, ompt_target_data_transfer_from_device, ompt_tgt_copyout)
-
void
-ompt_data_op_callback
+ompt_data_op_callback_emi
(
- ompt_scope_endpoint_t endpoint,
- ompt_id_t target_id,
- ompt_id_t host_op_id,
- ompt_target_data_op_t optype,
- void *src_addr,
- int src_device_num,
- void *dest_addr,
- int dest_device_num,
- size_t bytes,
- const void *codeptr_ra
+ ompt_scope_endpoint_t endpoint,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype,
+ void *src_addr,
+ int src_device_num,
+ void *dest_addr,
+ int dest_device_num,
+ size_t bytes,
+ const void *codeptr_ra
)
-{
- uint64_t ph = hpcrun_placeholder_ompt_tgt_none;
- switch (optype) {
+{
+ if (endpoint == ompt_scope_end) return;
+
+ ompt_need_flush = true;
+
+ uint64_t op_id = *host_op_id = gpu_correlation_id();
+
+ PRINT("ompt_data_op enter->target_id 0x%lx\n", target_data->value);
+ enum hpcrun_placeholder op = hpcrun_placeholder_ompt_tgt_none;
+ switch (optype) {
#define ompt_op_macro(op, ompt_op_type, ompt_op_class) \
case ompt_op_type: \
op = hpcrun_placeholder_##ompt_op_class; \
break;
-
+
FOREACH_OMPT_DATA_OP(ompt_op_macro);
#undef ompt_op_macro
@@ -461,22 +651,31 @@ ompt_data_op_callback
break;
}
- hpcrun_ompt_op_id_notify(endpoint, host_op_id, get_placeholder_norm(ph));
+ hpcrun_ompt_op_id_notify(endpoint, op_id, get_placeholder_norm(op));
+ PRINT("ompt_data_op exit->target_id 0x%lx\n", target_data->value);
}
void
-ompt_submit_callback
+ompt_submit_callback_emi
(
ompt_scope_endpoint_t endpoint,
- ompt_id_t target_id,
- ompt_id_t host_op_id,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
unsigned int requested_num_teams
)
{
- PRINT("ompt_submit_callback enter->target_id %" PRIu64 "\n", target_id);
- hpcrun_ompt_op_id_notify(endpoint, host_op_id, get_placeholder_norm(hpcrun_placeholder_ompt_tgt_kernel));
- PRINT("ompt_submit_callback exit->target_id %" PRIu64 "\n", target_id);
+ PRINT("ompt_submit_callback enter->target_id 0x%lx\n", target_data->value);
+
+ if (endpoint == ompt_scope_begin) {
+ *host_op_id = gpu_correlation_id();
+ hpcrun_ompt_op_id_notify(endpoint, *host_op_id,
+ get_placeholder_norm(hpcrun_placeholder_ompt_tgt_kernel));
+
+ ompt_need_flush = true;
+ }
+
+ PRINT("ompt_submit_callback exit->target_id 0x%lx\n", target_data->value);
}
@@ -488,6 +687,7 @@ ompt_map_callback(ompt_id_t target_id,
size_t *bytes,
unsigned int *mapping_flags)
{
+ ompt_need_flush = true;
}
@@ -510,25 +710,42 @@ ompt_trace_node_get
return trace_node;
}
-
void
-prepare_device()
+prepare_device
+(
+ void
+)
{
PRINT("ompt_initialize->prepare_device enter\n");
- device_finalizer.fn = cupti_device_flush;
- device_finalizer_register(device_finalizer_type_flush, &device_finalizer);
-
- ompt_set_callback(ompt_callback_device_initialize, ompt_device_initialize);
- ompt_set_callback(ompt_callback_device_finalize, ompt_device_finalize);
- ompt_set_callback(ompt_callback_device_load, ompt_device_load);
- ompt_set_callback(ompt_callback_device_unload, ompt_device_unload);
- ompt_set_callback(ompt_callback_target, ompt_target_callback);
- ompt_set_callback(ompt_callback_target_data_op, ompt_data_op_callback);
- ompt_set_callback(ompt_callback_target_submit, ompt_submit_callback);
- ompt_set_callback(ompt_callback_target_map, ompt_map_callback);
+ device_finalizer_flush.fn = ompt_finalize_flush;
+ device_finalizer_register(device_finalizer_type_flush,
+ &device_finalizer_flush);
+
+ device_finalizer_shutdown.fn = ompt_finalize_shutdown;
+ device_finalizer_register(device_finalizer_type_shutdown,
+ &device_finalizer_shutdown);
+
+ device_finalizer_trace.fn = ompt_finalize_trace;
+ device_finalizer_register(device_finalizer_type_shutdown,
+ &device_finalizer_trace);
+
+ ompt_set_callback
+ (ompt_callback_device_initialize, ompt_device_initialize);
+ ompt_set_callback
+ (ompt_callback_device_finalize, ompt_device_finalize);
+ ompt_set_callback
+ (ompt_callback_device_load, ompt_device_load);
+ ompt_set_callback
+ (ompt_callback_device_unload, ompt_device_unload);
+ ompt_set_callback
+ (ompt_callback_target_emi, ompt_target_callback_emi);
+ ompt_set_callback
+ (ompt_callback_target_data_op_emi, ompt_data_op_callback_emi);
+ ompt_set_callback
+ (ompt_callback_target_submit_emi, ompt_submit_callback_emi);
+ ompt_set_callback
+ (ompt_callback_target_map, ompt_map_callback);
PRINT("ompt_initialize->prepare_device exit\n");
}
-
-#endif
diff --git a/src/tool/hpcrun/ompt/ompt-device.h b/src/tool/hpcrun/ompt/ompt-device.h
index 17bae2f257..75785c3b08 100644
--- a/src/tool/hpcrun/ompt/ompt-device.h
+++ b/src/tool/hpcrun/ompt/ompt-device.h
@@ -51,8 +51,6 @@
#include
#include
-#if HAVE_CUPTI_H
-
void
prepare_device
(
@@ -112,12 +110,4 @@ ompt_external_subscriber_disable
void
);
-#else
-
-// no op without a CUDA device
-#define prepare_device()
-
-#endif
-
-
#endif // _OMPT_INTERFACE_H_
diff --git a/src/tool/hpcrun/ompt/ompt-interface.c b/src/tool/hpcrun/ompt/ompt-interface.c
index 24291289af..c69831ba5d 100644
--- a/src/tool/hpcrun/ompt/ompt-interface.c
+++ b/src/tool/hpcrun/ompt/ompt-interface.c
@@ -70,6 +70,8 @@
#include
#include
+#include
+
#include "ompt-callstack.h"
#include "ompt-defer.h"
#include "ompt-interface.h"
@@ -476,11 +478,9 @@ init_threads
void
)
{
- ompt_set_callback_fn
- (ompt_callback_thread_begin, (ompt_callback_t)ompt_thread_begin);
+ ompt_set_callback(ompt_callback_thread_begin, ompt_thread_begin);
- ompt_set_callback_fn
- (ompt_callback_thread_end, (ompt_callback_t) ompt_thread_end);
+ ompt_set_callback(ompt_callback_thread_end, ompt_thread_end);
}
@@ -490,7 +490,7 @@ init_parallel_regions
void
)
{
- ompt_parallel_region_register_callbacks(ompt_set_callback_fn);
+ ompt_parallel_region_register_callbacks(ompt_set_callback_internal);
ompt_regions_init();
}
@@ -501,7 +501,7 @@ init_tasks
void
)
{
- ompt_task_register_callbacks(ompt_set_callback_fn);
+ ompt_task_register_callbacks(ompt_set_callback_internal);
}
@@ -519,8 +519,8 @@ init_mutex_blame_shift
if (!ompt_mutex_blame_requested) return;
- retval = ompt_set_callback_fn(ompt_callback_mutex_released,
- (ompt_callback_t) ompt_mutex_blame_accept);
+ retval = ompt_set_callback(ompt_callback_mutex_released,
+ ompt_mutex_blame_accept);
mutex_blame_shift_avail |= ompt_event_may_occur(retval);
@@ -556,13 +556,11 @@ init_idle_blame_shift
#if 0
ompt_idle_blame_shift_request();
- retval = ompt_set_callback_fn(ompt_callback_idle,
- (ompt_callback_t)ompt_idle);
+ retval = ompt_set_callback(ompt_callback_idle, ompt_idle);
idle_blame_shift_avail |= ompt_event_may_occur(retval);
#endif
- retval = ompt_set_callback_fn(ompt_callback_sync_region_wait,
- (ompt_callback_t)ompt_sync);
+ retval = ompt_set_callback(ompt_callback_sync_region_wait, ompt_sync);
idle_blame_shift_avail |= ompt_event_may_occur(retval);
@@ -670,10 +668,13 @@ ompt_start_tool
const char *runtime_version
)
{
+ // force hpctoolkit initialization
+ monitor_initialize();
+ // post-condition: hpctoolkit is initialized
- if (getenv("OMPT_DEBUG_WAIT")) {
+ if (getenv("OMPT_DEBUG_WAIT")) {
while (ompt_debug_wait);
- }
+ }
#if OMPT_DEBUG_STARTUP
printf("Starting tool...\n");
@@ -1027,18 +1028,19 @@ hpcrun_ompt_get_parent_region_data
int
hpcrun_ompt_get_thread_num(int level)
{
- if (ompt_initialized) {
- int task_type_flags;
- ompt_data_t *task_data = NULL;
- ompt_data_t *parallel_data = NULL;
- ompt_frame_t *task_frame = NULL;
- int thread_num = 0;
-
- ompt_get_task_info_fn(level, &task_type_flags, &task_data, &task_frame, ¶llel_data, &thread_num);
- //printf("Task frame pointer = %p\n", task_frame);
- return thread_num;
- }
- return -1;
+ if (ompt_initialized) {
+ int task_type_flags;
+ ompt_data_t *task_data = NULL;
+ ompt_data_t *parallel_data = NULL;
+ ompt_frame_t *task_frame = NULL;
+ int thread_num = 0;
+
+ ompt_get_task_info_fn(level, &task_type_flags, &task_data,
+ &task_frame, ¶llel_data, &thread_num);
+ //printf("Task frame pointer = %p\n", task_frame);
+ return thread_num;
+ }
+ return -1;
}
diff --git a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
new file mode 100644
index 0000000000..32cdeaf27e
--- /dev/null
+++ b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c
@@ -0,0 +1,211 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#ifndef HPCRUN_STATIC_LINK
+#include
+#endif
+
+
+
+//******************************************************************************
+// libmonitor
+//******************************************************************************
+
+#include
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "amd.h"
+
+#include "libdl.h"
+
+#include "simple_oo.h"
+#include "sample_source_obj.h"
+#include "common.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define AMD_ROCPROFILER_PREFIX "rocprof"
+
+static device_finalizer_fn_entry_t device_finalizer_rocprofiler_shutdown;
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+static void
+METHOD_FN(init)
+{
+ self->state = INIT;
+}
+
+
+static void
+METHOD_FN(thread_init)
+{
+ TMSG(CUDA, "thread_init");
+}
+
+
+static void
+METHOD_FN(thread_init_action)
+{
+ TMSG(CUDA, "thread_init_action");
+}
+
+
+static void
+METHOD_FN(start)
+{
+ TMSG(CUDA, "start");
+ TD_GET(ss_state)[self->sel_idx] = START;
+}
+
+
+static void
+METHOD_FN(thread_fini_action)
+{
+ TMSG(CUDA, "thread_fini_action");
+}
+
+
+static void
+METHOD_FN(stop)
+{
+ hpcrun_get_thread_data();
+ TD_GET(ss_state)[self->sel_idx] = STOP;
+}
+
+
+static void
+METHOD_FN(shutdown)
+{
+ self->state = UNINIT;
+}
+
+
+// Return true iff ev_str is the "rocprof" prefix, then ':' separator(s), then
+// a counter name for which rocprofiler_match_event() returns nonzero.
+static bool
+METHOD_FN(supports_event, const char *ev_str)
+{
+#ifndef HPCRUN_STATIC_LINK
+  if (hpcrun_ev_is(ev_str, AMD_ROCPROFILER_PREFIX)) {
+    rocprofiler_init();
+    // sizeof() counts the literal's NUL; subtract 1 so a bare "rocprof"
+    // leaves roc_str on the terminator instead of one byte past it.
+    const char* roc_str = ev_str + sizeof(AMD_ROCPROFILER_PREFIX) - 1;
+    while (*roc_str == ':') roc_str++;
+    if (*roc_str == 0) return false;
+    return rocprofiler_match_event(roc_str) != 0;
+  }
+#endif
+  return false;
+}
+
+static void
+METHOD_FN(process_event_list, int lush_metrics)
+{
+ int nevents = (self->evl).nevents;
+ TMSG(CUDA,"nevents = %d", nevents);
+}
+
+static void
+METHOD_FN(finalize_event_list)
+{
+ // After going through all command line arguments,
+ // we call this function to generate a list of counters
+ // in rocprofiler's format and initialize corresponding
+ // hpcrun metrics
+ rocprofiler_finalize_event_list();
+
+ device_finalizer_rocprofiler_shutdown.fn = rocprofiler_fini;
+ device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_rocprofiler_shutdown);
+
+ // Inform roctracer component that we will collect hardware counters,
+ // which will serialize kernel launches
+ roctracer_enable_counter_collection();
+}
+
+
+static void
+METHOD_FN(gen_event_set,int lush_metrics)
+{
+
+}
+
+
+static void
+METHOD_FN(display_events)
+{
+ // We need to query rocprofiler to get a list of supported rocprofiler counters
+ rocprofiler_init();
+
+ int total_counters = rocprofiler_total_counters();
+ printf("===========================================================================\n");
+ printf("Available AMD GPU hardware counter events\n");
+ printf("===========================================================================\n");
+ printf("Name\t\tDescription\n");
+ printf("---------------------------------------------------------------------------\n");
+ for (int i = 0; i < total_counters; ++i) {
+ printf("%s::%s\t\t%s\n", AMD_ROCPROFILER_PREFIX, rocprofiler_counter_name(i), rocprofiler_counter_description(i));
+ }
+ printf("\n");
+}
+
+
+
+//**************************************************************************
+// object
+//**************************************************************************
+
+#define ss_name amd_rocprof
+#define ss_cls SS_HARDWARE
+
+#include "ss_obj.h"
diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c
index 806606c727..d15d62bf96 100644
--- a/src/tool/hpcrun/sample-sources/amd.c
+++ b/src/tool/hpcrun/sample-sources/amd.c
@@ -43,6 +43,8 @@
#include
#include
#include
+#include
+#include
#include
#include
#include
@@ -152,6 +154,14 @@ METHOD_FN(process_event_list, int lush_metrics)
gpu_metrics_default_enable();
hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG);
TMSG(CUDA,"nevents = %d", nevents);
+
+
+#ifndef HPCRUN_STATIC_LINK
+ if (hip_bind()) {
+ EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror());
+ monitor_real_exit(-1);
+ }
+#endif
}
static void
@@ -213,7 +223,6 @@ METHOD_FN(display_events)
}
-
//**************************************************************************
// object
//**************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
index 71a1afc816..bec3fd7f5d 100644
--- a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
+++ b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h
@@ -192,7 +192,6 @@ METHOD_FN(display_events)
}
-
/*--------------------------------------------------------------------------
| sample source object
--------------------------------------------------------------------------*/
diff --git a/src/tool/hpcrun/sample-sources/cuda.c b/src/tool/hpcrun/sample-sources/cuda.c
index e4dae1b650..f79a4ba2e4 100644
--- a/src/tool/hpcrun/sample-sources/cuda.c
+++ b/src/tool/hpcrun/sample-sources/cuda.c
@@ -423,6 +423,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/generic.c b/src/tool/hpcrun/sample-sources/generic.c
index ddb375ef42..d1a2754c62 100644
--- a/src/tool/hpcrun/sample-sources/generic.c
+++ b/src/tool/hpcrun/sample-sources/generic.c
@@ -446,6 +446,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
//***************************************************************************
// object
//***************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/gpu_blame.c b/src/tool/hpcrun/sample-sources/gpu_blame.c
index 0d02ec2d54..5d90169d5d 100644
--- a/src/tool/hpcrun/sample-sources/gpu_blame.c
+++ b/src/tool/hpcrun/sample-sources/gpu_blame.c
@@ -257,6 +257,7 @@ static void METHOD_FN(display_events)
printf("\n");
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/idle.c b/src/tool/hpcrun/sample-sources/idle.c
index 532884fda6..0080956e53 100644
--- a/src/tool/hpcrun/sample-sources/idle.c
+++ b/src/tool/hpcrun/sample-sources/idle.c
@@ -231,7 +231,6 @@ METHOD_FN(display_events)
}
-
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/memleak.c b/src/tool/hpcrun/sample-sources/memleak.c
index 57f00c6ad1..c5e59387e8 100644
--- a/src/tool/hpcrun/sample-sources/memleak.c
+++ b/src/tool/hpcrun/sample-sources/memleak.c
@@ -204,6 +204,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/mpi.c b/src/tool/hpcrun/sample-sources/mpi.c
index 56595575f4..8f24a96822 100644
--- a/src/tool/hpcrun/sample-sources/mpi.c
+++ b/src/tool/hpcrun/sample-sources/mpi.c
@@ -186,6 +186,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/none.c b/src/tool/hpcrun/sample-sources/none.c
index 9f9648894e..b287b0c744 100644
--- a/src/tool/hpcrun/sample-sources/none.c
+++ b/src/tool/hpcrun/sample-sources/none.c
@@ -187,6 +187,7 @@ METHOD_FN(display_events)
{
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c
index 1a8b0af39b..bc345064d0 100644
--- a/src/tool/hpcrun/sample-sources/nvidia.c
+++ b/src/tool/hpcrun/sample-sources/nvidia.c
@@ -101,7 +101,7 @@
#define NVIDIA_CUDA "gpu=nvidia"
#define NVIDIA_CUDA_PC_SAMPLING "gpu=nvidia,pc"
-
+#define NVIDIA_CUDA_NV_LINK "nvlink"
/******************************************************************************
@@ -320,7 +320,8 @@ static bool
METHOD_FN(supports_event, const char *ev_str)
{
#ifndef HPCRUN_STATIC_LINK
- return hpcrun_ev_is(ev_str, NVIDIA_CUDA) || hpcrun_ev_is(ev_str, NVIDIA_CUDA_PC_SAMPLING);
+ return hpcrun_ev_is(ev_str, NVIDIA_CUDA) || hpcrun_ev_is(ev_str, NVIDIA_CUDA_PC_SAMPLING)
+ || hpcrun_ev_is(ev_str, NVIDIA_CUDA_NV_LINK);
#else
return false;
#endif
@@ -354,27 +355,32 @@ METHOD_FN(process_event_list, int lush_metrics)
char* event = start_tok(evlist);
long int frequency = 0;
int frequency_default = -1;
+
hpcrun_extract_ev_thresh(event, sizeof(nvidia_name), nvidia_name,
&frequency, frequency_default);
- if (hpcrun_ev_is(nvidia_name, NVIDIA_CUDA)) {
- trace_frequency =
- (frequency == frequency_default) ? trace_frequency_default : frequency;
- gpu_monitoring_trace_sample_frequency_set(trace_frequency);
- } else if (hpcrun_ev_is(nvidia_name, NVIDIA_CUDA_PC_SAMPLING)) {
- pc_sampling_frequency = (frequency == frequency_default) ?
- pc_sampling_frequency_default : frequency;
+ for (; event != NULL; event = next_tok()) {
+ if (hpcrun_ev_is(event, NVIDIA_CUDA)) {
+ trace_frequency =
+ (frequency == frequency_default) ? trace_frequency_default : frequency;
+ gpu_monitoring_trace_sample_frequency_set(trace_frequency);
+ } else if (hpcrun_ev_is(event, NVIDIA_CUDA_PC_SAMPLING)) {
+ pc_sampling_frequency = (frequency == frequency_default) ?
+ pc_sampling_frequency_default : frequency;
- gpu_monitoring_instruction_sample_frequency_set(pc_sampling_frequency);
+ gpu_monitoring_instruction_sample_frequency_set(pc_sampling_frequency);
- gpu_metrics_GPU_INST_enable(); // instruction counts
+ gpu_metrics_GPU_INST_enable(); // instruction counts
- gpu_metrics_GPU_INST_STALL_enable(); // stall metrics
+ gpu_metrics_GPU_INST_STALL_enable(); // stall metrics
gpu_metrics_GSAMP_enable(); // GPU utilization from sampling
// pc sampling cannot be on with concurrent kernels
kernel_invocation_activities[0] = CUPTI_ACTIVITY_KIND_KERNEL;
+ } else if (hpcrun_ev_is(event, NVIDIA_CUDA_NV_LINK)) {
+ gpu_metrics_GXFER_enable();
+ }
}
gpu_metrics_default_enable();
@@ -484,7 +490,6 @@ METHOD_FN(display_events)
}
-
//******************************************************************************
// object
//******************************************************************************
diff --git a/src/tool/hpcrun/sample-sources/openmp-target.c b/src/tool/hpcrun/sample-sources/openmp-target.c
new file mode 100644
index 0000000000..7aa46462e6
--- /dev/null
+++ b/src/tool/hpcrun/sample-sources/openmp-target.c
@@ -0,0 +1,194 @@
+//******************************************************************************
+// system includes
+//******************************************************************************
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#ifndef HPCRUN_STATIC_LINK
+#include
+#endif
+
+
+
+//******************************************************************************
+// libmonitor
+//******************************************************************************
+
+#include
+
+
+
+//******************************************************************************
+// local includes
+//******************************************************************************
+
+#include "amd.h"
+
+#include "libdl.h"
+
+#include "simple_oo.h"
+#include "sample_source_obj.h"
+#include "common.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+
+
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define OPENMP_TARGET "gpu=openmp"
+
+static device_finalizer_fn_entry_t device_finalizer_shutdown;
+static device_finalizer_fn_entry_t device_trace_finalizer_shutdown;
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+static void
+METHOD_FN(init)
+{
+ self->state = INIT;
+}
+
+
+static void
+METHOD_FN(thread_init)
+{
+ TMSG(CUDA, "thread_init");
+}
+
+
+static void
+METHOD_FN(thread_init_action)
+{
+ TMSG(CUDA, "thread_init_action");
+}
+
+
+static void
+METHOD_FN(start)
+{
+ TMSG(CUDA, "start");
+ TD_GET(ss_state)[self->sel_idx] = START;
+}
+
+
+static void
+METHOD_FN(thread_fini_action)
+{
+ TMSG(CUDA, "thread_fini_action");
+}
+
+
+static void
+METHOD_FN(stop)
+{
+ hpcrun_get_thread_data();
+
+ TD_GET(ss_state)[self->sel_idx] = STOP;
+}
+
+
+static void
+METHOD_FN(shutdown)
+{
+ self->state = UNINIT;
+}
+
+
+static bool
+METHOD_FN(supports_event, const char *ev_str)
+{
+#ifndef HPCRUN_STATIC_LINK
+ return hpcrun_ev_is(ev_str, OPENMP_TARGET);
+#else
+ return false;
+#endif
+
+
+}
+
+static void
+METHOD_FN(process_event_list, int lush_metrics)
+{
+ int nevents = (self->evl).nevents;
+ gpu_metrics_default_enable();
+ hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG);
+ TMSG(CUDA,"nevents = %d", nevents);
+}
+
+static void
+METHOD_FN(finalize_event_list)
+{
+ gpu_metrics_default_enable();
+ gpu_trace_init();
+}
+
+
+static void
+METHOD_FN(gen_event_set,int lush_metrics)
+{
+
+}
+
+
+// Print the event-help table for the OpenMP offloading sample source.
+static void
+METHOD_FN(display_events)
+{
+  printf("===========================================================================\n");
+  printf("Available OpenMP target offloading events\n");
+  printf("===========================================================================\n");
+  printf("Name\t\tDescription\n");
+  printf("---------------------------------------------------------------------------\n");
+  printf("%s\t\tOperation-level monitoring of OpenMP offloading.\n"
+         "\t\tCollect timing information on GPU kernel invocations,\n"
+         "\t\tmemory copies, etc.\n", OPENMP_TARGET);
+  printf("\n");
+}
+
+
+
+//**************************************************************************
+// object
+//**************************************************************************
+
+#define ss_name openmp_gpu
+#define ss_cls SS_HARDWARE
+
+#include "ss_obj.h"
diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
index 4e6a372e14..9c9050e619 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c
@@ -1,378 +1,193 @@
-// ******************* System Includes ********************
-#include
-#include
+// -*-Mode: C++;-*- // technically C99
-#include
-#include
-#include
-// *********************************************************
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+//***************************************************************************
+//
+// File:
+// papi-c-cupti.c
+//
+// Purpose:
+// hpcrun support for the PAPI cuda (CUPTI) component of the papi-c sample source
+//
+//***************************************************************************
-// ******************** PAPI *******************************
-#include
-// *********************************************************
+//***************************************************************************
+// system includes
+//***************************************************************************
-// ******************** MONITOR *******************************
+#include
#include
-// *********************************************************
-// ******************** GPU includes ***********************
-#include
-#include
-// *********************************************************
-// ******* HPCToolkit Includes *********************************
-#include
-#include
+//***************************************************************************
+// local includes
+//***************************************************************************
+
#include
-#include
-#include
-#include
-#include
#include
-// *********************************************************
-
-// ******** local includes ***********
#include "papi-c.h"
#include "papi-c-extended-info.h"
-// ***********************************
-
-// ****************** Convenience macros *******************
-
-#define CUPTI_LAUNCH_CALLBACK_DEPTH 7
-
-#define Cupti_call(fn, ...) \
-{ \
- int ret = fn(__VA_ARGS__); \
- if (ret != CUPTI_SUCCESS) { \
- const char* errstr; \
- dcuptiGetResultString(ret, &errstr); \
- hpcrun_abort("error: CUDA/CUPTI API " \
- #fn " failed w error code %d ==> '%s'\n", \
- ret, errstr); \
- } \
-}
-
-#define Cupti_call_silent(fn, ...) \
-{ \
- (void) fn(__VA_ARGS__); \
-}
-#define Chk_dlopen(v, lib, flags) \
- void* v = monitor_real_dlopen(lib, flags); \
- if (! v) { \
- fprintf(stderr, "gpu dlopen %s failed\n", lib); \
- return; \
- } \
-
-#define Chk_dlsym(h, fn) { \
- dlerror(); \
- d ## fn = dlsym(h, #fn); \
- char* e = dlerror(); \
- if (e) { \
- fprintf(stderr, "dlsym(%s) fails w '%s'\n", #fn, e); \
- return; \
- } \
-}
-// ***********************************************************
-
-typedef struct {
- int nevents;
- int event_set;
- sample_source_t* self;
-} papi_cuda_data_t;
-
-static bool event_set_created = false;
-static bool event_set_finalized = false;
-static papi_cuda_data_t local = {};
-
-static spinlock_t cupti_lock = SPINLOCK_UNLOCKED;
-static spinlock_t setup_lock = SPINLOCK_UNLOCKED;
-
-// ******************** cuda/cupti functions ***********************
-// Some cuda/cupti functions must not be wrapped! So, we fetch them via dlopen.
-// NOTE: naming convention is to prepend the letter "d" to the actual function
-// The indirect functions are below.
-//
-cudaError_t (*dcudaThreadSynchronize)(void);
-CUptiResult (*dcuptiGetResultString)(CUptiResult result, const char** str);
+//******************************************************************************
+// static data
+//******************************************************************************
-CUptiResult (*dcuptiSubscribe)(CUpti_SubscriberHandle* subscriber,
- CUpti_CallbackFunc callback,
- void* userdata);
+static __thread bool event_set_created = false;
+static __thread bool event_set_finalized = false;
+static __thread int my_event_set = PAPI_NULL;
-CUptiResult (*dcuptiEnableCallback)(uint32_t enable,
- CUpti_SubscriberHandle subscriber,
- CUpti_CallbackDomain domain,
- CUpti_CallbackId cbid);
-CUptiResult (*dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber);
+//******************************************************************************
+// private operations
+//******************************************************************************
-// *****************************************************************
-typedef struct cuda_callback_t {
- sample_source_t* ss;
- int event_set;
-} cuda_callback_t;
-
-//
-// populate the cuda/cupti functions via dlopen
-//
-
-static void
-dlgpu(void)
-{
- // only use dlfunctions in NON static case
-#ifndef HPCRUN_STATIC_LINK
- Chk_dlopen(cudart, "libcudart.so", RTLD_NOW | RTLD_GLOBAL);
- Chk_dlsym(cudart, cudaThreadSynchronize);
-
- Chk_dlopen(cupti, "libcupti.so", RTLD_NOW | RTLD_GLOBAL);
- Chk_dlsym(cupti, cuptiGetResultString);
- Chk_dlsym(cupti, cuptiSubscribe);
- Chk_dlsym(cupti, cuptiEnableCallback);
- Chk_dlsym(cupti, cuptiUnsubscribe);
-#endif // ! HPCRUN_STATIC_LINK
-}
-
-//
-// noop routine
-//
static void
papi_c_no_action(void)
{
;
}
-//
-// Predicate to determine if this component is being referenced
-//
+
static bool
is_papi_c_cuda(const char* name)
{
return strstr(name, "cuda") == name;
}
-static void CUPTIAPI
-hpcrun_cuda_kernel_callback(void* userdata,
- CUpti_CallbackDomain domain,
- CUpti_CallbackId cbid,
- const CUpti_CallbackData* cbInfo)
-{
- TMSG(CUDA, "Got Kernel Callback");
-
- papi_cuda_data_t* cuda_data = userdata;
- int nevents = cuda_data->nevents;
- int cudaEventSet = cuda_data->event_set;
- sample_source_t* self = cuda_data->self;
-
-
- TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet);
-
- // This callback is enabled only for kernel launch; anything else is an error.
- if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
- hpcrun_abort("CUDA CUPTI callback seen for unexpected "
- "interface operation: callback id %d\n", cbid);
- }
-
- if (cbInfo->callbackSite == CUPTI_API_ENTER) {
- TMSG(CUDA, "Cupti API -ENTER- portion");
- // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize
- // exclusive access to launcher
- spinlock_lock(&cupti_lock);
- TMSG(CUPTI, "-ACQ-lock");
- dcudaThreadSynchronize();
-
- TMSG(CUPTI,"-- PRE launch callback");
- TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet);
- int ret = PAPI_start(cudaEventSet);
- if (ret != PAPI_OK){
- EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)",
- PAPI_strerror(ret), ret);
- }
- }
- TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion");
-
-
- if (cbInfo->callbackSite == CUPTI_API_EXIT) {
- TMSG(CUDA, "Cupti API -EXIT- portion");
- // MC recommends Use cudaDeviceSynchronize
- dcudaThreadSynchronize();
- TMSG(CUPTI, "-- POST launch callback");
- long_long eventValues[nevents+2];
-
- TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
- int ret = PAPI_stop(cudaEventSet, eventValues);
- if (ret != PAPI_OK){
- EMSG("CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)",
- PAPI_strerror(ret), ret);
- }
- TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
-
- ucontext_t uc;
- TMSG(CUDA,"getting context in CUDA event handler");
- getcontext(&uc);
- TMSG(CUDA,"got context in CUDA event handler");
- bool safe = hpcrun_safe_enter();
- TMSG(CUDA,"blocked async event in CUDA event handler");
- {
- int i;
- for (i = 0; i < nevents; i++)
- {
- int metric_id = hpcrun_event2metric(self, i);
-
- TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
- hpcrun_sample_callpath(&uc, metric_id, eventValues[i]/*metricIncr*/,
- CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/,
- 0/*isSync*/, NULL);
- TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
- }
- }
- TMSG(CUDA,"unblocking async event in CUDA event handler");
- if (safe) hpcrun_safe_exit();
- TMSG(CUDA,"unblocked async event in CUDA event handler");
- spinlock_unlock(&cupti_lock);
- TMSG(CUPTI,"-REL-lock\n");
- }
- TMSG(CUDA, "At end (past -EXIT-)");
-}
-
-static CUpti_SubscriberHandle subscriber;
-
-//
-// sync setup for cuda/cupti
-//
+// Get or create a cupti event set
static void
-papi_c_cupti_setup(void)
-{
- // FIXME: Remove local definition
- // CUpti_SubscriberHandle subscriber;
-
- static bool one_time = false;
-
- spinlock_lock(&setup_lock);
- TMSG(CUDA, "CUPTI setup acquire lock");
- if (one_time) {
- spinlock_unlock(&setup_lock);
- TMSG(CUDA, "CUPTI setup release lock (setup already called)");
- return;
- }
-
- TMSG(CUDA,"sync setup called");
-
- thread_data_t* td = hpcrun_get_thread_data();
- local.self = hpcrun_fetch_source_by_name("papi");
-
- local.nevents = local.self->evl.nevents;
-
- // get cuda event set
-
- int cuda_component_idx;
- int n_components = PAPI_num_components();
-
- for (int i = 0; i < n_components; i++) {
- if (is_papi_c_cuda(PAPI_get_component_info(i)->name)) {
- cuda_component_idx = i;
- break;
- }
- }
-
- papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
- local.event_set = get_component_event_set(psi, cuda_component_idx);
-
- Cupti_call(dcuptiSubscribe, &subscriber,
- (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
- &local);
-
- Cupti_call(dcuptiEnableCallback, 1, subscriber,
- CUPTI_CB_DOMAIN_RUNTIME_API,
- CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
-
- one_time = true;
- spinlock_unlock(&setup_lock);
- TMSG(CUDA, "CUPTI setup release lock");
-}
-
-//
-// Get or create a cupti event set --- but only ONCE per process
-//
-void
-papi_c_cupti_get_event_set(int* ev_s)
+papi_c_cupti_get_event_set(int* event_set)
{
TMSG(CUDA, "Get event set");
- spinlock_lock(&setup_lock);
- TMSG(CUDA, "Cupti lock acquired");
if (! event_set_created) {
TMSG(CUDA, "No event set created, so create one");
- int ret = PAPI_create_eventset(ev_s);
+ int ret = PAPI_create_eventset(&my_event_set);
if (ret != PAPI_OK) {
- hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
+ hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
ret, PAPI_strerror(ret));
}
- local.event_set = *ev_s;
+ *event_set = my_event_set;
event_set_created = true;
- TMSG(CUDA, "Event set %d created", local.event_set);
+ TMSG(CUDA, "Event set %d created", my_event_set);
}
- spinlock_unlock(&setup_lock);
- TMSG(CUDA, "Cupti lock released");
}
-int
-papi_c_cupti_add_event(int ev_s, int ev)
+
+// Add event to my_event_set
+void
+papi_c_cupti_add_event(int event_set, int evcode)
{
+ assert(event_set == my_event_set);
+
int rv = PAPI_OK;
- TMSG(CUDA, "Adding event to cupti event set");
- spinlock_lock(&setup_lock);
- TMSG(CUDA, "Cupti lock acquired");
if (! event_set_finalized) {
- TMSG(CUDA, "Really add event %x to cupti event set", ev);
- rv = PAPI_add_event(local.event_set, ev);
- TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", ev_s, local.event_set);
+ TMSG(CUDA, "Adding event %x to cupti event set", evcode);
+ rv = PAPI_add_event(my_event_set, evcode);
+ if (rv != PAPI_OK) {
+ hpcrun_abort("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
+ PAPI_strerror(rv), rv);
+ }
+ TMSG(CUDA, "Added event %d, to cuda event set %d", evcode, my_event_set);
}
- spinlock_unlock(&setup_lock);
- TMSG(CUDA, "Cupti lock released");
- return rv;
}
+// No adding new events after this point
void
papi_c_cupti_finalize_event_set(void)
{
- spinlock_lock(&setup_lock);
event_set_finalized = true;
- spinlock_unlock(&setup_lock);
}
-//
-// sync teardown for cuda/cupti
-//
-static void
-papi_c_cupti_teardown(void)
+// Start counting on the calling thread's cupti event set (my_event_set).
+// Any result other than PAPI_OK from PAPI_start is reported through
+// hpcrun_abort.
+void
+papi_c_cupti_start(void)
+{
+  int ret = PAPI_start(my_event_set);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_start of event set %d failed with %s (%d)",
+                 my_event_set, PAPI_strerror(ret), ret);
+  }
+}
+
+
+// Read the counters of the calling thread's cupti event set into `values`
+// using PAPI_read.  A result other than PAPI_OK is reported through
+// hpcrun_abort.
+void
+papi_c_cupti_read(long long *values)
+{
+  const int status = PAPI_read(my_event_set, values);
+  if (status == PAPI_OK) {
+    return;
+  }
+
+  hpcrun_abort("PAPI_read of event set %d failed with %s (%d)",
+               my_event_set, PAPI_strerror(status), status);
+}
+
+
+void
+papi_c_cupti_stop(long long *values)
{
- static bool one_time = false;
- spinlock_lock(&setup_lock);
- if (one_time) return;
-
- TMSG(CUDA,"sync teardown called (=unsubscribe)");
-
- Cupti_call(dcuptiUnsubscribe, subscriber);
- one_time = true;
- spinlock_unlock(&setup_lock);
+ int ret = PAPI_stop(my_event_set, values);
+ if (ret != PAPI_OK) {
+ hpcrun_abort("PAPI_stop of event set %d failed with %s (%d)",
+ my_event_set, PAPI_strerror(ret), ret);
+ }
}
+
static sync_info_list_t cuda_component = {
.pred = is_papi_c_cuda,
.get_event_set = papi_c_cupti_get_event_set,
.add_event = papi_c_cupti_add_event,
.finalize_event_set = papi_c_cupti_finalize_event_set,
- .sync_setup = papi_c_cupti_setup,
- .sync_teardown = papi_c_cupti_teardown,
- .sync_start = papi_c_no_action,
- .sync_stop = papi_c_no_action,
+ .is_gpu_sync = true,
+ .setup = papi_c_no_action,
+ .teardown = papi_c_no_action,
+ .start = papi_c_cupti_start,
+ .read = papi_c_cupti_read,
+ .stop = papi_c_cupti_stop,
.process_only = true,
.next = NULL,
};
@@ -381,7 +196,5 @@ static sync_info_list_t cuda_component = {
void
SS_OBJ_CONSTRUCTOR(papi_c_cupti)(void)
{
- // fetch actual cuda/cupti functions
- dlgpu();
papi_c_sync_register(&cuda_component);
-}
+}
\ No newline at end of file
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
index 9d49d892bd..f113705199 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c
@@ -14,26 +14,16 @@ papi_c_sync_register(sync_info_list_t* info)
registered_sync_components = info;
}
-void
-no_action(void)
-{
-}
void
-std_get_event_set(int* ev_s)
+no_action(void)
{
- int ret = PAPI_create_eventset(ev_s);
- TMSG(PAPI,"PAPI_create_eventset = %d, eventSet = %d", ret, *ev_s);
- if (ret != PAPI_OK) {
- hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
- ret, PAPI_strerror(ret));
- }
}
-int
-std_add_event(int ev_s, int ev)
+const char *
+component_get_name(int cidx)
{
- return PAPI_add_event(ev_s, ev);
+ return PAPI_get_component_info(cidx)->name;
}
get_event_set_proc_t
@@ -45,9 +35,11 @@ component_get_event_set(int cidx)
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
if (item->pred(name)) return item->get_event_set;
}
- return std_get_event_set;
+// hpcrun_abort("Failure: PAPI_create_eventset to not registered component");
+ return NULL;
}
+
add_event_proc_t
component_add_event_proc(int cidx)
{
@@ -57,7 +49,8 @@ component_add_event_proc(int cidx)
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
if (item->pred(name)) return item->add_event;
}
- return std_add_event;
+// hpcrun_abort("Failure: PAPI_add_event to not registered component");
+ return NULL;
}
finalize_event_set_proc_t
@@ -76,13 +69,10 @@ bool
component_uses_sync_samples(int cidx)
{
const char* name = PAPI_get_component_info(cidx)->name;
-
+
TMSG(PAPI, "checking component idx %d (name %s) to see if it is synchronous", cidx, name);
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
- if (item->pred(name)) {
- TMSG(PAPI, "Component %s IS a synchronous component", name);
- return true;
- }
+ if (item->pred(name)) return item->is_gpu_sync;
}
return false;
}
@@ -94,9 +84,9 @@ sync_setup_for_component(int cidx)
TMSG(PAPI, "looking for sync setup for component idx=%d(%s)", cidx, name);
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
- if (item->pred(name)) return item->sync_setup;
+ if (item->pred(name)) return item->setup;
}
- return no_action;
+ return NULL;
}
teardown_proc_t
@@ -106,9 +96,9 @@ sync_teardown_for_component(int cidx)
TMSG(PAPI, "looking for sync teardown for component idx=%d(%s)", cidx, name);
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
- if (item->pred(name)) return item->sync_teardown;
+ if (item->pred(name)) return item->teardown;
}
- return no_action;
+ return NULL;
}
start_proc_t
@@ -118,11 +108,25 @@ sync_start_for_component(int cidx)
TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name);
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
- if (item->pred(name)) return item->sync_start;
+ if (item->pred(name)) return item->start;
}
- return no_action;
+ return NULL;
}
+
+// Look up the synchronous `read` hook for PAPI component `cidx`.
+//
+// Returns the `read` procedure of the first registered sync component whose
+// predicate accepts the component's name, or NULL when no registered
+// component matches.
+read_proc_t
+sync_read_for_component(int cidx)
+{
+  const char* name = PAPI_get_component_info(cidx)->name;
+
+  TMSG(PAPI, "looking for sync read for component idx=%d(%s)", cidx, name);
+  for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
+    if (item->pred(name)) return item->read;
+  }
+  return NULL;
+}
+
+
stop_proc_t
sync_stop_for_component(int cidx)
{
@@ -130,7 +134,7 @@ sync_stop_for_component(int cidx)
TMSG(PAPI, "looking for sync stop for component idx=%d(%s)", cidx, name);
for(sync_info_list_t* item=registered_sync_components; item; item = item->next) {
- if (item->pred(name)) return item->sync_stop;
+ if (item->pred(name)) return item->stop;
}
- return no_action;
+ return NULL;
}
diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
index 1636a3f631..eb83b101dc 100644
--- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
+++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h
@@ -1,13 +1,14 @@
#ifndef PAPI_C_EXTENDED_INFO_H
#define PAPI_C_EXTENDED_INFO_H
-typedef void (*get_event_set_proc_t)(int* ev_s);
-typedef int (*add_event_proc_t)(int ev_s, int evcode);
+typedef void (*get_event_set_proc_t)(int* event_set);
+typedef void (*add_event_proc_t)(int event_set, int evcode);
typedef void (*finalize_event_set_proc_t)(void);
typedef void (*setup_proc_t)(void);
typedef void (*teardown_proc_t)(void);
typedef void (*start_proc_t)(void);
-typedef void (*stop_proc_t)(void);
+typedef void (*read_proc_t)(long long *values);
+typedef void (*stop_proc_t)(long long *values);
typedef bool (*pred_proc_t)(const char* name);
typedef struct sync_info_list_t {
@@ -15,14 +16,17 @@ typedef struct sync_info_list_t {
const get_event_set_proc_t get_event_set;
const add_event_proc_t add_event;
const finalize_event_set_proc_t finalize_event_set;
- const setup_proc_t sync_setup;
- const teardown_proc_t sync_teardown;
- const start_proc_t sync_start;
- const stop_proc_t sync_stop;
+ const bool is_gpu_sync;
+ const setup_proc_t setup;
+ const teardown_proc_t teardown;
+ const start_proc_t start;
+ const read_proc_t read;
+ const stop_proc_t stop;
const bool process_only;
struct sync_info_list_t* next;
} sync_info_list_t;
+extern const char* component_get_name(int cidx);
extern bool component_uses_sync_samples(int cidx);
extern get_event_set_proc_t component_get_event_set(int cidx);
extern add_event_proc_t component_add_event_proc(int cidx);
@@ -30,6 +34,7 @@ extern finalize_event_set_proc_t component_finalize_event_set(int cidx);
extern setup_proc_t sync_setup_for_component(int cidx);
extern teardown_proc_t sync_teardown_for_component(int cidx);
extern start_proc_t sync_start_for_component(int cidx);
+extern read_proc_t sync_read_for_component(int cidx);
extern stop_proc_t sync_stop_for_component(int cidx);
extern void papi_c_sync_register(sync_info_list_t* info);
diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
new file mode 100644
index 0000000000..0aca13a1e1
--- /dev/null
+++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c
@@ -0,0 +1,201 @@
+// -*-Mode: C++;-*- // technically C99
+
+// * BeginRiceCopyright *****************************************************
+//
+// --------------------------------------------------------------------------
+// Part of HPCToolkit (hpctoolkit.org)
+//
+// Information about sources of support for research and development of
+// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
+// --------------------------------------------------------------------------
+//
+// Copyright ((c)) 2002-2020, Rice University
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Rice University (RICE) nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// This software is provided by RICE and contributors "as is" and any
+// express or implied warranties, including, but not limited to, the
+// implied warranties of merchantability and fitness for a particular
+// purpose are disclaimed. In no event shall RICE or contributors be
+// liable for any direct, indirect, incidental, special, exemplary, or
+// consequential damages (including, but not limited to, procurement of
+// substitute goods or services; loss of use, data, or profits; or
+// business interruption) however caused and on any theory of liability,
+// whether in contract, strict liability, or tort (including negligence
+// or otherwise) arising in any way out of the use of this software, even
+// if advised of the possibility of such damage.
+//
+// ******************************************************* EndRiceCopyright *
+
+//***************************************************************************
+//
+// File:
+//   papi-c-rocm.c
+//
+// Purpose:
+//   PAPI-C synchronous sample-source support for the PAPI rocm component (AMD ROCm GPUs)
+//
+//***************************************************************************
+
+//***************************************************************************
+// system includes
+//***************************************************************************
+
+#include
+#include
+
+
+
+//***************************************************************************
+// local includes
+//***************************************************************************
+
+#include
+#include
+#include "papi-c.h"
+#include "papi-c-extended-info.h"
+#include
+
+
+//******************************************************************************
+// static data
+//******************************************************************************
+
+static __thread bool event_set_created = false;
+static __thread bool event_set_finalized = false;
+static __thread int my_event_set = PAPI_NULL;
+
+
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+// Do-nothing hook for the callbacks this component has no work for
+// (used as the `setup` and `teardown` entries of rocm_component).
+static void
+papi_c_no_action(void)
+{
+}
+
+
+// Predicate: true when the PAPI component name `name` begins with "rocm".
+static bool
+is_papi_c_rocm(const char* name)
+{
+  static const char prefix[] = "rocm";
+
+  return strncmp(name, prefix, sizeof(prefix) - 1) == 0;
+}
+
+
+// Get or create the calling thread's rocm event set.
+//
+// The event set is created with PAPI_create_eventset on the first call made
+// by a thread (a creation failure is reported through hpcrun_abort); later
+// calls reuse it.  On return *event_set always holds the thread's event set
+// id, so a repeated call never leaves the caller's variable unassigned.
+static void
+papi_c_rocm_get_event_set(int* event_set)
+{
+  TMSG(ROCM, "Get event set");
+  if (! event_set_created) {
+    TMSG(ROCM, "No event set created, so create one");
+    int ret = PAPI_create_eventset(&my_event_set);
+    if (ret != PAPI_OK) {
+      hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
+                   ret, PAPI_strerror(ret));
+    }
+    event_set_created = true;
+    TMSG(ROCM, "Event set %d created", my_event_set);
+  }
+
+  // Report the event set on every call, not only on the creating one.
+  *event_set = my_event_set;
+}
+
+
+// Append `evcode` to the calling thread's rocm event set.
+//
+// `event_set` must equal my_event_set (checked with assert).  Once the set
+// has been finalized the request is silently ignored; a PAPI_add_event
+// failure is reported through hpcrun_abort.
+void
+papi_c_rocm_add_event(int event_set, int evcode)
+{
+  assert(event_set == my_event_set);
+
+  if (event_set_finalized) {
+    return;
+  }
+
+  TMSG(ROCM, "Adding event %x to rocm event set", evcode);
+  const int status = PAPI_add_event(my_event_set, evcode);
+  if (status != PAPI_OK) {
+    hpcrun_abort("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
+                 PAPI_strerror(status), status);
+  }
+  TMSG(ROCM, "Added event %d, to rocm event set %d", evcode, my_event_set);
+}
+
+// Freeze the calling thread's rocm event set: once the flag is set,
+// papi_c_rocm_add_event ignores further requests.
+void
+papi_c_rocm_finalize_event_set(void)
+{
+  event_set_finalized = true;
+}
+
+
+// Start counting on the calling thread's rocm event set (my_event_set).
+// Any result other than PAPI_OK from PAPI_start is reported through
+// hpcrun_abort.
+void
+papi_c_rocm_start(void)
+{
+  int ret = PAPI_start(my_event_set);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_start of event set %d failed with %s (%d)",
+                 my_event_set, PAPI_strerror(ret), ret);
+  }
+}
+
+
+// Read the counters of the calling thread's rocm event set into `values`.
+// hip_dev_sync() is called before PAPI_read; a result other than PAPI_OK
+// is reported through hpcrun_abort.
+void
+papi_c_rocm_read(long long *values)
+{
+  // TODO(Dejan): check whether this device sync before reading is needed.
+  hip_dev_sync();
+
+  const int status = PAPI_read(my_event_set, values);
+  if (status == PAPI_OK) {
+    return;
+  }
+
+  hpcrun_abort("PAPI_read of event set %d failed with %s (%d)",
+               my_event_set, PAPI_strerror(status), status);
+}
+
+
+// Stop the calling thread's rocm event set with PAPI_stop, passing `values`
+// as its output buffer.  A result other than PAPI_OK is reported through
+// hpcrun_abort.
+void
+papi_c_rocm_stop(long long *values)
+{
+  const int status = PAPI_stop(my_event_set, values);
+  if (status == PAPI_OK) {
+    return;
+  }
+
+  hpcrun_abort("PAPI_stop of event set %d failed with %s (%d)",
+               my_event_set, PAPI_strerror(status), status);
+}
+
+
+// Descriptor for the PAPI rocm component, handed to papi_c_sync_register.
+// It is selected for PAPI components whose name satisfies is_papi_c_rocm.
+static sync_info_list_t rocm_component = {
+  // component selection
+  .pred               = is_papi_c_rocm,
+
+  // event set construction
+  .get_event_set      = papi_c_rocm_get_event_set,
+  .add_event          = papi_c_rocm_add_event,
+  .finalize_event_set = papi_c_rocm_finalize_event_set,
+
+  // flagged as a synchronously sampled GPU component
+  .is_gpu_sync        = true,
+
+  // lifecycle hooks; setup and teardown need no work
+  .setup              = papi_c_no_action,
+  .teardown           = papi_c_no_action,
+  .start              = papi_c_rocm_start,
+  .read               = papi_c_rocm_read,
+  .stop               = papi_c_rocm_stop,
+
+  .process_only       = true,
+  .next               = NULL,
+};
+
+
+// Registration hook declared through SS_OBJ_CONSTRUCTOR: adds rocm_component
+// to the list of registered PAPI-C sync components.
+void
+SS_OBJ_CONSTRUCTOR(papi_c_rocm)(void)
+{
+  papi_c_sync_register(&rocm_component);
+}
diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c
index 99cf9c5fbb..dc522f3f7c 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.c
+++ b/src/tool/hpcrun/sample-sources/papi-c.c
@@ -87,6 +87,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -100,22 +101,31 @@
#include
#include
+#include "papi-c.h"
+#include "tool_state.h"
/******************************************************************************
* macros
*****************************************************************************/
+#define DEBUG 0
+
+#include
+#include
+
#define OVERFLOW_MODE 0
#define WEIGHT_METRIC 0
#define DEFAULT_THRESHOLD 2000000L
-#include "papi-c.h"
/******************************************************************************
* forward declarations
*****************************************************************************/
static void papi_event_handler(int event_set, void *pc, long long ovec, void *context);
+static void papi_monitor_enter(papi_component_info_t *ci, cct_node_t *cct_node);
+static void papi_monitor_exit(papi_component_info_t *ci);
+
static int event_is_derived(int ev_code);
static void event_fatal_error(int ev_code, int papi_ret);
@@ -123,6 +133,10 @@ static void event_fatal_error(int ev_code, int papi_ret);
* local variables
*****************************************************************************/
+// Support for derived events (proxy sampling).
+static int derived[MAX_EVENTS];
+static int some_overflow;
+
// Special case to make PAPI_library_init() a soft failure.
// Make sure that we call no other PAPI functions.
@@ -145,6 +159,7 @@ static uint64_t hpcrun_cycles_cmd_period = 0;
* private operations
*****************************************************************************/
+
static int
get_event_index(sample_source_t *self, int event_code)
{
@@ -157,18 +172,24 @@ get_event_index(sample_source_t *self, int event_code)
assert(0);
}
-//
-// fetch a given component's event set. Create one if need be
-//
-int
-get_component_event_set(papi_source_info_t* psi, int cidx)
+
+static int
+evcode_to_component_id(papi_source_info_t* psi, int evcode)
{
- if (cidx < 0 || cidx >= psi->num_components) {
+ int cidx = PAPI_get_event_component(evcode);
+ if (cidx < 0 || cidx >= psi->num_components) {
hpcrun_abort("PAPI component index out of range [0,%d]: %d", psi->num_components, cidx);
- }
+ }
+ return cidx;
+}
- papi_component_info_t* ci = &(psi->component_info[cidx]);
+//
+// fetch a given component's event set. Create one if need be
+//
+int
+get_component_event_set(papi_component_info_t* ci)
+{
if (!ci->inUse) {
ci->get_event_set(&(ci->eventSet));
ci->inUse = true;
@@ -176,17 +197,110 @@ get_component_event_set(papi_source_info_t* psi, int cidx)
return ci->eventSet;
}
+
//
// add an event to a component's event set
//
-int
-component_add_event(papi_source_info_t* psi, int cidx, int evcode)
+void
+component_add_event(papi_source_info_t* psi, int evcode)
{
- int event_set = get_component_event_set(psi, cidx);
+ int cidx = evcode_to_component_id(psi, evcode);
papi_component_info_t* ci = &(psi->component_info[cidx]);
- return ci->add_event(event_set, evcode);
+ int event_set = get_component_event_set(ci);
+
+ ci->add_event(event_set, evcode);
+ ci->some_derived |= event_is_derived(evcode);
+
+ TMSG(PAPI, "Added event code %x to component %d", evcode, cidx);
+  {
+    // Trace the event just added, including its symbolic PAPI name.
+    // The buffer starts out empty so that the trace prints "" rather than
+    // uninitialized stack bytes if PAPI_event_code_to_name does not fill it.
+    char buffer[PAPI_MAX_STR_LEN] = "";
+    PAPI_event_code_to_name(evcode, buffer);
+    TMSG(PAPI,
+         "PAPI_add_event(eventSet=%d, event_code=%x (event name %s)) component=%d",
+         event_set, evcode, buffer, cidx);
+  }
+}
+
+
+// Add every event in `evl` to the event set of the component it belongs to
+// (via component_add_event), then call finalize_event_set on each of the
+// psi->num_components component entries.
+static void
+papi_register_events(papi_source_info_t *psi, evlist_t evl)
+{
+  // add events to their component event sets
+  for (int ev = 0; ev < evl.nevents; ev++) {
+    component_add_event(psi, evl.events[ev].event);
+  }
+
+  // finalize component event sets
+  for (int c = 0; c < psi->num_components; c++) {
+    psi->component_info[c].finalize_event_set();
+  }
+}
+
+
+// Register the PAPI enter/exit monitoring callbacks for the synchronous
+// component `ci` with the GPU monitor.
+static void
+papi_register_sync_callback(papi_component_info_t *ci)
+{
+  // Designated initializer: members of gpu_monitor_node_t that are not named
+  // here are zero-initialized instead of holding indeterminate stack values
+  // when the node is passed by value.
+  gpu_monitor_node_t node = {
+    .ci       = ci,
+    .enter_fn = papi_monitor_enter,
+    .exit_fn  = papi_monitor_exit,
+  };
+  gpu_monitor_register(node);
}
+
+// Arm PAPI overflow sampling for `evcode` in `eventSet` with threshold
+// `thresh`, using papi_event_handler as the overflow handler.
+// If PAPI_overflow does not return PAPI_OK, the failure is logged and
+// event_fatal_error is called for the event.
+static void
+papi_register_overflow_callback(int eventSet, int evcode, long thresh)
+{
+  // thresh is a long, so it is formatted with %ld
+  TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%ld) register",
+       eventSet, evcode, thresh);
+
+  // NOTE(review): thresh is passed on as-is; confirm it fits the threshold
+  // parameter type declared by PAPI_overflow.
+  int ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE, papi_event_handler);
+  if (ret != PAPI_OK) {
+    EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
+         PAPI_strerror(ret), ret);
+    event_fatal_error(evcode, ret);
+  }
+}
+
+
+// Install the sampling callback for every event in `evl`.
+//
+// Each event is mapped to its PAPI component.  Events of components flagged
+// is_gpu_sync get the synchronous enter/exit callbacks; all other events get
+// a PAPI overflow callback, unless the event is marked in derived[].
+static void
+papi_register_callbacks(papi_source_info_t *psi, evlist_t evl)
+{
+  int i;
+  // set up overflow handling for asynchronous event sets for active components
+  // set up synchronous handling for synchronous event sets for active components
+  for (i = 0; i < evl.nevents; i++) {
+
+    int evcode = evl.events[i].event;
+    long thresh = evl.events[i].thresh;
+    int cidx = evcode_to_component_id(psi, evcode);
+    papi_component_info_t *ci = &(psi->component_info[cidx]);
+    int eventSet = get_component_event_set(ci);
+
+    // **** No overflow for synchronous events ****
+    if (ci->is_gpu_sync) {
+      TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx);
+      TMSG(PAPI, "Set up papi_monitor_apply instead");
+      TMSG(PAPI, "synchronous sample component index = %d", cidx);
+
+      // NOTE(review): this runs once per event, so a component with several
+      // events is registered several times -- confirm gpu_monitor_register
+      // tolerates repeated registration of the same component.
+      papi_register_sync_callback(ci);
+    }
+    else{
+      // derived[] is the file-level table indexed by event position;
+      // presumably filled in while events are processed -- TODO confirm.
+      if (! derived[i]) { // ***** Only set overflow if NOT derived event *****
+        papi_register_overflow_callback(eventSet, evcode, thresh);
+      }
+    }
+  }
+
+}
+
+
static bool
thread_count_scaling_for_component(int cidx)
{
@@ -196,14 +310,6 @@ thread_count_scaling_for_component(int cidx)
}
-/******************************************************************************
- * sample source registration
- *****************************************************************************/
-
-// Support for derived events (proxy sampling).
-static int derived[MAX_EVENTS];
-static int some_overflow;
-
/******************************************************************************
* method functions
*****************************************************************************/
@@ -225,15 +331,18 @@ strip_papi_prefix(const char *str)
return str;
}
+
static void
METHOD_FN(init)
{
+ tool_enter();
// PAPI_set_debug(0x3ff);
// **NOTE: some papi components may start threads, so
// hpcrun must ignore these threads to ensure that PAPI_library_init
// succeeds
//
+
monitor_disable_new_threads();
if (disable_papi_cuda) {
TMSG(PAPI_C, "Will disable PAPI cuda component (if component is active)");
@@ -241,10 +350,10 @@ METHOD_FN(init)
if (cidx) {
int res = PAPI_disable_component(cidx);
if (res == PAPI_OK) {
- TMSG(PAPI, "PAPI cuda component disabled");
+ TMSG(PAPI, "PAPI cuda component disabled");
}
else {
- EMSG("*** PAPI cuda component could not be disabled!!!");
+ EMSG("*** PAPI cuda component could not be disabled!!!");
}
}
}
@@ -278,13 +387,15 @@ METHOD_FN(init)
}
self->state = INIT;
+ tool_exit();
}
static void
METHOD_FN(thread_init)
{
+ tool_enter();
TMSG(PAPI, "thread init");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int retval = PAPI_thread_init(pthread_self);
if (retval != PAPI_OK) {
@@ -292,13 +403,17 @@ METHOD_FN(thread_init)
monitor_real_abort();
}
TMSG(PAPI, "thread init OK");
+
+finish:
+ tool_exit();
}
static void
METHOD_FN(thread_init_action)
{
+ tool_enter();
TMSG(PAPI, "register thread");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int retval = PAPI_register_thread();
if (retval != PAPI_OK) {
@@ -306,16 +421,20 @@ METHOD_FN(thread_init_action)
monitor_real_abort();
}
TMSG(PAPI, "register thread ok");
+
+finish:
+ tool_exit();
}
static void
METHOD_FN(start)
{
+ tool_enter();
int cidx;
TMSG(PAPI, "start");
- if (papi_unavail) {
- return;
+ if (papi_unavail) {
+ goto finish;
}
thread_data_t* td = hpcrun_get_thread_data();
@@ -327,7 +446,7 @@ METHOD_FN(start)
if (my_state == START) {
TMSG(PAPI,"*NOTE* PAPI start called when already in state START");
- return;
+ goto finish;
}
// for each active component, start its event set
@@ -336,54 +455,64 @@ METHOD_FN(start)
papi_component_info_t* ci = &(psi->component_info[cidx]);
if (ci->inUse) {
if (component_uses_sync_samples(cidx)) {
- TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx);
- ci->sync_start();
+ TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx);
+ ci->start();
}
- else { //use async start
- TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx);
- int ret = PAPI_start(ci->eventSet);
- if (ret == PAPI_EISRUN) {
- // this case should not happen, but maybe it's not fatal
- EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx);
- }
- else if (ret != PAPI_OK) {
- EMSG("PAPI_start failed with %s (%d) for event set %d component %d ",
- PAPI_strerror(ret), ret, ci->eventSet, cidx);
- hpcrun_ssfail_start("PAPI");
- }
-
- if (ci->some_derived) {
- ret = PAPI_read(ci->eventSet, ci->prev_values);
- if (ret != PAPI_OK) {
- EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
- ci->eventSet, cidx, PAPI_strerror(ret), ret);
- }
- }
+ else {
+ TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx);
+ int ret = PAPI_start(ci->eventSet);
+ if (ret == PAPI_EISRUN) {
+ // this case should not happen, but maybe it's not fatal
+ EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx);
+ }
+ else if (ret != PAPI_OK) {
+ EMSG("PAPI_start failed with %s (%d) for event set %d component %d ",
+ PAPI_strerror(ret), ret, ci->eventSet, cidx);
+ hpcrun_ssfail_start("PAPI");
+ }
+
+ if (ci->some_derived) {
+ ret = PAPI_read(ci->eventSet, ci->prev_values);
+ if (ret != PAPI_OK) {
+ EMSG("PAPI_read of event set %d for component %d failed with %s (%d)",
+ ci->eventSet, cidx, PAPI_strerror(ret), ret);
+ }
+ }
+
}
}
}
td->ss_state[self->sel_idx] = START;
+
+finish:
+ tool_exit();
}
static void
METHOD_FN(thread_fini_action)
{
+ tool_enter();
TMSG(PAPI, "unregister thread");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int retval = PAPI_unregister_thread();
char msg[] = "!!NOT PAPI_OK!! (code = -9999999)\n";
snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval);
TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg);
+finish:
+ tool_exit();
}
+
static void
METHOD_FN(stop)
{
+ tool_enter();
+
int cidx;
TMSG(PAPI, "stop");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
thread_data_t *td = hpcrun_get_thread_data();
int nevents = self->evl.nevents;
@@ -391,12 +520,12 @@ METHOD_FN(stop)
if (my_state == STOP) {
TMSG(PAPI,"*NOTE* PAPI stop called when already in state STOP");
- return;
+ goto finish;
}
if (my_state != START) {
TMSG(PAPI,"*WARNING* PAPI stop called when not in state START");
- return;
+ goto finish;
}
papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr;
@@ -404,36 +533,46 @@ METHOD_FN(stop)
papi_component_info_t *ci = &(psi->component_info[cidx]);
if (ci->inUse) {
if (component_uses_sync_samples(cidx)) {
- TMSG(PAPI, "component %d is synchronous, stop is trivial", cidx);
+ TMSG(PAPI, "component %d is synchronous, stop is trivial", cidx);
}
else {
- TMSG(PAPI,"stop w event set = %d", ci->eventSet);
- long_long values[nevents+2];
- // long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
- int ret = PAPI_stop(ci->eventSet, values);
- if (ret != PAPI_OK){
- EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
- ci->eventSet, ret, PAPI_strerror(ret));
- }
+ TMSG(PAPI,"stop w event set = %d", ci->eventSet);
+ long_long values[nevents+2];
+ // long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2));
+
+ int ret = PAPI_stop(ci->eventSet, values);
+ if (ret != PAPI_OK) {
+ EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s",
+ ci->eventSet, ret, PAPI_strerror(ret));
+ }
+
}
}
}
TD_GET(ss_state)[self->sel_idx] = STOP;
+finish:
+ tool_exit();
}
+
static void
METHOD_FN(shutdown)
{
+ tool_enter();
TMSG(PAPI, "shutdown");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
- METHOD_CALL(self, stop); // make sure stop has been called
+ do{
+ METHOD_CALL(self, stop); // make sure stop has been called
+ }while(0);
// FIXME: add component shutdown code here
PAPI_shutdown();
self->state = UNINIT;
+finish:
+ tool_exit();
}
// Return true if PAPI recognizes the name, whether supported or not.
@@ -441,15 +580,17 @@ METHOD_FN(shutdown)
static bool
METHOD_FN(supports_event, const char *ev_str)
{
+ tool_enter();
+ bool ret;
ev_str = strip_papi_prefix(ev_str);
-
+
TMSG(PAPI, "supports event");
- if (papi_unavail) { return false; }
+ if (papi_unavail) { ret = false; goto finish;}
if (self->state == UNINIT){
METHOD_CALL(self, init);
}
-
+
char evtmp[1024];
int ec;
long th;
@@ -460,15 +601,20 @@ METHOD_FN(supports_event, const char *ev_str)
if (is_event_to_exclude(evtmp)) {
- return false;
+ ret = false; goto finish; // excluded events must also leave through tool_exit()
}
+
+ ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK);
- return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK;
+finish:
+ tool_exit();
+ return ret;
}
-
+
static void
METHOD_FN(process_event_list, int lush_metrics)
{
+ tool_enter();
TMSG(PAPI, "process event list");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
char *event;
int i, ret;
@@ -490,7 +636,7 @@ METHOD_FN(process_event_list, int lush_metrics)
int period_type = hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh, DEFAULT_THRESHOLD);
if (!period_type) {
AMSG("WARNING: %s using default threshold %ld, "
- "better to use an explicit threshold.", name, DEFAULT_THRESHOLD);
+ "better to use an explicit threshold.", name, DEFAULT_THRESHOLD);
}
#else
int period_type = hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh, DEFAULT_THRESHOLD);
@@ -498,8 +644,8 @@ METHOD_FN(process_event_list, int lush_metrics)
ret = PAPI_event_name_to_code(name, &evcode);
if (ret != PAPI_OK) {
EMSG("unexpected failure in PAPI process_event_list(): "
- "PAPI_event_name_to_code() returned %s (%d)",
- PAPI_strerror(ret), ret);
+ "PAPI_event_name_to_code() returned %s (%d)",
+ PAPI_strerror(ret), ret);
hpcrun_ssfail_unsupported("PAPI", name);
}
if (PAPI_query_event(evcode) != PAPI_OK) {
@@ -549,7 +695,7 @@ METHOD_FN(process_event_list, int lush_metrics)
// supports hardware overflow. use threshold = 0 to force proxy
// sampling (for testing).
if (event_is_derived(self->evl.events[i].event)
- || self->evl.events[i].thresh == 0) {
+ || self->evl.events[i].thresh == 0) {
TMSG(PAPI, "using proxy sampling for event %s", buffer);
strcat(buffer, " (proxy)");
self->evl.events[i].thresh = 1;
@@ -571,10 +717,11 @@ METHOD_FN(process_event_list, int lush_metrics)
if (component_uses_sync_samples(cidx))
TMSG(PAPI, "Event %s from synchronous component", buffer);
+
int metric_id = /* weight */
hpcrun_set_new_metric_info_and_period(papi_kind, strdup(buffer),
- MetricFlags_ValFmt_Int,
- threshold, prop);
+ MetricFlags_ValFmt_Int,
+ threshold, prop);
METHOD_CALL(self, store_metric_id, i, metric_id);
if (isCycles) {
hpcrun_cycles_metric_id = metric_id;
@@ -585,9 +732,9 @@ METHOD_FN(process_event_list, int lush_metrics)
if (num_lush_metrics > 0 && strcmp(buffer, "PAPI_TOT_CYC") == 0) {
// there should be one lush metric; its source is the last event
int mid_idleness =
- hpcrun_set_new_metric_info_and_period(papi_kind, "idleness",
- MetricFlags_ValFmt_Real,
- self->evl.events[i].thresh, prop);
+ hpcrun_set_new_metric_info_and_period(papi_kind, "idleness",
+ MetricFlags_ValFmt_Real,
+ self->evl.events[i].thresh, prop);
assert(num_lush_metrics == 1 && (i == (nevents - 1)));
lush_agents->metric_time = metric_id;
lush_agents->metric_idleness = mid_idleness;
@@ -599,6 +746,9 @@ METHOD_FN(process_event_list, int lush_metrics)
if (! some_overflow) {
hpcrun_ssfail_all_derived("PAPI");
}
+
+finish:
+ tool_exit();
}
static void
@@ -609,16 +759,16 @@ METHOD_FN(finalize_event_list)
static void
METHOD_FN(gen_event_set, int lush_metrics)
{
+ tool_enter();
thread_data_t *td = hpcrun_get_thread_data();
int i;
- int ret;
TMSG(PAPI, "generating all event sets for all components");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int num_components = PAPI_num_components();
- int ss_info_size = sizeof(papi_source_info_t) +
- num_components * sizeof(papi_component_info_t);
+ int ss_info_size = sizeof(papi_source_info_t) +
+ num_components * sizeof(papi_component_info_t);
TMSG(PAPI, "Num components = %d", num_components);
papi_source_info_t* psi = hpcrun_malloc(ss_info_size);
@@ -630,7 +780,8 @@ METHOD_FN(gen_event_set, int lush_metrics)
psi->num_components = num_components;
for (i = 0; i < num_components; i++) {
papi_component_info_t *ci = &(psi->component_info[i]);
- ci->inUse = false;
+ ci->name = component_get_name(i);
+ ci->inUse = false;
ci->eventSet = PAPI_NULL;
ci->state = INIT;
ci->some_derived = 0;
@@ -638,98 +789,48 @@ METHOD_FN(gen_event_set, int lush_metrics)
ci->add_event = component_add_event_proc(i);
ci->finalize_event_set = component_finalize_event_set(i);
ci->scale_by_thread_count = thread_count_scaling_for_component(i);
- ci->is_sync = component_uses_sync_samples(i);
- ci->sync_setup = sync_setup_for_component(i);
- ci->sync_teardown = sync_teardown_for_component(i);
- ci->sync_start = sync_start_for_component(i);
- ci->sync_stop = sync_stop_for_component(i);
+ ci->is_gpu_sync = component_uses_sync_samples(i);
+ ci->setup = sync_setup_for_component(i);
+ ci->teardown = sync_teardown_for_component(i);
+ ci->start = sync_start_for_component(i);
+ ci->read = sync_read_for_component(i);
+ ci->stop = sync_stop_for_component(i);
memset(ci->prev_values, 0, sizeof(ci->prev_values));
}
// record the component state in thread state
td->ss_info[self->sel_idx].ptr = psi;
- int nevents = (self->evl).nevents;
- for (i = 0; i < nevents; i++) {
- int evcode = self->evl.events[i].event;
- int cidx = PAPI_get_event_component(evcode);
-
- ret = component_add_event(psi, cidx, evcode);
- psi->component_info[cidx].some_derived |= event_is_derived(evcode);
- TMSG(PAPI, "Added event code %x to component %d", evcode, cidx);
- {
- char buffer[PAPI_MAX_STR_LEN];
- PAPI_event_code_to_name(evcode, buffer);
- TMSG(PAPI,
- "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d",
- /* eventSet, */ evcode, buffer, cidx);
- }
- if (ret != PAPI_OK) {
- EMSG("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)",
- PAPI_strerror(ret), ret);
- event_fatal_error(evcode, ret);
- }
- }
+ papi_register_events(psi, self->evl);
- // finalize component event sets
- for (i = 0; i < num_components; i++) {
- papi_component_info_t *ci = &(psi->component_info[i]);
- ci->finalize_event_set();
- }
+ papi_register_callbacks(psi, self->evl);
- // set up overflow handling for asynchronous event sets for active components
- // set up synchronous handling for synchronous event sets for active compoents
- for (i = 0; i < nevents; i++) {
- int evcode = self->evl.events[i].event;
- long thresh = self->evl.events[i].thresh;
- int cidx = PAPI_get_event_component(evcode);
- int eventSet = get_component_event_set(psi, cidx);
-
- // **** No overflow for synchronous events ****
- // **** Use component-specific setup for synchronous events ****
- if (component_uses_sync_samples(cidx)) {
- TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx);
- TMSG(PAPI, "Set up sync handler instead");
- TMSG(PAPI, "synchronous sample component index = %d", cidx);
- sync_setup_for_component(cidx)();
- continue;
- }
- // ***** Only set overflow if NOT derived event *****
- if (! derived[i]) {
- ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE,
- papi_event_handler);
- TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d",
- eventSet, evcode, thresh, ret);
- if (ret != PAPI_OK) {
- EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)",
- PAPI_strerror(ret), ret);
- event_fatal_error(evcode, ret);
- }
- }
- }
+finish:
+ tool_exit();
}
static void
METHOD_FN(display_events)
{
+ tool_enter();
PAPI_event_info_t info;
int ev, ret, num_total, num_prof;
int num_components, cidx;
if (papi_unavail) {
- printf("PAPI is not available. Probably, the kernel doesn't support PAPI,\n"
- "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
- return;
+ PRINT("PAPI is not available. Probably, the kernel doesn't support PAPI,\n"
+ "or else maybe HPCToolkit is out of sync with PAPI.\n\n");
+ goto finish;
}
cidx = 0; // CPU component
{
const PAPI_component_info_t *component = PAPI_get_component_info(cidx);
- printf("===========================================================================\n");
- printf("Available PAPI preset events in component %s\n", component->name);
- printf("\n");
- printf("Name\t Profilable\tDescription\n");
- printf("===========================================================================\n");
+ PRINT("===========================================================================\n");
+ PRINT("Available PAPI preset events in component %s\n", component->name);
+ PRINT("\n");
+ PRINT("Name\t Profilable\tDescription\n");
+ PRINT("===========================================================================\n");
num_total = 0;
num_prof = 0;
@@ -739,53 +840,55 @@ METHOD_FN(display_events)
char *prof;
memset(&info, 0, sizeof(info));
if (PAPI_get_event_info(ev, &info) == PAPI_OK && info.count != 0) {
- if (event_is_derived(ev)) {
- prof = "No";
- } else {
- prof = "Yes";
- num_prof++;
- }
- num_total++;
- printf("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr);
+ if (event_is_derived(ev)) {
+ prof = "No";
+ } else {
+ prof = "Yes";
+ num_prof++;
+ }
+ num_total++;
+ PRINT("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr);
}
ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx);
}
- printf("---------------------------------------------------------------------------\n");
- printf("Total PAPI events: %d, able to profile: %d\n", num_total, num_prof);
- printf("\n\n");
+ PRINT("---------------------------------------------------------------------------\n");
+ PRINT("Total PAPI events: %d, able to profile: %d\n", num_total, num_prof);
+ PRINT("\n\n");
}
- num_components = PAPI_num_components();
+ num_components = PAPI_num_components();
for(cidx = 0; cidx < num_components; cidx++) {
const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
int cmp_event_count = 0;
if (component->disabled) continue;
- printf("===========================================================================\n");
- printf("Native events in component %s\n", component->name);
- printf("\n");
- printf("Name Description\n");
- printf("===========================================================================\n");
-
+ PRINT("===========================================================================\n");
+ PRINT("Native events in component %s\n", component->name);
+ PRINT("\n");
+ PRINT("Name Description\n");
+ PRINT("===========================================================================\n");
+
ev = 0 | PAPI_NATIVE_MASK;
ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_FIRST, cidx);
while (ret == PAPI_OK) {
memset(&info, 0, sizeof(info));
if (PAPI_get_event_info(ev, &info) == PAPI_OK) {
- cmp_event_count++;
+ cmp_event_count++;
display_event_info(stdout, info.symbol, info.long_descr);
- printf("---------------------------------------------------------------------------\n");
+ PRINT("---------------------------------------------------------------------------\n");
}
ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx);
}
- printf("Total native events for component %s: %d\n", component->name, cmp_event_count);
- printf("\n\n");
+ PRINT("Total native events for component %s: %d\n", component->name, cmp_event_count);
+ PRINT("\n\n");
num_total += cmp_event_count;
}
- printf( "Total events reported: %d\n", num_total);
- printf("\n\n");
+ PRINT( "Total events reported: %d\n", num_total);
+ PRINT("\n\n");
+finish:
+ tool_exit();
}
@@ -800,16 +903,18 @@ METHOD_FN(display_events)
#include "ss_obj.h"
// **************************************************************************
-// * public operations
+// * public operations
// **************************************************************************
void
hpcrun_disable_papi_cuda(void)
{
+ tool_enter();
disable_papi_cuda = true;
+ tool_exit();
}
/******************************************************************************
- * private operations
+ * private operations
*****************************************************************************/
// Returns: 1 if the event code is a derived event.
@@ -817,26 +922,35 @@ hpcrun_disable_papi_cuda(void)
static int
event_is_derived(int ev_code)
{
+ tool_enter();
+ int ret;
PAPI_event_info_t info;
// "Is derived" is kind of a bad thing, so if any unexpected failure
// occurs, we'll return the "bad" answer.
if (PAPI_get_event_info(ev_code, &info) != PAPI_OK
|| info.derived == NULL) {
- return 1;
+ ret = 1;
+ goto finish;
}
if (info.count == 1
|| strlen(info.derived) == 0
|| strcmp(info.derived, "NOT_DERIVED") == 0
|| strcmp(info.derived, "DERIVED_CMPD") == 0) {
- return 0;
+ ret = 0;
+ goto finish;
}
- return 1;
+ ret = 1;
+
+finish:
+ tool_exit();
+ return ret;
}
static void
event_fatal_error(int ev_code, int papi_ret)
{
+ tool_enter();
char name[1024];
PAPI_event_code_to_name(ev_code, name);
@@ -850,36 +964,39 @@ event_fatal_error(int ev_code, int papi_ret)
hpcrun_ssfail_conflict("PAPI", name);
}
hpcrun_ssfail_unsupported("PAPI", name);
+
+ tool_exit();
}
static void
papi_event_handler(int event_set, void *pc, long long ovec,
void *context)
{
+ tool_enter();
sample_source_t *self = &obj_name();
long long values[MAX_EVENTS];
int my_events[MAX_EVENTS];
- int my_event_count = MAX_EVENTS;
+ int my_events_number = MAX_EVENTS;
int nevents = self->evl.nevents;
int i, ret;
- int my_event_codes[MAX_EVENTS];
- int my_event_codes_count = MAX_EVENTS;
+ int my_events_code[MAX_EVENTS];
+ int my_events_code_count = MAX_EVENTS;
// if sampling disabled explicitly for this thread, skip all processing
- if (hpcrun_suppress_sample() || sample_filters_apply()) return;
+ if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish;
if (!ovec) {
TMSG(PAPI_SAMPLE, "papi overflow event: event set %d ovec = %ld",
- event_set, ovec);
- return;
+ event_set, ovec);
+ goto finish;
}
// If the interrupt came from inside our code, then drop the sample
// and return and avoid any MSG.
if (! hpcrun_safe_enter_async(pc)) {
hpcrun_stats_num_samples_blocked_async_inc();
- return;
+ goto finish;
}
int cidx = PAPI_get_eventset_component(event_set);
@@ -894,42 +1011,42 @@ papi_event_handler(int event_set, void *pc, long long ovec,
}
}
- ret = PAPI_get_overflow_event_index(event_set, ovec, my_events,
- &my_event_count);
+ ret = PAPI_get_overflow_event_index(event_set, ovec, my_events,
+ &my_events_number);
if (ret != PAPI_OK) {
TMSG(PAPI_SAMPLE, "papi_event_handler: event set %d ovec %ld "
- "get_overflow_event_index return code = %d ==> %s",
- event_set, ovec, ret, PAPI_strerror(ret));
+ "get_overflow_event_index return code = %d ==> %s",
+ event_set, ovec, ret, PAPI_strerror(ret));
#ifdef DEBUG_PAPI_OVERFLOW
- ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
+ ret = PAPI_list_events(event_set, my_events_code, &my_events_code_count);
if (ret != PAPI_OK) {
TMSG(PAPI_SAMPLE, "PAPI_list_events failed inside papi_event_handler."
- "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+ "Return code = %d ==> %s", ret, PAPI_strerror(ret));
} else {
- for (i = 0; i < my_event_codes_count; i++) {
- TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n",
- event_set, i, my_event_codes[i]);
+ for (i = 0; i < my_events_code_count; i++) {
+ TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n",
+ event_set, i, my_events_code[i]);
}
}
TMSG(PAPI_SAMPLE, "get_overflow_event_index failure in papi_event_handler");
#endif
}
- ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count);
+ ret = PAPI_list_events(event_set, my_events_code, &my_events_code_count);
if (ret != PAPI_OK) {
hpcrun_abort("PAPI_list_events failed inside papi_event_handler."
- "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+ "Return code = %d ==> %s", ret, PAPI_strerror(ret));
}
- for (i = 0; i < my_event_count; i++) {
+ for (i = 0; i < my_events_number; i++) {
// FIXME: SUBTLE ERROR: metric_id may not be same from hpcrun_new_metric()!
// This means lush's 'time' metric should be *last*
TMSG(PAPI_SAMPLE,"handling papi overflow event: "
- "event set %d event index = %d event code = 0x%x",
- event_set, my_events[i], my_event_codes[my_events[i]]);
+ "event set %d event index = %d event code = 0x%x",
+ event_set, my_events[i], my_events_code[my_events[i]]);
- int event_index = get_event_index(self, my_event_codes[my_events[i]]);
+ int event_index = get_event_index(self, my_events_code[my_events[i]]);
int metric_id = hpcrun_event2metric(self, event_index);
@@ -967,9 +1084,9 @@ papi_event_handler(int event_set, void *pc, long long ovec,
if (ci->some_derived) {
for (i = 0; i < nevents; i++) {
if (derived[i]) {
- hpcrun_sample_callpath(context, hpcrun_event2metric(self, i),
- (hpcrun_metricVal_t) {.i=values[i] - ci->prev_values[i]},
- 0, 0, NULL);
+ hpcrun_sample_callpath(context, hpcrun_event2metric(self, i),
+ (hpcrun_metricVal_t) {.i=values[i] - ci->prev_values[i]},
+ 0, 0, NULL);
}
}
@@ -979,5 +1096,99 @@ papi_event_handler(int event_set, void *pc, long long ovec,
}
}
hpcrun_safe_exit();
+finish: // early exits never got past hpcrun_safe_enter_async(); like the old 'return' they skip hpcrun_safe_exit()
+ tool_exit();
}
+
+
+// Add 'value' to metric 'metric_id' in the metric set of 'cct_node'.
+// hpcrun_reify_metric_set() supplies the node's metric set and
+// hpcrun_metric_std_inc() applies the increment through the integer
+// member of cct_metric_data_t.
+static void
+attribute_metric_to_cct(int metric_id, cct_node_t *cct_node, long long value)
+{
+  cct_metric_data_t increment = {.i = value};
+  // fetch the node's metric set for this metric, then bump it
+  metric_data_list_t *metric_set =
+    hpcrun_reify_metric_set(cct_node, metric_id);
+
+  hpcrun_metric_std_inc(metric_id, metric_set, increment);
+}
+
+
+// Attribute one measurement interval of component 'ci' to 'cct_node': every
+// event in the component's event set is charged (collected - prev_values).
+static void
+attribute_counters(papi_component_info_t *ci, long long *collected_values, cct_node_t *cct_node)
+{
+  sample_source_t *self = &obj_name();
+  int events_codes[MAX_EVENTS];
+  int my_events_number = MAX_EVENTS; // in: array capacity, out: events listed
+  int ret;
+  // events_codes[k] receives the code of the k-th event of the event set
+  ret = PAPI_list_events(ci->eventSet, events_codes, &my_events_number);
+  if (ret != PAPI_OK) {
+    hpcrun_abort("PAPI_list_events failed inside attribute_counters. "
+                 "Return code = %d ==> %s", ret, PAPI_strerror(ret));
+  }
+
+  for (int eid = 0; eid < my_events_number; ++eid) {
+    int event_index = get_event_index(self, events_codes[eid]);
+    int metric_id = hpcrun_event2metric(self, event_index);
+    // counts accumulated since ci->prev_values was recorded
+    long long int final_counts = collected_values[eid] - ci->prev_values[eid];
+
+    blame_shift_apply(metric_id, cct_node, final_counts/*metricIncr*/);
+    attribute_metric_to_cct(metric_id, cct_node, final_counts);
+    PRINT("PAPI_EXIT:: %d Event = %x, event_index = %d, metric_id = %d || value = %lld - %lld == %lld\n",
+          eid, events_codes[eid], event_index, metric_id,
+          collected_values[eid], ci->prev_values[eid],
+          final_counts);
+  }
+}
+
+
+// Begin a measurement interval for component 'ci' at 'cct_node'.
+// The node is remembered in ci->cct_node and, when the component is in use,
+// the current counter values are stored in ci->prev_values as the baseline.
+static void
+papi_monitor_enter(papi_component_info_t *ci, cct_node_t *cct_node)
+{
+  tool_enter();
+
+  // skip all processing if sampling is disabled explicitly for this thread
+  if (!hpcrun_suppress_sample() && !sample_filters_apply()) {
+    ci->cct_node = cct_node;
+
+    // baseline snapshot: attribute_counters() subtracts it later, so only
+    // counts accumulated after this point are attributed
+    if (ci->inUse) {
+      ci->read(ci->prev_values);
+      PRINT("PAPI_ENTER:: Component %s Event = %d, value = %lld | %p\n", ci->name, ci->eventSet, ci->prev_values[0], cct_node);
+    }
+  }
+
+  tool_exit();
+}
+
+
+// End a measurement interval for component 'ci': read the current counter
+// values and attribute them to the node saved in ci->cct_node.
+static void
+papi_monitor_exit(papi_component_info_t *ci)
+{
+  tool_enter();
+
+  // skip all processing if sampling is disabled explicitly for this thread
+  if (!hpcrun_suppress_sample() && !sample_filters_apply()) {
+    if (ci->inUse) {
+      long long collected_values[MAX_EVENTS];
+
+      ci->read(collected_values);
+      attribute_counters(ci, collected_values, ci->cct_node);
+    }
+  }
+  tool_exit();
+}
diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h
index 426a778117..2c125ef129 100644
--- a/src/tool/hpcrun/sample-sources/papi-c.h
+++ b/src/tool/hpcrun/sample-sources/papi-c.h
@@ -53,6 +53,8 @@
#include "papi-c-extended-info.h"
+#include "sample_source_obj.h"
+#include "cct.h"
/******************************************************************************
@@ -60,21 +62,24 @@
*****************************************************************************/
typedef struct {
+ const char *name; // component name; gen_event_set in papi-c.c assigns component_get_name(i)
bool inUse;
int eventSet;
source_state_t state;
int some_derived;
bool scale_by_thread_count;
long long prev_values[MAX_EVENTS];
- bool is_sync;
+ cct_node_t *cct_node; // node stored by papi_monitor_enter(), read back by papi_monitor_exit()
+ bool is_gpu_sync; // result of component_uses_sync_samples() for this component
bool setup_process_only;
get_event_set_proc_t get_event_set;
add_event_proc_t add_event;
finalize_event_set_proc_t finalize_event_set;
- start_proc_t sync_start;
- stop_proc_t sync_stop;
- setup_proc_t sync_setup;
- teardown_proc_t sync_teardown;
+ start_proc_t start; // from sync_start_for_component()
+ read_proc_t read; // from sync_read_for_component(); callers pass a long long[MAX_EVENTS] buffer
+ stop_proc_t stop; // from sync_stop_for_component()
+ setup_proc_t setup; // from sync_setup_for_component()
+ teardown_proc_t teardown; // from sync_teardown_for_component()
} papi_component_info_t;
@@ -84,11 +89,10 @@ typedef struct {
} papi_source_info_t;
-
/******************************************************************************
* external declarations
*****************************************************************************/
-extern int get_component_event_set(papi_source_info_t *psi, int cidx);
+extern int get_component_event_set(papi_component_info_t* ci);
#endif // PAPI_C_H
diff --git a/src/tool/hpcrun/sample-sources/papi.c b/src/tool/hpcrun/sample-sources/papi.c
index 8eed9ddc65..e4514533c6 100644
--- a/src/tool/hpcrun/sample-sources/papi.c
+++ b/src/tool/hpcrun/sample-sources/papi.c
@@ -96,6 +96,8 @@
#include
#include
+#include "tool_state.h"
+
/******************************************************************************
* macros
@@ -165,6 +167,7 @@ strip_papi_prefix(const char *str)
static void
METHOD_FN(init)
{
+ tool_enter();
PAPI_set_debug(0x3ff);
// **NOTE: some papi components may start threads, so
@@ -201,13 +204,15 @@ METHOD_FN(init)
}
self->state = INIT;
+ tool_exit();
}
static void
METHOD_FN(thread_init)
{
+ tool_enter();
TMSG(PAPI, "thread init");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int retval = PAPI_thread_init(pthread_self);
if (retval != PAPI_OK) {
@@ -215,13 +220,16 @@ METHOD_FN(thread_init)
monitor_real_abort();
}
TMSG(PAPI, "thread init OK");
+finish:
+ tool_exit();
}
static void
METHOD_FN(thread_init_action)
{
+ tool_enter();
TMSG(PAPI, "register thread");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int retval = PAPI_register_thread();
if (retval != PAPI_OK) {
@@ -229,13 +237,16 @@ METHOD_FN(thread_init_action)
monitor_real_abort();
}
TMSG(PAPI, "register thread ok");
+finish:
+ tool_exit();
}
static void
METHOD_FN(start)
{
+ tool_enter();
TMSG(PAPI, "start");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
thread_data_t *td = hpcrun_get_thread_data();
papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr;
@@ -247,7 +258,7 @@ METHOD_FN(start)
// state PAPI is in.
if (my_state == START) {
- return;
+ goto finish;
}
TMSG(PAPI,"starting PAPI w event set %d",eventSet);
@@ -269,25 +280,33 @@ METHOD_FN(start)
}
TD_GET(ss_state)[self->sel_idx] = START;
+
+finish:
+ tool_exit();
}
static void
METHOD_FN(thread_fini_action)
{
- TMSG(PAPI, "unregister thread");
- if (papi_unavail) { return; }
+ tool_enter();
+ TMSG(PAPI, "unregister thread");
+ if (papi_unavail) { goto finish; }
int retval = PAPI_unregister_thread();
char msg[] = "!!NOT PAPI_OK!! (code = -9999999)\n";
snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval);
TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg);
+finish:
+ tool_exit();
}
static void
METHOD_FN(stop)
{
- TMSG(PAPI, "stop");
- if (papi_unavail) { return; }
+ tool_enter();
+
+ TMSG(PAPI, "stop");
+ if (papi_unavail) { goto finish; }
thread_data_t *td = hpcrun_get_thread_data();
papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr;
@@ -297,12 +316,12 @@ METHOD_FN(stop)
if (my_state == STOP) {
TMSG(PAPI,"--stop called on an already stopped event set %d",eventSet);
- return;
+ goto finish;
}
if (my_state != START) {
TMSG(PAPI,"*WARNING* Stop called on event set that has not been started");
- return;
+ goto finish;
}
TMSG(PAPI,"stop w event set = %d",eventSet);
@@ -314,18 +333,23 @@ METHOD_FN(stop)
}
TD_GET(ss_state)[self->sel_idx] = STOP;
+finish:
+ tool_exit();
}
static void
METHOD_FN(shutdown)
{
- TMSG(PAPI, "shutdown");
- if (papi_unavail) { return; }
+ tool_enter();
+ TMSG(PAPI, "shutdown");
+ if (papi_unavail) { goto finish; }
METHOD_CALL(self, stop); // make sure stop has been called
PAPI_shutdown();
self->state = UNINIT;
+finish:
+ tool_exit();
}
// Return true if PAPI recognizes the name, whether supported or not.
@@ -333,10 +357,12 @@ METHOD_FN(shutdown)
static bool
METHOD_FN(supports_event, const char *ev_str)
{
+ tool_enter();
+ bool ret;
ev_str = strip_papi_prefix(ev_str);
TMSG(PAPI, "supports event");
- if (papi_unavail) { return false; }
+ if (papi_unavail) { ret = false; goto finish; }
if (self->state == UNINIT){
METHOD_CALL(self, init);
@@ -347,14 +373,19 @@ METHOD_FN(supports_event, const char *ev_str)
long th;
hpcrun_extract_ev_thresh(ev_str, sizeof(evtmp), evtmp, &th, DEFAULT_THRESHOLD);
- return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK;
+ ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK);
+
+finish:
+ tool_exit();
+ return ret;
}
static void
METHOD_FN(process_event_list, int lush_metrics)
{
+ tool_enter();
TMSG(PAPI, "process event list");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
char *event;
int i, ret;
@@ -456,17 +487,20 @@ METHOD_FN(process_event_list, int lush_metrics)
if (! some_overflow) {
hpcrun_ssfail_all_derived("PAPI");
}
+finish:
+ tool_exit();
}
static void
METHOD_FN(gen_event_set,int lush_metrics)
{
- int i;
+ tool_enter();
+ int i;
int ret;
int eventSet;
TMSG(PAPI, "gen event set");
- if (papi_unavail) { return; }
+ if (papi_unavail) { goto finish; }
int ss_info_size = sizeof(papi_source_info_t);
papi_source_info_t *psi = hpcrun_malloc(ss_info_size);
@@ -520,11 +554,14 @@ METHOD_FN(gen_event_set,int lush_metrics)
}
}
psi->eventSet= eventSet;
+finish:
+ tool_exit();
}
static void
METHOD_FN(display_events)
{
+ tool_enter();
PAPI_event_info_t info;
char name[200], *prof;
int ev, ret, num_total, num_prof;
@@ -538,7 +575,7 @@ METHOD_FN(display_events)
if (papi_unavail) {
printf("PAPI is not available. Probably, the kernel doesn't support PAPI,\n"
"or else maybe HPCToolkit is out of sync with PAPI.\n\n");
- return;
+ goto finish;
}
num_total = 0;
@@ -592,8 +629,11 @@ METHOD_FN(display_events)
}
printf("Total native events: %d\n", num_total);
printf("\n");
+finish:
+ tool_exit();
}
+
/***************************************************************************
* object
***************************************************************************/
@@ -626,26 +666,35 @@ hpcrun_disable_papi_cuda(void)
static int
event_is_derived(int ev_code)
{
- PAPI_event_info_t info;
+ tool_enter();
+ int ret;
+ PAPI_event_info_t info;
// "Is derived" is kind of a bad thing, so if any unexpected failure
// occurs, we'll return the "bad" answer.
if (PAPI_get_event_info(ev_code, &info) != PAPI_OK
|| info.derived == NULL) {
- return 1;
+ ret = 1;
+ goto finish;
}
if (info.count == 1
|| strlen(info.derived) == 0
|| strcmp(info.derived, "NOT_DERIVED") == 0
|| strcmp(info.derived, "DERIVED_CMPD") == 0) {
- return 0;
+ ret = 0;
+ goto finish;
}
- return 1;
+ ret = 1;
+
+finish:
+ tool_exit();
+ return ret;
}
static void
event_fatal_error(int ev_code, int papi_ret)
{
+ tool_enter();
char name[1024];
PAPI_event_code_to_name(ev_code, name);
@@ -659,12 +708,15 @@ event_fatal_error(int ev_code, int papi_ret)
hpcrun_ssfail_conflict("PAPI", name);
}
hpcrun_ssfail_unsupported("PAPI", name);
+
+ tool_exit();
}
static void
papi_event_handler(int event_set, void *pc, long long ovec,
void *context)
{
+ tool_enter();
sample_source_t *self = &_papi_obj;
long long values[MAX_EVENTS];
int my_events[MAX_EVENTS];
@@ -673,14 +725,14 @@ papi_event_handler(int event_set, void *pc, long long ovec,
int i, ret;
// if sampling disabled explicitly for this thread, skip all processing
- if (hpcrun_suppress_sample()) return;
+ if (hpcrun_suppress_sample()) goto finish;
// If the interrupt came from inside our code, then drop the sample
// and return and avoid any MSG.
if (! hpcrun_safe_enter_async(pc)) {
hpcrun_stats_num_samples_blocked_async_inc();
- return;
+ goto finish;
}
TMSG(PAPI_SAMPLE,"papi event happened, ovec = %ld",ovec);
@@ -734,5 +786,7 @@ papi_event_handler(int event_set, void *pc, long long ovec,
}
}
- hpcrun_safe_exit();
+ hpcrun_safe_exit();
+finish: // early exits never got past hpcrun_safe_enter_async(); like the old 'return' they skip hpcrun_safe_exit()
+ tool_exit();
}
diff --git a/src/tool/hpcrun/sample-sources/retcnt.c b/src/tool/hpcrun/sample-sources/retcnt.c
index dfe405920a..0a44f44b5b 100644
--- a/src/tool/hpcrun/sample-sources/retcnt.c
+++ b/src/tool/hpcrun/sample-sources/retcnt.c
@@ -208,6 +208,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
#define ss_name retcnt
#define ss_cls SS_SOFTWARE
#define ss_sort_order 100
diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h
index f9674cb2f1..abd1f112b0 100644
--- a/src/tool/hpcrun/sample-sources/ss-list.h
+++ b/src/tool/hpcrun/sample-sources/ss-list.h
@@ -88,6 +88,10 @@ SAMPLE_SOURCE_DECL_MACRO(retcnt)
SAMPLE_SOURCE_DECL_MACRO(papi_c_cupti)
#endif
+#ifdef HPCRUN_SS_PAPI_C_ROCM
+SAMPLE_SOURCE_DECL_MACRO(papi_c_rocm)
+#endif
+
#ifdef HPCRUN_SS_NVIDIA
SAMPLE_SOURCE_DECL_MACRO(nvidia_gpu)
#endif
@@ -98,6 +102,14 @@ SAMPLE_SOURCE_DECL_MACRO(amd_gpu)
#endif
#endif
+SAMPLE_SOURCE_DECL_MACRO(openmp_gpu)
+
+#ifdef HPCRUN_SS_AMD
+#ifndef HPCRUN_STATIC_LINK
+SAMPLE_SOURCE_DECL_MACRO(amd_rocprof)
+#endif
+#endif
+
#ifdef HPCRUN_SS_LEVEL0
SAMPLE_SOURCE_DECL_MACRO(level0)
#endif
diff --git a/src/tool/hpcrun/sample-sources/sync.c b/src/tool/hpcrun/sample-sources/sync.c
index b9608f915f..f0c68a2b12 100644
--- a/src/tool/hpcrun/sample-sources/sync.c
+++ b/src/tool/hpcrun/sample-sources/sync.c
@@ -199,6 +199,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/tst.c b/src/tool/hpcrun/sample-sources/tst.c
index 6b58d21446..54baf04327 100644
--- a/src/tool/hpcrun/sample-sources/tst.c
+++ b/src/tool/hpcrun/sample-sources/tst.c
@@ -366,6 +366,7 @@ METHOD_FN(display_events)
#endif
}
+
/***************************************************************************
* object
***************************************************************************/
diff --git a/src/tool/hpcrun/sample-sources/upc.c b/src/tool/hpcrun/sample-sources/upc.c
index 7bd4cf4b4b..ef12ac7419 100644
--- a/src/tool/hpcrun/sample-sources/upc.c
+++ b/src/tool/hpcrun/sample-sources/upc.c
@@ -447,6 +447,7 @@ METHOD_FN(display_events)
printf("\n");
}
+
#define ss_name upc
#define ss_cls SS_HARDWARE
diff --git a/src/tool/hpcrun/sample_event.c b/src/tool/hpcrun/sample_event.c
index f5c7b46f30..56fd3732a5 100644
--- a/src/tool/hpcrun/sample_event.c
+++ b/src/tool/hpcrun/sample_event.c
@@ -244,7 +244,7 @@ hpcrun_sample_callpath(void* context, int metricId,
}
}
}
- else {
+ else { // Partial unwind case
cct_bundle_t* cct = &(td->core_profile_trace_data.epoch->csdata);
node = record_partial_unwind(cct, td->btbuf_beg, td->btbuf_cur - 1,
metricId, metricIncr, skipInner, NULL);
@@ -305,7 +305,7 @@ hpcrun_sample_callpath(void* context, int metricId,
}
hpcrun_clear_handling_sample(td);
- if (TD_GET(mem_low) || ENABLED(FLUSH_EVERY_SAMPLE)) {
+ if (get_mem_low() || ENABLED(FLUSH_EVERY_SAMPLE)) {
hpcrun_flush_epochs(&(TD_GET(core_profile_trace_data)));
hpcrun_reclaim_freeable_mem();
}
@@ -384,7 +384,7 @@ hpcrun_gen_thread_ctxt(void* context)
}
#endif
hpcrun_clear_handling_sample(td);
- if (TD_GET(mem_low) || ENABLED(FLUSH_EVERY_SAMPLE)) {
+ if (get_mem_low() || ENABLED(FLUSH_EVERY_SAMPLE)) {
hpcrun_flush_epochs(&(TD_GET(core_profile_trace_data)));
hpcrun_reclaim_freeable_mem();
}
diff --git a/src/tool/hpcrun/sample_sources_registered.c b/src/tool/hpcrun/sample_sources_registered.c
index aa623c5dca..658b40c104 100644
--- a/src/tool/hpcrun/sample_sources_registered.c
+++ b/src/tool/hpcrun/sample_sources_registered.c
@@ -75,7 +75,6 @@
static sample_source_t* registered_sample_sources = NULL;
-
//------------------------------------------------------------------------------
// interface operations
//------------------------------------------------------------------------------
@@ -92,8 +91,6 @@ hpcrun_sample_sources_register(void)
}
-
-
//------------------------------------------------------------------------------
// interface operations
//------------------------------------------------------------------------------
@@ -146,7 +143,8 @@ hpcrun_registered_sources_init(void)
METHOD_CALL(ss, init);
TMSG(SS_COMMON, "sample source \"%s\": init", ss->name);
}
-
+
+ // set user-defined control_knobs for the sample sources
control_knob_init();
}
diff --git a/src/tool/hpcrun/scripts/hpcrun.in b/src/tool/hpcrun/scripts/hpcrun.in
index 491e2b20ab..9b97b4aff3 100644
--- a/src/tool/hpcrun/scripts/hpcrun.in
+++ b/src/tool/hpcrun/scripts/hpcrun.in
@@ -375,7 +375,10 @@ do
CPU_GPU_IDLE* ) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_gpu.so" ;;
MPI* ) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_mpi.so" ;;
gpu=amd) roctracer_libdir="${roctracer_lib_path}"
- export HIP_ENABLE_DEFERRED_LOADING=0;;
+ export HSA_TOOLS_LIB=librocprofiler64.so.1
+ export ROCP_TOOL_LIB=libhpcrun.so
+ export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml
+ export ROCP_HSA_INTERCEPT=1;;
gpu=opencl) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_opencl.so" ;;
gpu=opencl,inst) gtpin_libdir="${gtpin_lib_path}"
@@ -395,6 +398,9 @@ do
;;
-L | -l | --list-events )
+ export HSA_TOOLS_LIB=librocprofiler64.so.1
+ export ROCP_TOOL_LIB=libhpcrun.so
+ export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml
export HPCRUN_EVENT_LIST=LIST
export HPCRUN_LIST_EVENT=1
;;
diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c
index bdea4fa984..c1adbe5fbc 100644
--- a/src/tool/hpcrun/thread_data.c
+++ b/src/tool/hpcrun/thread_data.c
@@ -129,7 +129,6 @@ static pthread_key_t _hpcrun_key;
static int use_getspecific = 0;
static __thread bool mem_pool_initialized = false;
-
void
hpcrun_init_pthread_key
(
@@ -291,8 +290,6 @@ hpcrun_thread_init_mem_pool_once
}
}
-
-
//***************************************************************************
//
//***************************************************************************
@@ -385,7 +382,6 @@ hpcrun_thread_data_init
size_t n_sources
)
{
- hpcrun_meminfo_t memstore;
thread_data_t* td = hpcrun_get_thread_data();
// ----------------------------------------
@@ -396,12 +392,8 @@ hpcrun_thread_data_init
// memstore so we can reuse it in the child after fork. This must
// come first.
td->inside_hpcrun = 1;
- memstore = td->memstore;
memset(td, 0xfe, sizeof(thread_data_t));
td->inside_hpcrun = 1;
- td->memstore = memstore;
- hpcrun_make_memstore(&td->memstore, is_child);
- td->mem_low = 0;
// ----------------------------------------
// normalized thread id (monitor-generated)
diff --git a/src/tool/hpcrun/thread_data.h b/src/tool/hpcrun/thread_data.h
index 2874b39400..baa37f6c95 100644
--- a/src/tool/hpcrun/thread_data.h
+++ b/src/tool/hpcrun/thread_data.h
@@ -172,12 +172,6 @@ typedef struct thread_data_t {
int omp_thread;
uint64_t last_bar_time_us;
- // ----------------------------------------
- // hpcrun_malloc() memory data structures
- // ----------------------------------------
- hpcrun_meminfo_t memstore;
- int mem_low;
-
// ----------------------------------------
// sample sources
// ----------------------------------------
diff --git a/src/tool/hpcrun/tool_state.c b/src/tool/hpcrun/tool_state.c
new file mode 100644
index 0000000000..1a978c6e90
--- /dev/null
+++ b/src/tool/hpcrun/tool_state.c
@@ -0,0 +1,23 @@
+//
+// Created by dejan on 1.7.20..
+//
+
+#include "tool_state.h"
+
+// Per-thread nesting depth of tool code; 0 means the tool is not active.
+static __thread int tool_active = 0;
+
+// Record that the calling thread has entered tool code (depth + 1).
+void tool_enter(void){
+    tool_active++;
+}
+
+// Record that the calling thread has left tool code (depth - 1).
+void tool_exit(void){
+    tool_active--;
+}
+
+// Report whether the calling thread's enter/exit depth is nonzero.
+bool is_tool_active(void){
+    return tool_active != 0;
+}
diff --git a/src/tool/hpcrun/tool_state.h b/src/tool/hpcrun/tool_state.h
new file mode 100644
index 0000000000..95bc91f67f
--- /dev/null
+++ b/src/tool/hpcrun/tool_state.h
@@ -0,0 +1,15 @@
+//
+// Created by dejan on 1.7.20..
+//
+
+#ifndef HPCTOOLKIT_TOOL_STATE_H
+#define HPCTOOLKIT_TOOL_STATE_H
+
+#include <stdbool.h>
+
+// Enter/exit markers for tool code, plus a query for the current state.
+void tool_enter(void);
+void tool_exit(void);
+bool is_tool_active(void);
+
+#endif //HPCTOOLKIT_TOOL_STATE_H
diff --git a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
index 7b714659b0..9852d55315 100644
--- a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
+++ b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in
@@ -312,6 +312,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -349,6 +350,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcserver/Makefile.in b/src/tool/hpcserver/Makefile.in
index ea164abf7f..7caa9659b1 100644
--- a/src/tool/hpcserver/Makefile.in
+++ b/src/tool/hpcserver/Makefile.in
@@ -366,6 +366,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -403,6 +404,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcserver/mpi/Makefile.in b/src/tool/hpcserver/mpi/Makefile.in
index d4fc024c72..a58be57c8d 100644
--- a/src/tool/hpcserver/mpi/Makefile.in
+++ b/src/tool/hpcserver/mpi/Makefile.in
@@ -374,6 +374,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -411,6 +412,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpcstruct/Makefile.in b/src/tool/hpcstruct/Makefile.in
index 49ad90c9ad..22162293bb 100644
--- a/src/tool/hpcstruct/Makefile.in
+++ b/src/tool/hpcstruct/Makefile.in
@@ -402,6 +402,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -439,6 +440,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/hpctracedump/Makefile.in b/src/tool/hpctracedump/Makefile.in
index 46cf02cc36..f511af86f5 100644
--- a/src/tool/hpctracedump/Makefile.in
+++ b/src/tool/hpctracedump/Makefile.in
@@ -352,6 +352,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -389,6 +390,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/misc/Makefile.in b/src/tool/misc/Makefile.in
index acb75a41da..c28239b629 100644
--- a/src/tool/misc/Makefile.in
+++ b/src/tool/misc/Makefile.in
@@ -307,6 +307,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -344,6 +345,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/src/tool/xprof/Makefile.in b/src/tool/xprof/Makefile.in
index 6e11068ad1..410d824678 100644
--- a/src/tool/xprof/Makefile.in
+++ b/src/tool/xprof/Makefile.in
@@ -373,6 +373,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -410,6 +411,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
diff --git a/tests/Makefile.in b/tests/Makefile.in
index dc7d338c90..f42b0a49ec 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -539,6 +539,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@
OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@
OPT_GTPIN = @OPT_GTPIN@
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@
+OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@
OPT_IGC = @OPT_IGC@
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@
@@ -576,6 +577,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@
PERFMON_LIB = @PERFMON_LIB@
PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@
RANLIB = @RANLIB@
+ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@