diff --git a/Makefile.in b/Makefile.in index 9166db4530..fb24849491 100644 --- a/Makefile.in +++ b/Makefile.in @@ -342,6 +342,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -379,6 +380,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/configure b/configure index 4fa77fc0d7..76f0a04114 100755 --- a/configure +++ b/configure @@ -651,10 +651,12 @@ OPT_LEVEL0_IFLAGS OPT_LEVEL0 OPT_ENABLE_LEVEL0_FALSE OPT_ENABLE_LEVEL0_TRUE +ROCM_PROFILER_LD_DIR OPT_ROCM_LD_LIB_PATH OPT_ROCM_IFLAGS OPT_ENABLE_ROCM_FALSE OPT_ENABLE_ROCM_TRUE +OPT_GTPIN_LDFLAGS OPT_GTPIN_LIBDIR OPT_GTPIN_IFLAGS OPT_GTPIN @@ -747,6 +749,8 @@ TBB_LIB_DIR TBB_PROXY_LIB TBB_LFLAGS TBB_IFLAGS +OPT_PAPI_ROCM_FALSE +OPT_PAPI_ROCM_TRUE OPT_PAPI_CUPTI_FALSE OPT_PAPI_CUPTI_TRUE OPT_PAPI_COMPONENT_FALSE @@ -1053,6 +1057,7 @@ with_papi enable_force_papi enable_papi_c enable_papi_c_cupti +enable_papi_c_rocm with_perfmon enable_perf_events enable_kernel_blocking @@ -1081,6 +1086,8 @@ with_rocm with_rocm_hip with_rocm_dbgapi with_rocm_tracer +with_rocm_profiler +with_rocm_hsa with_level0 enable_data_centric_tracing enable_devtools @@ -1760,6 +1767,8 @@ Optional Features: --enable-papi-c use component papi, if available (default yes) --enable-papi-c-cupti use papi CUPTI support, if available (default no), requires papi cuda component + --enable-papi-c-rocm use papi ROCM support, if available (default no), + requires papi rocm component --enable-perf-events force enable or disable perf events in hpcrun (normally 2.6.32 or later), only needed if fails to auto-detect correctly @@ -1839,6 +1848,9 
@@ Optional Packages: --with-rocm-hip=PATH path to hip install directory --with-rocm-dbgapi=PATH path to rocm-dbgapi install directory --with-rocm-tracer=PATH path to roctracer-dev install directory + --with-rocm-profiler=PATH + path to rocprofiler-dev install directory + --with-rocm-hsa=PATH path to hsa-dev install directory --with-level0=PATH use given Level Zero installation (absolute path) with hpcrun (default is NO) --with-valgrind=PATH path to Valgrind install directory @@ -21990,10 +22002,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu #include "papi.h" -extern void CUDA_init_component(void); int main() { - CUDA_init_component(); } _ACEOF @@ -22019,6 +22029,67 @@ $as_echo "$use_papi_c_cupti" >&6; } fi +#------------------------------------------------- +# Option: --enable-papi-c-rocm +#------------------------------------------------- + +use_papi_c_rocm=no + +# Check whether --enable-papi-c-rocm was given. +if test "${enable_papi_c_rocm+set}" = set; then : + enableval=$enable_papi_c_rocm; use_papi_c_rocm="$enableval" +fi + + +if test "$use_papi_c" = no || test "$use_papi_c_rocm" != yes ; then + use_papi_c_rocm=no +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for papi rocm component" >&5 +$as_echo_n "checking for papi rocm component... " >&6; } + + ORIG_CFLAGS="$CFLAGS" + ORIG_LIBS="$LIBS" + CFLAGS="$CFLAGS $OPT_PAPI_IFLAGS" + LIBS="$OPT_PAPI_LDFLAGS $papi_extra_libs" + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + +#include "papi.h" +int main() +{ +} + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + use_papi_c_rocm=yes +else + use_papi_c_rocm=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + CFLAGS="$ORIG_CFLAGS" + LIBS="$ORIG_LIBS" + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $use_papi_c_rocm" >&5 +$as_echo "$use_papi_c_rocm" >&6; } +fi + + #------------------------------------------------- # Option: --with-perfmon=PATH #------------------------------------------------- @@ -22348,6 +22419,7 @@ $as_echo "$as_me: WARNING: disable papi due to possible conflict with perfmon" > OPT_PAPI_LIBPATH= use_papi_c=no use_papi_c_cupti=no + use_papi_c_rocm=no fi fi @@ -22385,6 +22457,14 @@ else OPT_PAPI_CUPTI_FALSE= fi + if test "$use_papi_c_rocm" = yes; then + OPT_PAPI_ROCM_TRUE= + OPT_PAPI_ROCM_FALSE='#' +else + OPT_PAPI_ROCM_TRUE='#' + OPT_PAPI_ROCM_FALSE= +fi + #------------------------------------------------- @@ -24368,6 +24448,201 @@ $as_echo "$GTPIN" >&6; } +#------------------------------------------------- +# Option: --with-igc=PATH +#------------------------------------------------- + +IGC=no +OPT_HAVE_IGC=no +OPT_IGC_IFLAGS= +OPT_IGC_LDFLAGS= + + +# Check whether --with-igc was given. +if test "${with_igc+set}" = set; then : + withval=$with_igc; IGC="$withval" +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for igc" >&5 +$as_echo_n "checking for igc... " >&6; } + +case "$IGC" in + /* ) + if test ! -f "${IGC}/include/igc/igc.opencl.h" ; then + as_fn_error $? 
"unable to find igc.opencl.h in: $IGC" "$LINENO" 5 + fi + OPT_IGC_IFLAGS="-I${IGC}/include" + + IGC_LDFLAGS= # plain shell assignment; no spaces around = + IGA_LDFLAGS= + + for lib in $multilib_path ; do + if test -f "${IGC}/${lib}/libigc.so" ; then + IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc" + break + fi + done + if test "x$IGC_LDFLAGS" = x ; then + as_fn_error $? "unable to find libigc.so in: $IGC" "$LINENO" 5 + fi + + for lib in $multilib_path ; do + if test -f "${IGC}/${lib}/libiga64.so" ; then + IGA_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -liga64" + break + fi + done + if test "x$IGA_LDFLAGS" = x ; then + as_fn_error $? "unable to find libiga64.so in: $IGC" "$LINENO" 5 + fi + + OPT_IGC_LDFLAGS="${IGC_LDFLAGS} ${IGA_LDFLAGS}" + + OPT_HAVE_IGC=yes + ;; + no ) + ;; + * ) + as_fn_error $? "igc directory must be absolute path: $IGC" "$LINENO" 5 + ;; +esac + + if test "$OPT_HAVE_IGC" = yes; then + OPT_ENABLE_IGC_TRUE= + OPT_ENABLE_IGC_FALSE='#' +else + OPT_ENABLE_IGC_TRUE='#' + OPT_ENABLE_IGC_FALSE= +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $IGC" >&5 +$as_echo "$IGC" >&6; } + + + + + +#------------------------------------------------- +# Option: --with-metrics-discovery=PATH +#------------------------------------------------- + +METRICS_DISCOVERY=no +OPT_HAVE_METRICS_DISCOVERY=no +OPT_METRICS_DISCOVERY_IFLAGS= +OPT_METRICS_DISCOVERY_LDFLAGS= + + +# Check whether --with-metrics-discovery was given. +if test "${with_metrics_discovery+set}" = set; then : + withval=$with_metrics_discovery; METRICS_DISCOVERY="$withval" +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for metrics-discovery" >&5 +$as_echo_n "checking for metrics-discovery... " >&6; } + +case "$METRICS_DISCOVERY" in + /* ) + if test ! -f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then + as_fn_error $? 
"unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY" "$LINENO" 5 + fi + OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include" + + for lib in $multilib_path ; do + if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then + OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd" + break + fi + done + if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then + as_fn_error $? "unable to find libmd.so in: $METRICS_DISCOVERY" "$LINENO" 5 + fi + OPT_HAVE_METRICS_DISCOVERY=yes + ;; + no ) + ;; + * ) + as_fn_error $? "metrics-discovery directory must be absolute path: $METRICS_DISCOVERY" "$LINENO" 5 + ;; +esac + + if test "$OPT_HAVE_METRICS_DISCOVERY" = yes; then + OPT_ENABLE_METRICS_DISCOVERY_TRUE= + OPT_ENABLE_METRICS_DISCOVERY_FALSE='#' +else + OPT_ENABLE_METRICS_DISCOVERY_TRUE='#' + OPT_ENABLE_METRICS_DISCOVERY_FALSE= +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $METRICS_DISCOVERY" >&5 +$as_echo "$METRICS_DISCOVERY" >&6; } + + + + + +#------------------------------------------------- +# Option: --with-gtpin=PATH +#------------------------------------------------- + +GTPIN=no +OPT_HAVE_GTPIN=no +OPT_GTPIN_IFLAGS= +OPT_GTPIN_LDFLAGS= + + +# Check whether --with-gtpin was given. +if test "${with_gtpin+set}" = set; then : + withval=$with_gtpin; GTPIN="$withval" +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for gtpin" >&5 +$as_echo_n "checking for gtpin... " >&6; } + +case "$GTPIN" in + /* ) + if test ! -f "${GTPIN}/Profilers/Include/gtpin.h" ; then + as_fn_error $? "unable to find gtpin.h in: $GTPIN" "$LINENO" 5 + fi + OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/" + + if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then + OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin" + fi + + if test "x$OPT_GTPIN_LDFLAGS" = x ; then + as_fn_error $? 
"unable to find libgtpin.so in: $GTPIN" "$LINENO" 5 + fi + OPT_HAVE_GTPIN=yes + ;; + no ) + ;; + * ) + as_fn_error $? "gtpin directory must be absolute path: $GTPIN" "$LINENO" 5 + ;; +esac + + if test "$OPT_HAVE_GTPIN" = yes; then + OPT_ENABLE_GTPIN_TRUE= + OPT_ENABLE_GTPIN_FALSE='#' +else + OPT_ENABLE_GTPIN_TRUE='#' + OPT_ENABLE_GTPIN_FALSE= +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $GTPIN" >&5 +$as_echo "$GTPIN" >&6; } + + + + + + #------------------------------------------------- # Option: --with-rocm=PATH #------------------------------------------------- @@ -24383,6 +24658,8 @@ ROCM= ROCM_HIP= ROCM_DBGAPI= ROCM_TRACER= +ROCM_PROFILER= +ROCM_HSA= # Check whether --with-rocm was given. @@ -24412,17 +24689,39 @@ if test "${with_rocm_tracer+set}" = set; then : fi + +# Check whether --with-rocm-profiler was given. +if test "${with_rocm_profiler+set}" = set; then : + withval=$with_rocm_profiler; ROCM_PROFILER="$withval" +fi + + + +# Check whether --with-rocm-hsa was given. 
+if test "${with_rocm_hsa+set}" = set; then : + withval=$with_rocm_hsa; ROCM_HSA="$withval" +fi + + + + ROCM_HIP_IFLAGS= ROCM_DBGAPI_IFLAGS= ROCM_TRACER_IFLAGS= +ROCM_PROFILER_IFLAGS= +ROCM_HSA_IFLAGS= ROCM_HIP_LD_DIR= ROCM_DBGAPI_LD_DIR= ROCM_TRACER_LD_DIR= +ROCM_PROFILER_LD_DIR= +ROCM_HSA_LD_DIR= ROCM_HIP_MESG= ROCM_DBGAPI_MESG= ROCM_TRACER_MESG= +ROCM_PROFILER_MESG= +ROCM_HSA_MESG= require_rocm=no @@ -24485,6 +24784,38 @@ $as_echo "$as_me: found $ROCM/roctracer/lib/libroctracer64.so" >&6;} found=yes fi + # ROCPROFILER + if test -f "$ROCM/rocprofiler/include/rocprofiler.h" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/rocprofiler/include/rocprofiler.h" >&5 +$as_echo "$as_me: found $ROCM/rocprofiler/include/rocprofiler.h" >&6;} + ROCM_PROFILER_IFLAGS="-I$ROCM/rocprofiler/include" + ROCM_PROFILER_MESG="$ROCM/rocprofiler" + found=yes + fi + if test -f "$ROCM/rocprofiler/lib/librocprofiler64.so" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/rocprofiler/lib/librocprofiler64.so" >&5 +$as_echo "$as_me: found $ROCM/rocprofiler/lib/librocprofiler64.so" >&6;} + ROCM_PROFILER_LD_DIR="$ROCM/rocprofiler/lib" + ROCM_PROFILER_MESG="$ROCM/rocprofiler" + found=yes + fi + + # HSA + if test -f "$ROCM/hsa/include/hsa/hsa.h" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/hsa/include/hsa/hsa.h" >&5 +$as_echo "$as_me: found $ROCM/hsa/include/hsa/hsa.h" >&6;} + ROCM_HSA_IFLAGS="-I$ROCM/hsa/include/hsa" + ROCM_HSA_MESG="$ROCM/hsa" + found=yes + fi + if test -f "$ROCM/hsa/lib/libhsa-runtime64.so" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM/hsa/lib/libhsa-runtime64.so" >&5 +$as_echo "$as_me: found $ROCM/hsa/lib/libhsa-runtime64.so" >&6;} + ROCM_HSA_LD_DIR="$ROCM/hsa/lib" + ROCM_HSA_MESG="$ROCM/hsa" + found=yes + fi + # warn if given dir has nothing useful if test "$found" = no ; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM" >&5 @@ -24586,6 +24917,64 @@ $as_echo "$as_me: WARNING: found 
nothing useful in $ROCM_TRACER" >&2;} ;; esac +case "$ROCM_PROFILER" in + /* ) + require_rocm=yes + found=no + + if test -f "$ROCM_PROFILER/rocprofiler/include/rocprofiler.h" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h" >&5 +$as_echo "$as_me: found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h" >&6;} + ROCM_PROFILER_IFLAGS="-I$ROCM_PROFILER/rocprofiler/include" + ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler" + found=yes + fi + if test -f "$ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" >&5 +$as_echo "$as_me: found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" >&6;} + ROCM_PROFILER_LD_DIR="$ROCM_PROFILER/rocprofiler/lib" + ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler" + found=yes + fi + if test "$found" = no ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM_PROFILER" >&5 +$as_echo "$as_me: WARNING: found nothing useful in $ROCM_PROFILER" >&2;} + fi + ;; + * ) + ROCM_PROFILER=no + ;; +esac + +case "$ROCM_HSA" in + /* ) + require_rocm=yes + found=no + + if test -f "$ROCM_HSA/include/hsa/hsa.h" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_HSA/include/hsa/hsa.h" >&5 +$as_echo "$as_me: found $ROCM_HSA/include/hsa/hsa.h" >&6;} + ROCM_HSA_IFLAGS="-I$ROCM_HSA/include/hsa" + ROCM_HSA_MESG="$ROCM_HSA" + found=yes + fi + if test -f "$ROCM_HSA/lib/libhsa-runtime64.so" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: found $ROCM_HSA/lib/libhsa-runtime64.so" >&5 +$as_echo "$as_me: found $ROCM_HSA/lib/libhsa-runtime64.so" >&6;} + ROCM_HSA_LD_DIR="$ROCM_HSA/lib" + ROCM_HSA_MESG="$ROCM_HSA" + found=yes + fi + if test "$found" = no ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: found nothing useful in $ROCM_HSA" >&5 +$as_echo "$as_me: WARNING: found nothing useful in $ROCM_HSA" >&2;} + fi + ;; + * ) + ROCM_HSA=no + ;; +esac + # # 
Check that we found all the pieces. # @@ -24602,6 +24991,12 @@ then if test "x$ROCM_TRACER_IFLAGS" = x ; then as_fn_error $? "unable to find roctracer_hip.h" "$LINENO" 5 fi + if test "x$ROCM_PROFILER_IFLAGS" = x ; then + as_fn_error $? "unable to find rocprofiler.h" "$LINENO" 5 + fi + if test "x$ROCM_HSA_IFLAGS" = x ; then + as_fn_error $? "unable to find hsa.h" "$LINENO" 5 + fi if test "x$ROCM_HIP_LD_DIR" = x ; then as_fn_error $? "unable to find libamdhip64.so" "$LINENO" 5 @@ -24612,10 +25007,16 @@ then if test "x$ROCM_TRACER_LD_DIR" = x ; then as_fn_error $? "unable to find libroctracer64.so" "$LINENO" 5 fi + if test "x$ROCM_PROFILER_LD_DIR" = x ; then + as_fn_error $? "unable to find librocprofiler64.so" "$LINENO" 5 + fi + if test "x$ROCM_HSA_LD_DIR" = x ; then + as_fn_error $? "unable to find libhsa-runtime64.so" "$LINENO" 5 + fi OPT_HAVE_ROCM=yes - OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS" - OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}" + OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS $ROCM_PROFILER_IFLAGS $ROCM_HSA_IFLAGS" + OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}:${ROCM_PROFILER_LD_DIR}:${ROCM_HSA_LD_DIR}" fi # @@ -24669,6 +25070,7 @@ fi + #------------------------------------------------- # Option: --with-level0=PATH #------------------------------------------------- @@ -25225,6 +25627,10 @@ if test -z "${OPT_PAPI_CUPTI_TRUE}" && test -z "${OPT_PAPI_CUPTI_FALSE}"; then as_fn_error $? "conditional \"OPT_PAPI_CUPTI\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${OPT_PAPI_ROCM_TRUE}" && test -z "${OPT_PAPI_ROCM_FALSE}"; then + as_fn_error $? "conditional \"OPT_PAPI_ROCM\" was never defined. +Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 +fi if test -z "${OPT_USE_ZLIB_TRUE}" && test -z "${OPT_USE_ZLIB_FALSE}"; then as_fn_error $? "conditional \"OPT_USE_ZLIB\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -25317,6 +25723,18 @@ if test -z "${OPT_ENABLE_GTPIN_TRUE}" && test -z "${OPT_ENABLE_GTPIN_FALSE}"; th as_fn_error $? "conditional \"OPT_ENABLE_GTPIN\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${OPT_ENABLE_IGC_TRUE}" && test -z "${OPT_ENABLE_IGC_FALSE}"; then + as_fn_error $? "conditional \"OPT_ENABLE_IGC\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${OPT_ENABLE_METRICS_DISCOVERY_TRUE}" && test -z "${OPT_ENABLE_METRICS_DISCOVERY_FALSE}"; then + as_fn_error $? "conditional \"OPT_ENABLE_METRICS_DISCOVERY\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${OPT_ENABLE_GTPIN_TRUE}" && test -z "${OPT_ENABLE_GTPIN_FALSE}"; then + as_fn_error $? "conditional \"OPT_ENABLE_GTPIN\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${OPT_ENABLE_ROCM_TRUE}" && test -z "${OPT_ENABLE_ROCM_FALSE}"; then as_fn_error $? "conditional \"OPT_ENABLE_ROCM\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 @@ -27999,6 +28417,8 @@ $as_echo "$as_me: gtpin: ${GTPIN}" >&6;} $as_echo "$as_me: metrics-discovery: ${METRICS_DISCOVERY}" >&6;} { $as_echo "$as_me:${as_lineno-$LINENO}: papi-c-cupti: ${use_papi_c_cupti}" >&5 $as_echo "$as_me: papi-c-cupti: ${use_papi_c_cupti}" >&6;} +{ $as_echo "$as_me:${as_lineno-$LINENO}: papi-c-rocm: ${use_papi_c_rocm}" >&5 +$as_echo "$as_me: papi-c-rocm: ${use_papi_c_rocm}" >&6;} { $as_echo "$as_me:${as_lineno-$LINENO}: rocm: ${rocm_mesg}" >&5 $as_echo "$as_me: rocm: ${rocm_mesg}" >&6;} if test "$OPT_HAVE_ROCM" = yes ; then @@ -28008,6 +28428,10 @@ $as_echo "$as_me: rocm hip: $ROCM_HIP_MESG" >&6;} $as_echo "$as_me: rocm dbgapi: $ROCM_DBGAPI_MESG" >&6;} { $as_echo "$as_me:${as_lineno-$LINENO}: rocm tracer: $ROCM_TRACER_MESG" >&5 $as_echo "$as_me: rocm tracer: $ROCM_TRACER_MESG" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: rocm profiler:$ROCM_PROFILER_MESG" >&5 +$as_echo "$as_me: rocm profiler:$ROCM_PROFILER_MESG" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: rocm hsa: $ROCM_HSA_MESG" >&5 +$as_echo "$as_me: rocm hsa: $ROCM_HSA_MESG" >&6;} fi { $as_echo "$as_me:${as_lineno-$LINENO}: valgrind: ${VALGRIND}" >&5 $as_echo "$as_me: valgrind: ${VALGRIND}" >&6;} diff --git a/configure.ac b/configure.ac index 94496fec79..e8f61ffe14 100644 --- a/configure.ac +++ b/configure.ac @@ -3314,10 +3314,8 @@ else AC_LINK_IFELSE([ AC_LANG_SOURCE([[ #include "papi.h" -extern void CUDA_init_component(void); int main() { - CUDA_init_component(); } ]])], [use_papi_c_cupti=yes], [use_papi_c_cupti=no]) @@ -3329,6 +3327,46 @@ int main() fi +#------------------------------------------------- +# Option: --enable-papi-c-rocm +#------------------------------------------------- + +use_papi_c_rocm=no + +AC_ARG_ENABLE([papi-c-rocm], + [AS_HELP_STRING([--enable-papi-c-rocm], + [use papi ROCM support, if available (default no), requires + papi rocm component])], + [use_papi_c_rocm="$enableval"], + []) + +if test "$use_papi_c" = no || test "$use_papi_c_rocm" != 
yes ; then + use_papi_c_rocm=no +else + AC_MSG_CHECKING([for papi rocm component]) + + ORIG_CFLAGS="$CFLAGS" + ORIG_LIBS="$LIBS" + CFLAGS="$CFLAGS $OPT_PAPI_IFLAGS" + LIBS="$OPT_PAPI_LDFLAGS $papi_extra_libs" + AC_LANG_PUSH([C]) + + AC_LINK_IFELSE([ + AC_LANG_SOURCE([[ +#include "papi.h" +int main() +{ +} +]])], [use_papi_c_rocm=yes], [use_papi_c_rocm=no]) + + AC_LANG_POP + CFLAGS="$ORIG_CFLAGS" + LIBS="$ORIG_LIBS" + + AC_MSG_RESULT([$use_papi_c_rocm]) +fi + + #------------------------------------------------- # Option: --with-perfmon=PATH #------------------------------------------------- @@ -3602,6 +3640,7 @@ then OPT_PAPI_LIBPATH= use_papi_c=no use_papi_c_cupti=no + use_papi_c_rocm=no fi fi @@ -3611,6 +3650,7 @@ AM_CONDITIONAL(OPT_PAPI_DYNAMIC, [test "$OPT_PAPI_DYNAMIC" = yes]) AM_CONDITIONAL(OPT_PAPI_STATIC, [test "$OPT_PAPI_STATIC" = yes]) AM_CONDITIONAL(OPT_PAPI_COMPONENT, [test "$use_papi_c" = yes]) AM_CONDITIONAL(OPT_PAPI_CUPTI, [test "$use_papi_c_cupti" = yes]) +AM_CONDITIONAL(OPT_PAPI_ROCM, [test "$use_papi_c_rocm" = yes]) #------------------------------------------------- @@ -5133,6 +5173,171 @@ AC_SUBST([OPT_GTPIN_IFLAGS]) AC_SUBST([OPT_GTPIN_LIBDIR]) +#------------------------------------------------- +# Option: --with-igc=PATH +#------------------------------------------------- + +IGC=no +OPT_HAVE_IGC=no +OPT_IGC_IFLAGS= +OPT_IGC_LDFLAGS= + +AC_ARG_WITH([igc], + [AS_HELP_STRING([--with-igc=PATH], + [path to igc install directory])], + [IGC="$withval"], + []) + +AC_MSG_CHECKING([for igc]) + +case "$IGC" in + /* ) + if test ! 
-f "${IGC}/include/igc/igc.opencl.h" ; then + AC_MSG_ERROR([unable to find igc.opencl.h in: $IGC]) + fi + OPT_IGC_IFLAGS="-I${IGC}/include" + + IGC_LDFLAGS= # plain shell assignment; no spaces around = + IGA_LDFLAGS= + + for lib in $multilib_path ; do + if test -f "${IGC}/${lib}/libigc.so" ; then + IGC_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -ligc" + break + fi + done + if test "x$IGC_LDFLAGS" = x ; then + AC_MSG_ERROR([unable to find libigc.so in: $IGC]) + fi + + for lib in $multilib_path ; do + if test -f "${IGC}/${lib}/libiga64.so" ; then + IGA_LDFLAGS="-L${IGC}/$lib -Wl,-rpath=${IGC}/$lib -liga64" + break + fi + done + if test "x$IGA_LDFLAGS" = x ; then + AC_MSG_ERROR([unable to find libiga64.so in: $IGC]) + fi + + OPT_IGC_LDFLAGS="${IGC_LDFLAGS} ${IGA_LDFLAGS}" + + OPT_HAVE_IGC=yes + ;; + no ) + ;; + * ) + AC_MSG_ERROR([igc directory must be absolute path: $IGC]) + ;; +esac + +AM_CONDITIONAL([OPT_ENABLE_IGC], [test "$OPT_HAVE_IGC" = yes]) + +AC_MSG_RESULT([$IGC]) + +AC_SUBST([OPT_IGC]) +AC_SUBST([OPT_IGC_IFLAGS]) +AC_SUBST([OPT_IGC_LDFLAGS]) + +#------------------------------------------------- +# Option: --with-metrics-discovery=PATH +#------------------------------------------------- + +METRICS_DISCOVERY=no +OPT_HAVE_METRICS_DISCOVERY=no +OPT_METRICS_DISCOVERY_IFLAGS= +OPT_METRICS_DISCOVERY_LDFLAGS= + +AC_ARG_WITH([metrics-discovery], + [AS_HELP_STRING([--with-metrics-discovery=PATH], + [path to metrics-discovery install directory])], + [METRICS_DISCOVERY="$withval"], + []) + +AC_MSG_CHECKING([for metrics-discovery]) + +case "$METRICS_DISCOVERY" in + /* ) + if test ! 
-f "${METRICS_DISCOVERY}/include/metrics_discovery_api.h" ; then + AC_MSG_ERROR([unable to find metrics_discovery_api.h in: $METRICS_DISCOVERY]) + fi + OPT_METRICS_DISCOVERY_IFLAGS="-I${METRICS_DISCOVERY}/include" + + for lib in $multilib_path ; do + if test -f "${METRICS_DISCOVERY}/${lib}/libmd.so" ; then + OPT_METRICS_DISCOVERY_LDFLAGS="-L${METRICS_DISCOVERY}/$lib -Wl,-rpath=${METRICS_DISCOVERY}/$lib -lmd" + break + fi + done + if test "x$OPT_METRICS_DISCOVERY_LDFLAGS" = x ; then + AC_MSG_ERROR([unable to find libmd.so in: $METRICS_DISCOVERY]) + fi + OPT_HAVE_METRICS_DISCOVERY=yes + ;; + no ) + ;; + * ) + AC_MSG_ERROR([metrics-discovery directory must be absolute path: $METRICS_DISCOVERY]) + ;; +esac + +AM_CONDITIONAL([OPT_ENABLE_METRICS_DISCOVERY], [test "$OPT_HAVE_METRICS_DISCOVERY" = yes]) + +AC_MSG_RESULT([$METRICS_DISCOVERY]) + +AC_SUBST([OPT_METRICS_DISCOVERY]) +AC_SUBST([OPT_METRICS_DISCOVERY_IFLAGS]) +AC_SUBST([OPT_METRICS_DISCOVERY_LDFLAGS]) + +#------------------------------------------------- +# Option: --with-gtpin=PATH +#------------------------------------------------- + +GTPIN=no +OPT_HAVE_GTPIN=no +OPT_GTPIN_IFLAGS= +OPT_GTPIN_LDFLAGS= + +AC_ARG_WITH([gtpin], + [AS_HELP_STRING([--with-gtpin=PATH], + [path to gtpin install directory])], + [GTPIN="$withval"], + []) + +AC_MSG_CHECKING([for gtpin]) + +case "$GTPIN" in + /* ) + if test ! 
-f "${GTPIN}/Profilers/Include/gtpin.h" ; then + AC_MSG_ERROR([unable to find gtpin.h in: $GTPIN]) + fi + OPT_GTPIN_IFLAGS="-I${GTPIN}/Profilers/Include -I${GTPIN}/Profilers/Include/ged/intel64/" + + if test -f "${GTPIN}/Profilers/Lib/intel64/libgtpin.so" ; then + OPT_GTPIN_LDFLAGS="-L${GTPIN}/Profilers/Lib/intel64/ -Wl,-rpath=${GTPIN}/Profilers/Lib/intel64/ -lgtpin" + fi + + if test "x$OPT_GTPIN_LDFLAGS" = x ; then + AC_MSG_ERROR([unable to find libgtpin.so in: $GTPIN]) + fi + OPT_HAVE_GTPIN=yes + ;; + no ) + ;; + * ) + AC_MSG_ERROR([gtpin directory must be absolute path: $GTPIN]) + ;; +esac + +AM_CONDITIONAL([OPT_ENABLE_GTPIN], [test "$OPT_HAVE_GTPIN" = yes]) + +AC_MSG_RESULT([$GTPIN]) + +AC_SUBST([OPT_GTPIN]) +AC_SUBST([OPT_GTPIN_IFLAGS]) +AC_SUBST([OPT_GTPIN_LDFLAGS]) + + #------------------------------------------------- # Option: --with-rocm=PATH #------------------------------------------------- @@ -5148,6 +5353,8 @@ ROCM= ROCM_HIP= ROCM_DBGAPI= ROCM_TRACER= +ROCM_PROFILER= +ROCM_HSA= AC_ARG_WITH([rocm], AS_HELP_STRING([--with-rocm=PATH], @@ -5169,17 +5376,35 @@ AC_ARG_WITH([rocm-tracer], [path to roctracer-dev install directory]), [ROCM_TRACER="$withval"], []) +AC_ARG_WITH([rocm-profiler], + AS_HELP_STRING([--with-rocm-profiler=PATH], + [path to rocprofiler-dev install directory]), + [ROCM_PROFILER="$withval"], []) + +AC_ARG_WITH([rocm-hsa], + AS_HELP_STRING([--with-rocm-hsa=PATH], + [path to hsa-dev install directory]), + [ROCM_HSA="$withval"], []) + + + ROCM_HIP_IFLAGS= ROCM_DBGAPI_IFLAGS= ROCM_TRACER_IFLAGS= +ROCM_PROFILER_IFLAGS= +ROCM_HSA_IFLAGS= ROCM_HIP_LD_DIR= ROCM_DBGAPI_LD_DIR= ROCM_TRACER_LD_DIR= +ROCM_PROFILER_LD_DIR= +ROCM_HSA_LD_DIR= ROCM_HIP_MESG= ROCM_DBGAPI_MESG= ROCM_TRACER_MESG= +ROCM_PROFILER_MESG= +ROCM_HSA_MESG= require_rocm=no @@ -5235,6 +5460,34 @@ case "$ROCM" in found=yes fi + # ROCPROFILER + if test -f "$ROCM/rocprofiler/include/rocprofiler.h" ; then + AC_MSG_NOTICE([found $ROCM/rocprofiler/include/rocprofiler.h]) + 
ROCM_PROFILER_IFLAGS="-I$ROCM/rocprofiler/include" + ROCM_PROFILER_MESG="$ROCM/rocprofiler" + found=yes + fi + if test -f "$ROCM/rocprofiler/lib/librocprofiler64.so" ; then + AC_MSG_NOTICE([found $ROCM/rocprofiler/lib/librocprofiler64.so]) + ROCM_PROFILER_LD_DIR="$ROCM/rocprofiler/lib" + ROCM_PROFILER_MESG="$ROCM/rocprofiler" + found=yes + fi + + # HSA + if test -f "$ROCM/hsa/include/hsa/hsa.h" ; then + AC_MSG_NOTICE([found $ROCM/hsa/include/hsa/hsa.h]) + ROCM_HSA_IFLAGS="-I$ROCM/hsa/include/hsa" + ROCM_HSA_MESG="$ROCM/hsa" + found=yes + fi + if test -f "$ROCM/hsa/lib/libhsa-runtime64.so" ; then + AC_MSG_NOTICE([found $ROCM/hsa/lib/libhsa-runtime64.so]) + ROCM_HSA_LD_DIR="$ROCM/hsa/lib" + ROCM_HSA_MESG="$ROCM/hsa" + found=yes + fi + # warn if given dir has nothing useful if test "$found" = no ; then AC_MSG_WARN([found nothing useful in $ROCM]) @@ -5326,6 +5579,58 @@ case "$ROCM_TRACER" in ;; esac +case "$ROCM_PROFILER" in + /* ) + require_rocm=yes + found=no + + if test -f "$ROCM_PROFILER/rocprofiler/include/rocprofiler.h" ; then + AC_MSG_NOTICE([found $ROCM_PROFILER/rocprofiler/include/rocprofiler.h]) + ROCM_PROFILER_IFLAGS="-I$ROCM_PROFILER/rocprofiler/include" + ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler" + found=yes + fi + if test -f "$ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so" ; then + AC_MSG_NOTICE([found $ROCM_PROFILER/rocprofiler/lib/librocprofiler64.so]) + ROCM_PROFILER_LD_DIR="$ROCM_PROFILER/rocprofiler/lib" + ROCM_PROFILER_MESG="$ROCM_PROFILER/rocprofiler" + found=yes + fi + if test "$found" = no ; then + AC_MSG_WARN([found nothing useful in $ROCM_PROFILER]) + fi + ;; + * ) + ROCM_PROFILER=no + ;; +esac + +case "$ROCM_HSA" in + /* ) + require_rocm=yes + found=no + + if test -f "$ROCM_HSA/include/hsa/hsa.h" ; then + AC_MSG_NOTICE([found $ROCM_HSA/include/hsa/hsa.h]) + ROCM_HSA_IFLAGS="-I$ROCM_HSA/include/hsa" + ROCM_HSA_MESG="$ROCM_HSA" + found=yes + fi + if test -f "$ROCM_HSA/lib/libhsa-runtime64.so" ; then + AC_MSG_NOTICE([found 
$ROCM_HSA/lib/libhsa-runtime64.so]) + ROCM_HSA_LD_DIR="$ROCM_HSA/lib" + ROCM_HSA_MESG="$ROCM_HSA" + found=yes + fi + if test "$found" = no ; then + AC_MSG_WARN([found nothing useful in $ROCM_HSA]) + fi + ;; + * ) + ROCM_HSA=no + ;; +esac + # # Check that we found all the pieces. # @@ -5342,6 +5647,12 @@ then if test "x$ROCM_TRACER_IFLAGS" = x ; then AC_MSG_ERROR([unable to find roctracer_hip.h]) fi + if test "x$ROCM_PROFILER_IFLAGS" = x ; then + AC_MSG_ERROR([unable to find rocprofiler.h]) + fi + if test "x$ROCM_HSA_IFLAGS" = x ; then + AC_MSG_ERROR([unable to find hsa.h]) + fi if test "x$ROCM_HIP_LD_DIR" = x ; then AC_MSG_ERROR([unable to find libamdhip64.so]) @@ -5352,10 +5663,16 @@ then if test "x$ROCM_TRACER_LD_DIR" = x ; then AC_MSG_ERROR([unable to find libroctracer64.so]) fi + if test "x$ROCM_PROFILER_LD_DIR" = x ; then + AC_MSG_ERROR([unable to find librocprofiler64.so]) + fi + if test "x$ROCM_HSA_LD_DIR" = x ; then + AC_MSG_ERROR([unable to find libhsa-runtime64.so]) + fi OPT_HAVE_ROCM=yes - OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS" - OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}" + OPT_ROCM_IFLAGS="$ROCM_HIP_IFLAGS $ROCM_DBGAPI_IFLAGS $ROCM_TRACER_IFLAGS $ROCM_PROFILER_IFLAGS $ROCM_HSA_IFLAGS" + OPT_ROCM_LD_LIB_PATH="${ROCM_HIP_LD_DIR}:${ROCM_DBGAPI_LD_DIR}:${ROCM_TRACER_LD_DIR}:${ROCM_PROFILER_LD_DIR}:${ROCM_HSA_LD_DIR}" fi # @@ -5396,6 +5713,7 @@ AM_CONDITIONAL([OPT_ENABLE_ROCM], [test "$OPT_HAVE_ROCM" = yes]) AC_SUBST([OPT_ROCM_IFLAGS]) AC_SUBST([OPT_ROCM_LD_LIB_PATH]) +AC_SUBST([ROCM_PROFILER_LD_DIR]) #------------------------------------------------- @@ -5795,11 +6113,14 @@ AC_MSG_NOTICE([ igc: ${IGC}]) AC_MSG_NOTICE([ gtpin: ${GTPIN}]) AC_MSG_NOTICE([ metrics-discovery: ${METRICS_DISCOVERY}]) AC_MSG_NOTICE([ papi-c-cupti: ${use_papi_c_cupti}]) +AC_MSG_NOTICE([ papi-c-rocm: ${use_papi_c_rocm}]) AC_MSG_NOTICE([ rocm: ${rocm_mesg}]) if test "$OPT_HAVE_ROCM" = yes ; then 
AC_MSG_NOTICE([ rocm hip: $ROCM_HIP_MESG]) AC_MSG_NOTICE([ rocm dbgapi: $ROCM_DBGAPI_MESG]) AC_MSG_NOTICE([ rocm tracer: $ROCM_TRACER_MESG]) + AC_MSG_NOTICE([ rocm profiler:$ROCM_PROFILER_MESG]) + AC_MSG_NOTICE([ rocm hsa: $ROCM_HSA_MESG]) fi AC_MSG_NOTICE([ valgrind: ${VALGRIND}]) AC_MSG_NOTICE([ valgrind: annotated: ${OPT_ENABLE_VG_ANNOTATIONS}]) diff --git a/doc/Makefile.in b/doc/Makefile.in index c8196892ef..a14a5bdf49 100644 --- a/doc/Makefile.in +++ b/doc/Makefile.in @@ -354,6 +354,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -391,6 +392,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in index 67cb362dc1..33c0aea877 100644 --- a/doc/man/Makefile.in +++ b/doc/man/Makefile.in @@ -297,6 +297,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -334,6 +335,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/doc/manual/Makefile.in b/doc/manual/Makefile.in index a804f017b1..4ed1616ca5 100644 --- a/doc/manual/Makefile.in +++ b/doc/manual/Makefile.in @@ -294,6 +294,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ 
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -331,6 +332,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/doc/www/Makefile.in b/doc/www/Makefile.in index 6a3e8c3784..d8f839f9f3 100644 --- a/doc/www/Makefile.in +++ b/doc/www/Makefile.in @@ -294,6 +294,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -331,6 +332,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/doc/www/download.html b/doc/www/download.html index 3a562bcc4b..e4cd235dcd 100644 --- a/doc/www/download.html +++ b/doc/www/download.html @@ -69,28 +69,29 @@

Installing Java 11

  • Set JAVA_HOME environment variable to the installed directory
  • - - + +Additional Mac OS directions if other Java versions are installed: + + + -

    Latest Release

    +

    Latest release

    - + - - - - - + + + + + +
    OS Platform Download link
    OS Processor Download link
    Linux x86-64 hpcviewer-linux.gtk.x86_64.tgz
    Linux Power Little endian 64 hpcviewer-linux.gtk.ppc64le.tgz
    Linux ARM 64 (experimental) hpcviewer-linux.gtk.aarch64.tgz
    MacOS x86-64 hpcviewer-macosx.cocoa.x86_64.dmg
    Windows x86-64 hpcviewer-win32.win32.x86_64.zip
    Linux x86_64 hpcviewer-linux.gtk.x86_64.tgz
    Linux ppcle hpcviewer-linux.gtk.ppc64le.tgz
    Linux aarch64 hpcviewer-linux.gtk.aarch64.tgz
    MacOS x86_64 hpcviewer-macosx.cocoa.x86_64.dmg
    MacOS aarch64 hpcviewer-macosx.cocoa.aarch64.dmg
    Windows x86_64 hpcviewer-win32.win32.x86_64.zip
    diff --git a/lib/Makefile.in b/lib/Makefile.in index 92e797fb7c..8e67265146 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -293,6 +293,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -330,6 +331,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/Makefile.in b/src/Makefile.in index 168fc19cc0..d353bb7012 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -323,6 +323,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -360,6 +361,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/extern/Makefile.in b/src/extern/Makefile.in index 6487c6533c..2950b9ba80 100644 --- a/src/extern/Makefile.in +++ b/src/extern/Makefile.in @@ -332,6 +332,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -369,6 +370,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = 
@ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/extern/libunwind/Makefile.in b/src/extern/libunwind/Makefile.in index 07e5213148..b4b2172587 100644 --- a/src/extern/libunwind/Makefile.in +++ b/src/extern/libunwind/Makefile.in @@ -267,6 +267,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -304,6 +305,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/extern/lzma/Makefile.in b/src/extern/lzma/Makefile.in index c8649688a4..aa24d2ab9b 100644 --- a/src/extern/lzma/Makefile.in +++ b/src/extern/lzma/Makefile.in @@ -267,6 +267,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -304,6 +305,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/Makefile.in b/src/lib/Makefile.in index e3627010c1..4889bb6d06 100644 --- a/src/lib/Makefile.in +++ b/src/lib/Makefile.in @@ -336,6 +336,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -373,6 +374,7 @@ 
PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/analysis/Makefile.in b/src/lib/analysis/Makefile.in index 56c6d6f61f..68d1dcf0df 100644 --- a/src/lib/analysis/Makefile.in +++ b/src/lib/analysis/Makefile.in @@ -371,6 +371,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -408,6 +409,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/banal/Makefile.in b/src/lib/banal/Makefile.in index 6398fc3e67..caa556f5c0 100644 --- a/src/lib/banal/Makefile.in +++ b/src/lib/banal/Makefile.in @@ -367,6 +367,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -404,6 +405,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/banal/Struct.cpp b/src/lib/banal/Struct.cpp index 9c3f32f49a..1be8bd3d28 100644 --- a/src/lib/banal/Struct.cpp +++ b/src/lib/banal/Struct.cpp @@ -117,6 +117,7 @@ #include "gpu/ReadCudaCFG.hpp" + #ifdef ENABLE_IGC #include "gpu/ReadIntelCFG.hpp" #endif // ENABLE_IGC diff --git 
a/src/lib/banal/gpu/ReadIntelCFG.cpp b/src/lib/banal/gpu/ReadIntelCFG.cpp index 0df0d6232b..71c5868a6e 100644 --- a/src/lib/banal/gpu/ReadIntelCFG.cpp +++ b/src/lib/banal/gpu/ReadIntelCFG.cpp @@ -45,6 +45,7 @@ //*************************************************************************** + #ifdef ENABLE_IGC //****************************************************************************** @@ -300,4 +301,5 @@ readIntelCFG return false; } + #endif // ENABLE_IGC diff --git a/src/lib/binutils/Makefile.in b/src/lib/binutils/Makefile.in index ea27efd04c..dbfa265f59 100644 --- a/src/lib/binutils/Makefile.in +++ b/src/lib/binutils/Makefile.in @@ -387,6 +387,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -424,6 +425,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/binutils/intel/gen_binary_decoder.h b/src/lib/binutils/intel/gen_binary_decoder.h index 1e6a85dd08..2f2b2f3fde 100644 --- a/src/lib/binutils/intel/gen_binary_decoder.h +++ b/src/lib/binutils/intel/gen_binary_decoder.h @@ -29,6 +29,7 @@ #include + #ifdef ENABLE_IGC #include @@ -74,6 +75,7 @@ class GenBinaryDecoder { private: KernelView kernel_view_; }; + #endif // ENABLE_IGC #endif // PTI_SAMPLES_UTILS_GEN_BINARY_DECODER_H_ diff --git a/src/lib/isa/Makefile.in b/src/lib/isa/Makefile.in index 3faa223f42..9f0ff691f8 100644 --- a/src/lib/isa/Makefile.in +++ b/src/lib/isa/Makefile.in @@ -364,6 +364,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ 
OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -401,6 +402,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/prof-lean/Makefile.in b/src/lib/prof-lean/Makefile.in index 72deb89952..dc73f044c1 100644 --- a/src/lib/prof-lean/Makefile.in +++ b/src/lib/prof-lean/Makefile.in @@ -369,6 +369,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -406,6 +407,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/prof-lean/crypto-hash.h b/src/lib/prof-lean/crypto-hash.h index 7a18f59b82..d212583670 100644 --- a/src/lib/prof-lean/crypto-hash.h +++ b/src/lib/prof-lean/crypto-hash.h @@ -159,6 +159,10 @@ crypto_hash_self_test int verbose ); +#if defined(__cplusplus) +} +#endif + #endif #if defined(__cplusplus) diff --git a/src/lib/prof/Makefile.in b/src/lib/prof/Makefile.in index 16af4067ad..33a9fdb0d9 100644 --- a/src/lib/prof/Makefile.in +++ b/src/lib/prof/Makefile.in @@ -375,6 +375,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -412,6 +413,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = 
@RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/profile/Makefile.in b/src/lib/profile/Makefile.in index a1f68e6195..9c8ac842a6 100644 --- a/src/lib/profile/Makefile.in +++ b/src/lib/profile/Makefile.in @@ -406,6 +406,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -443,6 +444,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/profxml/Makefile.in b/src/lib/profxml/Makefile.in index d7285ecb6c..baef3c5eeb 100644 --- a/src/lib/profxml/Makefile.in +++ b/src/lib/profxml/Makefile.in @@ -369,6 +369,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -406,6 +407,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/stubs-gcc_s/Makefile.in b/src/lib/stubs-gcc_s/Makefile.in index 9c780b3dca..98f067d869 100644 --- a/src/lib/stubs-gcc_s/Makefile.in +++ b/src/lib/stubs-gcc_s/Makefile.in @@ -347,6 +347,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ 
OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -384,6 +385,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/support-lean/Makefile.in b/src/lib/support-lean/Makefile.in index 35ffa8c0a6..2636565045 100644 --- a/src/lib/support-lean/Makefile.in +++ b/src/lib/support-lean/Makefile.in @@ -353,6 +353,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -390,6 +391,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/support/Makefile.in b/src/lib/support/Makefile.in index d4f1edf599..10b2a09865 100644 --- a/src/lib/support/Makefile.in +++ b/src/lib/support/Makefile.in @@ -381,6 +381,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -418,6 +419,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/lib/xml/Makefile.in b/src/lib/xml/Makefile.in index 50c80235a4..c5cd173841 100644 --- a/src/lib/xml/Makefile.in +++ b/src/lib/xml/Makefile.in @@ -366,6 +366,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = 
@OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -403,6 +404,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/Makefile.in b/src/tool/Makefile.in index ae0baf8570..465bddb363 100644 --- a/src/tool/Makefile.in +++ b/src/tool/Makefile.in @@ -341,6 +341,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -378,6 +379,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcfnbounds/Makefile.in b/src/tool/hpcfnbounds/Makefile.in index 04543be278..caccfb848d 100644 --- a/src/tool/hpcfnbounds/Makefile.in +++ b/src/tool/hpcfnbounds/Makefile.in @@ -450,6 +450,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -487,6 +488,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcfnbounds2/Makefile.in b/src/tool/hpcfnbounds2/Makefile.in index 
3b727fb413..9828167843 100644 --- a/src/tool/hpcfnbounds2/Makefile.in +++ b/src/tool/hpcfnbounds2/Makefile.in @@ -348,6 +348,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -385,6 +386,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpclump/Makefile.in b/src/tool/hpclump/Makefile.in index 8c2f7b688b..fff69c91ee 100644 --- a/src/tool/hpclump/Makefile.in +++ b/src/tool/hpclump/Makefile.in @@ -382,6 +382,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -419,6 +420,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcprof-flat/Makefile.in b/src/tool/hpcprof-flat/Makefile.in index fbc6ce1017..8fdc2ad6c3 100644 --- a/src/tool/hpcprof-flat/Makefile.in +++ b/src/tool/hpcprof-flat/Makefile.in @@ -416,6 +416,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -453,6 +454,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = 
@PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcprof-mpi/Makefile.in b/src/tool/hpcprof-mpi/Makefile.in index a9d6ecaa97..5a263b3717 100644 --- a/src/tool/hpcprof-mpi/Makefile.in +++ b/src/tool/hpcprof-mpi/Makefile.in @@ -416,6 +416,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -453,6 +454,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcprof/Makefile.in b/src/tool/hpcprof/Makefile.in index f675a9e44f..3a16a2826d 100644 --- a/src/tool/hpcprof/Makefile.in +++ b/src/tool/hpcprof/Makefile.in @@ -414,6 +414,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -451,6 +452,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcprof2-mpi/Makefile.in b/src/tool/hpcprof2-mpi/Makefile.in index 96836cd011..5d7363e3f3 100644 --- a/src/tool/hpcprof2-mpi/Makefile.in +++ b/src/tool/hpcprof2-mpi/Makefile.in @@ -359,6 +359,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = 
@OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -396,6 +397,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcprof2/Makefile.in b/src/tool/hpcprof2/Makefile.in index b40dfeef89..40dd29a49a 100644 --- a/src/tool/hpcprof2/Makefile.in +++ b/src/tool/hpcprof2/Makefile.in @@ -351,6 +351,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -388,6 +389,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcproftt/Makefile.in b/src/tool/hpcproftt/Makefile.in index 4c305011e7..ca04ad8c53 100644 --- a/src/tool/hpcproftt/Makefile.in +++ b/src/tool/hpcproftt/Makefile.in @@ -417,6 +417,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -454,6 +455,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcrun-flat/Makefile.in b/src/tool/hpcrun-flat/Makefile.in index 10f0c788b8..cc063d5a5e 100644 --- a/src/tool/hpcrun-flat/Makefile.in +++ 
b/src/tool/hpcrun-flat/Makefile.in @@ -411,6 +411,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -448,6 +449,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcrun/Makefile.am b/src/tool/hpcrun/Makefile.am index a30590f819..7c2be6110d 100644 --- a/src/tool/hpcrun/Makefile.am +++ b/src/tool/hpcrun/Makefile.am @@ -163,6 +163,9 @@ XED2_INC = @XED2_INC@ XED2_HPCRUN_LIBS = @XED2_HPCRUN_LIBS@ XED2_HPCLINK_LIBS = @XED2_HPCLINK_LIBS@ CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@ + +ROCM_INC_FLGS = @OPT_ROCM_IFLAGS@ + OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@ CUPTI_LD_FLGS = @OPT_CUPTI_LDFLAGS@ CUPTI_BASE = @OPT_CUPTI@ @@ -296,6 +299,7 @@ MY_BASE_FILES = \ sample_event.c \ sample_prob.c \ sample_sources_all.c \ + tool_state.c \ sample-sources/blame-shift/blame-shift.c \ sample-sources/blame-shift/blame-map.c \ sample-sources/blame-shift/directed.c \ @@ -325,6 +329,7 @@ MY_BASE_FILES = \ control-knob.c \ control-knob.h \ device-finalizers.c \ + gpu-monitors.c \ device-initializers.c \ module-ignore-map.c \ threadmgr.c \ @@ -387,6 +392,10 @@ MY_BASE_FILES = \ gpu/gpu-trace-channel-set.c \ gpu/gpu-trace-demultiplexer.c \ \ + gpu/ompt/ompt-gpu-api.c \ + gpu/ompt/ompt-activity-translate.c \ + sample-sources/openmp-target.c \ + \ ompt/ompt-callstack.c \ ompt/ompt-defer.c \ ompt/ompt-device.c \ @@ -503,6 +512,11 @@ if OPT_PAPI_CUPTI MY_PAPI_FILES += sample-sources/papi-c-cupti.c endif +if OPT_PAPI_ROCM + MY_PAPI_FILES += sample-sources/papi-c-rocm.c +endif + + if OPT_ENABLE_CUPTI MY_CUPTI_FILES = sample-sources/nvidia.c \ gpu/nvidia/cubin-hash-map.c \ @@ 
-542,9 +556,11 @@ endif if OPT_ENABLE_ROCM MY_ROCM_FILES =\ sample-sources/amd.c \ + sample-sources/amd-rocprofiler.c \ + gpu/amd/hip-api.c \ gpu/amd/roctracer-activity-translate.c \ gpu/amd/roctracer-api.c \ - gpu/amd/rocm-debug-api.c \ + gpu/amd/rocprofiler-api.c \ gpu/amd/rocm-binary-processing.c endif @@ -590,6 +606,7 @@ MY_INCLUDE_DIRS = \ -I$(HPCFNBOUNDS_INC) \ $(OPT_CUDA_IFLAGS) \ $(OPT_CUPTI_IFLAGS) \ + $(ROCM_INC_FLGS) \ -I$(LIBELF_INC) \ -I$(LIBMONITOR_INC) \ $(GOTCHA_IFLAGS) \ @@ -991,11 +1008,16 @@ libhpcrun_la_CPPFLAGS += $(CUPTI_INC_FLGS) MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_CUPTI endif +if OPT_PAPI_ROCM +libhpcrun_la_CPPFLAGS += $(ROCM_INC_FLGS) +MY_CPP_DEFINES += -DHPCRUN_SS_PAPI_C_ROCM +endif + if OPT_PAPI_STATIC libhpcrun_o_SOURCES += $(MY_PAPI_FILES) libhpcrun_o_CPPFLAGS += $(PAPI_INC_FLGS) libhpcrun_o_LDADD += $(OPT_PAPI_LIBS_STAT) - + MY_CPP_DEFINES += -DHPCRUN_SS_PAPI endif @@ -1038,11 +1060,10 @@ if OPT_ENABLE_CUDA libhpcrun_o_SOURCES += $(MY_CUDA_FILES) endif - if OPT_ENABLE_ROCM libhpcrun_la_SOURCES += $(MY_ROCM_FILES) libhpcrun_la_CPPFLAGS += -DENABLE_ROCM - libhpcrun_la_CFLAGS += $(OPT_ROCM_IFLAGS) + libhpcrun_la_CFLAGS += $(ROCM_INC_FLGS) MY_CPP_DEFINES += -DHPCRUN_SS_AMD endif @@ -1177,7 +1198,7 @@ endif # Don't use LDFLAGS for static case. 
MONITOR_NAMES = -G 'monitor_*' -HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' +HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp' -G 'OnLoad' -G 'OnUnloadTool' MISC_NAMES = -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*' diff --git a/src/tool/hpcrun/Makefile.in b/src/tool/hpcrun/Makefile.in index 13aaf4d908..afb6c56216 100644 --- a/src/tool/hpcrun/Makefile.in +++ b/src/tool/hpcrun/Makefile.in @@ -168,25 +168,25 @@ host_triplet = @host@ @OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_14 = sample-sources/perf/kernel_blocking.c @OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__append_15 = sample-sources/perf/kernel_blocking_stub.c @OPT_PAPI_CUPTI_TRUE@am__append_16 = sample-sources/papi-c-cupti.c -@OPT_ENABLE_OPENCL_TRUE@am__append_17 = libhpcrun_opencl.la -@OPT_ENABLE_LEVEL0_TRUE@am__append_18 = libhpcrun_level0.la +@OPT_PAPI_ROCM_TRUE@am__append_17 = sample-sources/papi-c-rocm.c +@OPT_ENABLE_OPENCL_TRUE@am__append_18 = libhpcrun_opencl.la +@OPT_ENABLE_LEVEL0_TRUE@am__append_19 = libhpcrun_level0.la # # BG/Q backend requires special treatment to avoid deadlocks # -@OPT_BGQ_BACKEND_TRUE@am__append_19 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD -@OPT_BGQ_BACKEND_TRUE@am__append_20 = -I$(srcdir)/utilities/bgq-cnk +@OPT_BGQ_BACKEND_TRUE@am__append_20 = -DUSE_HW_THREAD_ID -DNONZERO_THRESHOLD @OPT_BGQ_BACKEND_TRUE@am__append_21 = -I$(srcdir)/utilities/bgq-cnk -@OPT_ENABLE_MPI_WRAP_TRUE@am__append_22 = mpi-overrides.c +@OPT_BGQ_BACKEND_TRUE@am__append_22 = -I$(srcdir)/utilities/bgq-cnk @OPT_ENABLE_MPI_WRAP_TRUE@am__append_23 = mpi-overrides.c +@OPT_ENABLE_MPI_WRAP_TRUE@am__append_24 = mpi-overrides.c #----------------------------------------------------------- # whirled peas #----------------------------------------------------------- -@HOST_OS_LINUX_TRUE@am__append_24 = $(MY_LINUX_DYNAMIC_FILES) 
-@HOST_CPU_MIPS_TRUE@am__append_25 = $(MY_MIPS_FILES) +@HOST_OS_LINUX_TRUE@am__append_25 = $(MY_LINUX_DYNAMIC_FILES) @HOST_CPU_MIPS_TRUE@am__append_26 = $(MY_MIPS_FILES) -@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_INCLUDE_DIRS) +@HOST_CPU_MIPS_TRUE@am__append_27 = $(MY_MIPS_FILES) @HOST_CPU_MIPS_TRUE@am__append_28 = $(MY_MIPS_INCLUDE_DIRS) @HOST_CPU_MIPS_TRUE@am__append_29 = $(MY_MIPS_INCLUDE_DIRS) @HOST_CPU_MIPS_TRUE@am__append_30 = $(MY_MIPS_INCLUDE_DIRS) @@ -197,15 +197,15 @@ host_triplet = @host@ @HOST_CPU_MIPS_TRUE@am__append_35 = $(MY_MIPS_INCLUDE_DIRS) @HOST_CPU_MIPS_TRUE@am__append_36 = $(MY_MIPS_INCLUDE_DIRS) @HOST_CPU_MIPS_TRUE@am__append_37 = $(MY_MIPS_INCLUDE_DIRS) +@HOST_CPU_MIPS_TRUE@am__append_38 = $(MY_MIPS_INCLUDE_DIRS) # Note: setting CCASFLAGS here is a no-op hack with the side effect of # prefixing the tramp.s file names so they will be compiled separately # for .o and .so targets. CFLAGS does this for the .c files, but # CFLAGS doesn't apply to .s files. See the automake docs section # 8.3.9.2, Objects created with both libtool and without. 
-@HOST_CPU_PPC_TRUE@am__append_38 = $(MY_PPC_FILES) @HOST_CPU_PPC_TRUE@am__append_39 = $(MY_PPC_FILES) -@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_INCLUDE_DIRS) +@HOST_CPU_PPC_TRUE@am__append_40 = $(MY_PPC_FILES) @HOST_CPU_PPC_TRUE@am__append_41 = $(MY_PPC_INCLUDE_DIRS) @HOST_CPU_PPC_TRUE@am__append_42 = $(MY_PPC_INCLUDE_DIRS) @HOST_CPU_PPC_TRUE@am__append_43 = $(MY_PPC_INCLUDE_DIRS) @@ -218,13 +218,13 @@ host_triplet = @host@ @HOST_CPU_PPC_TRUE@am__append_50 = $(MY_PPC_INCLUDE_DIRS) @HOST_CPU_PPC_TRUE@am__append_51 = $(MY_PPC_INCLUDE_DIRS) @HOST_CPU_PPC_TRUE@am__append_52 = $(MY_PPC_INCLUDE_DIRS) -@HOST_CPU_X86_FAMILY_TRUE@am__append_53 = $(MY_X86_FILES) +@HOST_CPU_PPC_TRUE@am__append_53 = $(MY_PPC_INCLUDE_DIRS) @HOST_CPU_X86_FAMILY_TRUE@am__append_54 = $(MY_X86_FILES) -@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_INCLUDE_DIRS) +@HOST_CPU_X86_FAMILY_TRUE@am__append_55 = $(MY_X86_FILES) @HOST_CPU_X86_FAMILY_TRUE@am__append_56 = $(MY_X86_INCLUDE_DIRS) -@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(XED2_HPCRUN_LIBS) -@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCLINK_LIBS) -@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(MY_X86_INCLUDE_DIRS) +@HOST_CPU_X86_FAMILY_TRUE@am__append_57 = $(MY_X86_INCLUDE_DIRS) +@HOST_CPU_X86_FAMILY_TRUE@am__append_58 = $(XED2_HPCRUN_LIBS) +@HOST_CPU_X86_FAMILY_TRUE@am__append_59 = $(XED2_HPCLINK_LIBS) @HOST_CPU_X86_FAMILY_TRUE@am__append_60 = $(MY_X86_INCLUDE_DIRS) @HOST_CPU_X86_FAMILY_TRUE@am__append_61 = $(MY_X86_INCLUDE_DIRS) @HOST_CPU_X86_FAMILY_TRUE@am__append_62 = $(MY_X86_INCLUDE_DIRS) @@ -236,9 +236,9 @@ host_triplet = @host@ @HOST_CPU_X86_FAMILY_TRUE@am__append_68 = $(MY_X86_INCLUDE_DIRS) @HOST_CPU_X86_FAMILY_TRUE@am__append_69 = $(MY_X86_INCLUDE_DIRS) @HOST_CPU_X86_FAMILY_TRUE@am__append_70 = $(MY_X86_INCLUDE_DIRS) -@HOST_CPU_IA64_TRUE@am__append_71 = $(MY_IA64_FILES) +@HOST_CPU_X86_FAMILY_TRUE@am__append_71 = $(MY_X86_INCLUDE_DIRS) @HOST_CPU_IA64_TRUE@am__append_72 = $(MY_IA64_FILES) -@HOST_CPU_IA64_TRUE@am__append_73 = 
$(MY_IA64_INCLUDE_DIRS) +@HOST_CPU_IA64_TRUE@am__append_73 = $(MY_IA64_FILES) @HOST_CPU_IA64_TRUE@am__append_74 = $(MY_IA64_INCLUDE_DIRS) @HOST_CPU_IA64_TRUE@am__append_75 = $(MY_IA64_INCLUDE_DIRS) @HOST_CPU_IA64_TRUE@am__append_76 = $(MY_IA64_INCLUDE_DIRS) @@ -249,9 +249,9 @@ host_triplet = @host@ @HOST_CPU_IA64_TRUE@am__append_81 = $(MY_IA64_INCLUDE_DIRS) @HOST_CPU_IA64_TRUE@am__append_82 = $(MY_IA64_INCLUDE_DIRS) @HOST_CPU_IA64_TRUE@am__append_83 = $(MY_IA64_INCLUDE_DIRS) -@HOST_CPU_AARCH64_TRUE@am__append_84 = $(MY_AARCH64_FILES) +@HOST_CPU_IA64_TRUE@am__append_84 = $(MY_IA64_INCLUDE_DIRS) @HOST_CPU_AARCH64_TRUE@am__append_85 = $(MY_AARCH64_FILES) -@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_INCLUDE_DIRS) +@HOST_CPU_AARCH64_TRUE@am__append_86 = $(MY_AARCH64_FILES) @HOST_CPU_AARCH64_TRUE@am__append_87 = $(MY_AARCH64_INCLUDE_DIRS) @HOST_CPU_AARCH64_TRUE@am__append_88 = $(MY_AARCH64_INCLUDE_DIRS) @HOST_CPU_AARCH64_TRUE@am__append_89 = $(MY_AARCH64_INCLUDE_DIRS) @@ -264,49 +264,52 @@ host_triplet = @host@ @HOST_CPU_AARCH64_TRUE@am__append_96 = $(MY_AARCH64_INCLUDE_DIRS) @HOST_CPU_AARCH64_TRUE@am__append_97 = $(MY_AARCH64_INCLUDE_DIRS) @HOST_CPU_AARCH64_TRUE@am__append_98 = $(MY_AARCH64_INCLUDE_DIRS) -@OPT_PAPI_DYNAMIC_TRUE@am__append_99 = $(MY_PAPI_FILES) -@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(PAPI_INC_FLGS) -@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_LD_FLGS) -@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = -DHPCRUN_SS_PAPI -@OPT_ENABLE_CUPTI_TRUE@am__append_103 = $(MY_CUPTI_FILES) +@HOST_CPU_AARCH64_TRUE@am__append_99 = $(MY_AARCH64_INCLUDE_DIRS) +@OPT_PAPI_DYNAMIC_TRUE@am__append_100 = $(MY_PAPI_FILES) +@OPT_PAPI_DYNAMIC_TRUE@am__append_101 = $(PAPI_INC_FLGS) +@OPT_PAPI_DYNAMIC_TRUE@am__append_102 = $(PAPI_LD_FLGS) +@OPT_PAPI_DYNAMIC_TRUE@am__append_103 = -DHPCRUN_SS_PAPI @OPT_ENABLE_CUPTI_TRUE@am__append_104 = $(MY_CUPTI_FILES) -@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(CUPTI_INC_FLGS) -@OPT_ENABLE_CUPTI_TRUE@am__append_106 = -DHPCRUN_SS_NVIDIA 
-@OPT_PAPI_CUPTI_TRUE@am__append_107 = $(CUPTI_INC_FLGS) -@OPT_PAPI_CUPTI_TRUE@am__append_108 = -DHPCRUN_SS_PAPI_C_CUPTI -@OPT_PAPI_STATIC_TRUE@am__append_109 = $(MY_PAPI_FILES) -@OPT_PAPI_STATIC_TRUE@am__append_110 = $(PAPI_INC_FLGS) -@OPT_PAPI_STATIC_TRUE@am__append_111 = $(OPT_PAPI_LIBS_STAT) -@OPT_PAPI_STATIC_TRUE@am__append_112 = -DHPCRUN_SS_PAPI -@OPT_ENABLE_UPC_TRUE@am__append_113 = $(MY_UPC_FILES) -@OPT_ENABLE_UPC_TRUE@am__append_114 = $(MY_UPC_FILES) -@OPT_ENABLE_UPC_TRUE@am__append_115 = $(OPT_UPC_IFLAGS) -@OPT_ENABLE_UPC_TRUE@am__append_116 = $(OPT_UPC_IFLAGS) -@OPT_ENABLE_UPC_TRUE@am__append_117 = $(OPT_UPC_LDFLAGS) -@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_118 = -DLUSH_PTHREADS -@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_119 = -DLUSH_PTHREADS -@OPT_ENABLE_CUDA_TRUE@am__append_120 = $(MY_CUDA_FILES) -@OPT_ENABLE_CUDA_TRUE@am__append_121 = -DENABLE_CUDA -@OPT_ENABLE_CUDA_TRUE@am__append_122 = $(OPT_CUDA_IFLAGS) +@OPT_ENABLE_CUPTI_TRUE@am__append_105 = $(MY_CUPTI_FILES) +@OPT_ENABLE_CUPTI_TRUE@am__append_106 = $(CUPTI_INC_FLGS) +@OPT_ENABLE_CUPTI_TRUE@am__append_107 = -DHPCRUN_SS_NVIDIA +@OPT_PAPI_CUPTI_TRUE@am__append_108 = $(CUPTI_INC_FLGS) +@OPT_PAPI_CUPTI_TRUE@am__append_109 = -DHPCRUN_SS_PAPI_C_CUPTI +@OPT_PAPI_ROCM_TRUE@am__append_110 = $(ROCM_INC_FLGS) +@OPT_PAPI_ROCM_TRUE@am__append_111 = -DHPCRUN_SS_PAPI_C_ROCM +@OPT_PAPI_STATIC_TRUE@am__append_112 = $(MY_PAPI_FILES) +@OPT_PAPI_STATIC_TRUE@am__append_113 = $(PAPI_INC_FLGS) +@OPT_PAPI_STATIC_TRUE@am__append_114 = $(OPT_PAPI_LIBS_STAT) +@OPT_PAPI_STATIC_TRUE@am__append_115 = -DHPCRUN_SS_PAPI +@OPT_ENABLE_UPC_TRUE@am__append_116 = $(MY_UPC_FILES) +@OPT_ENABLE_UPC_TRUE@am__append_117 = $(MY_UPC_FILES) +@OPT_ENABLE_UPC_TRUE@am__append_118 = $(OPT_UPC_IFLAGS) +@OPT_ENABLE_UPC_TRUE@am__append_119 = $(OPT_UPC_IFLAGS) +@OPT_ENABLE_UPC_TRUE@am__append_120 = $(OPT_UPC_LDFLAGS) +@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_121 = -DLUSH_PTHREADS +@OPT_ENABLE_LUSH_PTHREADS_TRUE@am__append_122 = -DLUSH_PTHREADS 
@OPT_ENABLE_CUDA_TRUE@am__append_123 = $(MY_CUDA_FILES) -@OPT_ENABLE_ROCM_TRUE@am__append_124 = $(MY_ROCM_FILES) -@OPT_ENABLE_ROCM_TRUE@am__append_125 = -DENABLE_ROCM -@OPT_ENABLE_ROCM_TRUE@am__append_126 = $(OPT_ROCM_IFLAGS) -@OPT_ENABLE_ROCM_TRUE@am__append_127 = -DHPCRUN_SS_AMD -@OPT_ENABLE_LEVEL0_TRUE@am__append_128 = $(MY_LEVEL0_FILES) -@OPT_ENABLE_LEVEL0_TRUE@am__append_129 = -DENABLE_LEVEL0 -@OPT_ENABLE_LEVEL0_TRUE@am__append_130 = $(OPT_LEVEL0_IFLAGS) -@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = -DHPCRUN_SS_LEVEL0 -@OPT_ENABLE_OPENCL_TRUE@am__append_132 = $(MY_OPENCL_FILES) -@OPT_ENABLE_OPENCL_TRUE@am__append_133 = -DENABLE_OPENCL -@OPT_ENABLE_OPENCL_TRUE@am__append_134 = $(OPT_OPENCL_IFLAGS) -@OPT_ENABLE_OPENCL_TRUE@am__append_135 = -DHPCRUN_SS_OPENCL -@OPT_ENABLE_GTPIN_TRUE@am__append_136 = $(MY_GTPIN_FILES) -@OPT_ENABLE_GTPIN_TRUE@am__append_137 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR) -@OPT_ENABLE_GTPIN_TRUE@am__append_138 = $(OPT_GTPIN_IFLAGS) -@OPT_ENABLE_GTPIN_TRUE@am__append_139 = -DHPCRUN_SS_GTPIN -@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_140 = libagent-cilk.la -@OPT_ENABLE_LUSH_TRUE@am__append_141 = libagent-pthread.la \ +@OPT_ENABLE_CUDA_TRUE@am__append_124 = -DENABLE_CUDA +@OPT_ENABLE_CUDA_TRUE@am__append_125 = $(OPT_CUDA_IFLAGS) +@OPT_ENABLE_CUDA_TRUE@am__append_126 = $(MY_CUDA_FILES) +@OPT_ENABLE_ROCM_TRUE@am__append_127 = $(MY_ROCM_FILES) +@OPT_ENABLE_ROCM_TRUE@am__append_128 = -DENABLE_ROCM +@OPT_ENABLE_ROCM_TRUE@am__append_129 = $(ROCM_INC_FLGS) +@OPT_ENABLE_ROCM_TRUE@am__append_130 = -DHPCRUN_SS_AMD +@OPT_ENABLE_LEVEL0_TRUE@am__append_131 = $(MY_LEVEL0_FILES) +@OPT_ENABLE_LEVEL0_TRUE@am__append_132 = -DENABLE_LEVEL0 +@OPT_ENABLE_LEVEL0_TRUE@am__append_133 = $(OPT_LEVEL0_IFLAGS) +@OPT_ENABLE_LEVEL0_TRUE@am__append_134 = -DHPCRUN_SS_LEVEL0 +@OPT_ENABLE_OPENCL_TRUE@am__append_135 = $(MY_OPENCL_FILES) +@OPT_ENABLE_OPENCL_TRUE@am__append_136 = -DENABLE_OPENCL +@OPT_ENABLE_OPENCL_TRUE@am__append_137 = $(OPT_OPENCL_IFLAGS) 
+@OPT_ENABLE_OPENCL_TRUE@am__append_138 = -DHPCRUN_SS_OPENCL +@OPT_ENABLE_GTPIN_TRUE@am__append_139 = $(MY_GTPIN_FILES) +@OPT_ENABLE_GTPIN_TRUE@am__append_140 = -DENABLE_GTPIN -DGTPIN_LIBDIR=$(OPT_GTPIN_LIBDIR) +@OPT_ENABLE_GTPIN_TRUE@am__append_141 = $(OPT_GTPIN_IFLAGS) +@OPT_ENABLE_GTPIN_TRUE@am__append_142 = -DHPCRUN_SS_GTPIN +@OPT_ENABLE_LUSH_TRUE@@OPT_WITH_CILK_TRUE@am__append_143 = libagent-cilk.la +@OPT_ENABLE_LUSH_TRUE@am__append_144 = libagent-pthread.la \ @OPT_ENABLE_LUSH_TRUE@ libagent-tbb.la subdir = src/tool/hpcrun ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 @@ -455,7 +458,7 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \ cct_backtrace_finalize.c env.c epoch.c files.c \ handling_sample.c hpcrun-initializers.c hpcrun_options.c \ hpcrun_stats.c loadmap.c metrics.c name.c rank.c \ - sample_event.c sample_prob.c sample_sources_all.c \ + sample_event.c sample_prob.c sample_sources_all.c tool_state.c \ sample-sources/blame-shift/blame-shift.c \ sample-sources/blame-shift/blame-map.c \ sample-sources/blame-shift/directed.c \ @@ -470,18 +473,18 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \ sample_sources_registered.c sample-sources/sample-filters.c \ segv_handler.c start-stop.c term_handler.c thread_data.c \ thread_use.c thread_finalize.c control-knob.c control-knob.h \ - device-finalizers.c device-initializers.c module-ignore-map.c \ - threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \ - cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \ - lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \ - lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \ - lush/lush-pthread.c lush/lush-support-rt.h \ - lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \ - lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \ - memory/mmap.c messages/debug-flag.c messages/messages-sync.c \ - messages/messages-async.c messages/fmt.c gpu/gpu-activity.c \ - gpu/gpu-activity-channel.c gpu/gpu-activity-process.c 
\ - gpu/gpu-application-thread-api.c \ + device-finalizers.c gpu-monitors.c device-initializers.c \ + module-ignore-map.c threadmgr.c trace.c weak.c write_data.c \ + cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \ + cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \ + lush/lush-backtrace.c lush/lush.h lush/lush.c \ + lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \ + lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \ + lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \ + memory/mem.c memory/mmap.c messages/debug-flag.c \ + messages/messages-sync.c messages/messages-async.c \ + messages/fmt.c gpu/gpu-activity.c gpu/gpu-activity-channel.c \ + gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \ gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \ gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \ gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \ @@ -495,12 +498,14 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \ gpu/gpu-stream-id-map.c gpu/gpu-trace.c \ gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \ gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \ - ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \ - ompt/ompt-defer-write.c ompt/ompt-interface.c \ - ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \ - ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \ - extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \ - syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \ + gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \ + sample-sources/openmp-target.c ompt/ompt-callstack.c \ + ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \ + ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \ + ompt/ompt-region-debug.c ompt/ompt-device-map.c \ + ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \ + extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \ + syscalls/select.c syscalls/sysv_signal.c 
\ utilities/executable-path.h utilities/executable-path.c \ utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \ utilities/ip-normalized.h utilities/ip-normalized.c \ @@ -531,15 +536,17 @@ am__libhpcrun_la_SOURCES_DIST = utilities/first_func.c main.h main.c \ trampoline/aarch64/aarch64-tramp.c \ utilities/arch/libunwind/libunwind-context-pc.c \ sample-sources/papi.c sample-sources/papi-c-cupti.c \ - sample-sources/papi-c.c sample-sources/papi-c-extended-info.c \ - sample-sources/nvidia.c gpu/nvidia/cubin-hash-map.c \ - gpu/nvidia/cubin-id-map.c gpu/nvidia/cubin-symbols.c \ - gpu/nvidia/cuda-api.c gpu/nvidia/cuda-device-map.c \ + sample-sources/papi-c-rocm.c sample-sources/papi-c.c \ + sample-sources/papi-c-extended-info.c sample-sources/nvidia.c \ + gpu/nvidia/cubin-hash-map.c gpu/nvidia/cubin-id-map.c \ + gpu/nvidia/cubin-symbols.c gpu/nvidia/cuda-api.c \ + gpu/nvidia/cuda-device-map.c \ gpu/nvidia/cupti-activity-translate.c \ gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \ gpu/nvidia/cupti-gpu-api.c sample-sources/upc.c \ - sample-sources/amd.c gpu/amd/roctracer-activity-translate.c \ - gpu/amd/roctracer-api.c gpu/amd/rocm-debug-api.c \ + sample-sources/amd.c sample-sources/amd-rocprofiler.c \ + gpu/amd/hip-api.c gpu/amd/roctracer-activity-translate.c \ + gpu/amd/roctracer-api.c gpu/amd/rocprofiler-api.c \ gpu/amd/rocm-binary-processing.c sample-sources/level0.c \ gpu/level0/level0-api.c \ gpu/level0/level0-command-list-context-map.c \ @@ -604,7 +611,7 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \ libhpcrun_la-loadmap.lo libhpcrun_la-metrics.lo \ libhpcrun_la-name.lo libhpcrun_la-rank.lo \ libhpcrun_la-sample_event.lo libhpcrun_la-sample_prob.lo \ - libhpcrun_la-sample_sources_all.lo \ + libhpcrun_la-sample_sources_all.lo libhpcrun_la-tool_state.lo \ sample-sources/blame-shift/libhpcrun_la-blame-shift.lo \ sample-sources/blame-shift/libhpcrun_la-blame-map.lo \ sample-sources/blame-shift/libhpcrun_la-directed.lo \ @@ -629,6 +636,7 @@ 
am__objects_14 = utilities/libhpcrun_la-first_func.lo \ libhpcrun_la-term_handler.lo libhpcrun_la-thread_data.lo \ libhpcrun_la-thread_use.lo libhpcrun_la-thread_finalize.lo \ libhpcrun_la-control-knob.lo libhpcrun_la-device-finalizers.lo \ + libhpcrun_la-gpu-monitors.lo \ libhpcrun_la-device-initializers.lo \ libhpcrun_la-module-ignore-map.lo libhpcrun_la-threadmgr.lo \ libhpcrun_la-trace.lo libhpcrun_la-weak.lo \ @@ -675,6 +683,9 @@ am__objects_14 = utilities/libhpcrun_la-first_func.lo \ gpu/libhpcrun_la-gpu-trace-item.lo \ gpu/libhpcrun_la-gpu-trace-channel-set.lo \ gpu/libhpcrun_la-gpu-trace-demultiplexer.lo \ + gpu/ompt/libhpcrun_la-ompt-gpu-api.lo \ + gpu/ompt/libhpcrun_la-ompt-activity-translate.lo \ + sample-sources/libhpcrun_la-openmp-target.lo \ ompt/libhpcrun_la-ompt-callstack.lo \ ompt/libhpcrun_la-ompt-defer.lo \ ompt/libhpcrun_la-ompt-device.lo \ @@ -722,14 +733,15 @@ am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \ utilities/arch/libunwind/libhpcrun_la-libunwind-context-pc.lo @HOST_CPU_AARCH64_TRUE@am__objects_27 = $(am__objects_26) @OPT_PAPI_CUPTI_TRUE@am__objects_28 = sample-sources/libhpcrun_la-papi-c-cupti.lo -@OPT_PAPI_COMPONENT_FALSE@am__objects_29 = \ +@OPT_PAPI_ROCM_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c-rocm.lo +@OPT_PAPI_COMPONENT_FALSE@am__objects_30 = \ @OPT_PAPI_COMPONENT_FALSE@ sample-sources/libhpcrun_la-papi.lo \ -@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_28) -@OPT_PAPI_COMPONENT_TRUE@am__objects_29 = sample-sources/libhpcrun_la-papi-c.lo \ +@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_28) $(am__objects_29) +@OPT_PAPI_COMPONENT_TRUE@am__objects_30 = sample-sources/libhpcrun_la-papi-c.lo \ @OPT_PAPI_COMPONENT_TRUE@ sample-sources/libhpcrun_la-papi-c-extended-info.lo \ -@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_28) -@OPT_PAPI_DYNAMIC_TRUE@am__objects_30 = $(am__objects_29) -@OPT_ENABLE_CUPTI_TRUE@am__objects_31 = \ +@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_28) $(am__objects_29) 
+@OPT_PAPI_DYNAMIC_TRUE@am__objects_31 = $(am__objects_30) +@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = \ @OPT_ENABLE_CUPTI_TRUE@ sample-sources/libhpcrun_la-nvidia.lo \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cubin-hash-map.lo \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cubin-id-map.lo \ @@ -740,18 +752,20 @@ am__objects_26 = trampoline/aarch64/libhpcrun_la-aarch64-tramp.lo \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cupti-analysis.lo \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cupti-api.lo \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_la-cupti-gpu-api.lo -@OPT_ENABLE_CUPTI_TRUE@am__objects_32 = $(am__objects_31) -am__objects_33 = sample-sources/libhpcrun_la-upc.lo -@OPT_ENABLE_UPC_TRUE@am__objects_34 = $(am__objects_33) -am__objects_35 = -@OPT_ENABLE_ROCM_TRUE@am__objects_36 = \ +@OPT_ENABLE_CUPTI_TRUE@am__objects_33 = $(am__objects_32) +am__objects_34 = sample-sources/libhpcrun_la-upc.lo +@OPT_ENABLE_UPC_TRUE@am__objects_35 = $(am__objects_34) +am__objects_36 = +@OPT_ENABLE_ROCM_TRUE@am__objects_37 = \ @OPT_ENABLE_ROCM_TRUE@ sample-sources/libhpcrun_la-amd.lo \ +@OPT_ENABLE_ROCM_TRUE@ sample-sources/libhpcrun_la-amd-rocprofiler.lo \ +@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-hip-api.lo \ @OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-roctracer-activity-translate.lo \ @OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-roctracer-api.lo \ -@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-rocm-debug-api.lo \ +@OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-rocprofiler-api.lo \ @OPT_ENABLE_ROCM_TRUE@ gpu/amd/libhpcrun_la-rocm-binary-processing.lo -@OPT_ENABLE_ROCM_TRUE@am__objects_37 = $(am__objects_36) -@OPT_ENABLE_LEVEL0_TRUE@am__objects_38 = \ +@OPT_ENABLE_ROCM_TRUE@am__objects_38 = $(am__objects_37) +@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = \ @OPT_ENABLE_LEVEL0_TRUE@ sample-sources/libhpcrun_la-level0.lo \ @OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-api.lo \ @OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-command-list-context-map.lo \ @@ 
-760,8 +774,8 @@ am__objects_35 = @OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-data-node.lo \ @OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-event-map.lo \ @OPT_ENABLE_LEVEL0_TRUE@ gpu/level0/libhpcrun_la-level0-handle-map.lo -@OPT_ENABLE_LEVEL0_TRUE@am__objects_39 = $(am__objects_38) -@OPT_ENABLE_OPENCL_TRUE@am__objects_40 = \ +@OPT_ENABLE_LEVEL0_TRUE@am__objects_40 = $(am__objects_39) +@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = \ @OPT_ENABLE_OPENCL_TRUE@ sample-sources/libhpcrun_la-opencl.lo \ @OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-api.lo \ @OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-memory-manager.lo \ @@ -769,28 +783,28 @@ am__objects_35 = @OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-h2d-map.lo \ @OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-queue-map.lo \ @OPT_ENABLE_OPENCL_TRUE@ gpu/opencl/libhpcrun_la-opencl-context-map.lo -@OPT_ENABLE_OPENCL_TRUE@am__objects_41 = $(am__objects_40) -@OPT_ENABLE_GTPIN_TRUE@am__objects_42 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \ +@OPT_ENABLE_OPENCL_TRUE@am__objects_42 = $(am__objects_41) +@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = gpu/instrumentation/libhpcrun_la-kernel-data-map.lo \ @OPT_ENABLE_GTPIN_TRUE@ gpu/instrumentation/libhpcrun_la-gtpin-instrumentation.lo \ @OPT_ENABLE_GTPIN_TRUE@ gpu/instrumentation/libhpcrun_la-gtpin-correlation-id-map.lo -@OPT_ENABLE_GTPIN_TRUE@am__objects_43 = $(am__objects_42) -am__objects_44 = unwind/common/libhpcrun_la-backtrace.lo \ +@OPT_ENABLE_GTPIN_TRUE@am__objects_44 = $(am__objects_43) +am__objects_45 = unwind/common/libhpcrun_la-backtrace.lo \ unwind/common/libhpcrun_la-unw-throw.lo -am__objects_45 = $(am__objects_44) \ +am__objects_46 = $(am__objects_45) \ unwind/common/libhpcrun_la-binarytree_uwi.lo \ unwind/common/libhpcrun_la-interval_t.lo \ unwind/common/libhpcrun_la-libunw_intervals.lo \ unwind/common/libhpcrun_la-stack_troll.lo \ unwind/common/libhpcrun_la-uw_hash.lo \ 
unwind/common/libhpcrun_la-uw_recipe_map.lo -am__objects_46 = $(am__objects_45) \ +am__objects_47 = $(am__objects_46) \ unwind/generic-libunwind/libhpcrun_la-libunw-unwind.lo \ unwind/common/libhpcrun_la-default_validation_summary.lo -am__objects_47 = $(am__objects_45) \ +am__objects_48 = $(am__objects_46) \ unwind/ppc64/libhpcrun_la-ppc64-unwind.lo \ unwind/ppc64/libhpcrun_la-ppc64-unwind-interval.lo \ unwind/common/libhpcrun_la-default_validation_summary.lo -am__objects_48 = $(am__objects_45) \ +am__objects_49 = $(am__objects_46) \ unwind/x86-family/libhpcrun_la-x86-all.lo \ unwind/x86-family/libhpcrun_la-amd-xop.lo \ unwind/x86-family/libhpcrun_la-x86-cold-path.lo \ @@ -810,15 +824,15 @@ am__objects_48 = $(am__objects_45) \ unwind/x86-family/manual-intervals/libhpcrun_la-x86-32bit-icc-variant.lo \ unwind/x86-family/manual-intervals/libhpcrun_la-x86-fail-intervals.lo \ unwind/x86-family/manual-intervals/libhpcrun_la-x86-pgi-mp_pexit.lo -@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_49 = $(am__objects_48) -@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_49 = $(am__objects_47) -@UNW_LIBUNW_TRUE@am__objects_49 = $(am__objects_46) +@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_50 = $(am__objects_49) +@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_50 = $(am__objects_48) +@UNW_LIBUNW_TRUE@am__objects_50 = $(am__objects_47) am_libhpcrun_la_OBJECTS = $(am__objects_14) $(am__objects_15) \ $(am__objects_17) $(am__objects_19) $(am__objects_21) \ $(am__objects_23) $(am__objects_25) $(am__objects_27) \ - $(am__objects_30) $(am__objects_32) $(am__objects_34) \ - $(am__objects_35) $(am__objects_37) $(am__objects_39) \ - $(am__objects_41) $(am__objects_43) $(am__objects_49) \ + $(am__objects_31) $(am__objects_33) $(am__objects_35) \ + $(am__objects_36) $(am__objects_38) $(am__objects_40) \ + $(am__objects_42) $(am__objects_44) $(am__objects_50) \ utilities/libhpcrun_la-last_func.lo libhpcrun_la_OBJECTS = $(am_libhpcrun_la_OBJECTS) libhpcrun_la_LINK = 
$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ @@ -945,7 +959,7 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \ cct_backtrace_finalize.c env.c epoch.c files.c \ handling_sample.c hpcrun-initializers.c hpcrun_options.c \ hpcrun_stats.c loadmap.c metrics.c name.c rank.c \ - sample_event.c sample_prob.c sample_sources_all.c \ + sample_event.c sample_prob.c sample_sources_all.c tool_state.c \ sample-sources/blame-shift/blame-shift.c \ sample-sources/blame-shift/blame-map.c \ sample-sources/blame-shift/directed.c \ @@ -960,18 +974,18 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \ sample_sources_registered.c sample-sources/sample-filters.c \ segv_handler.c start-stop.c term_handler.c thread_data.c \ thread_use.c thread_finalize.c control-knob.c control-knob.h \ - device-finalizers.c device-initializers.c module-ignore-map.c \ - threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \ - cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \ - lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \ - lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \ - lush/lush-pthread.c lush/lush-support-rt.h \ - lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \ - lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \ - memory/mmap.c messages/debug-flag.c messages/messages-sync.c \ - messages/messages-async.c messages/fmt.c gpu/gpu-activity.c \ - gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \ - gpu/gpu-application-thread-api.c \ + device-finalizers.c gpu-monitors.c device-initializers.c \ + module-ignore-map.c threadmgr.c trace.c weak.c write_data.c \ + cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \ + cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \ + lush/lush-backtrace.c lush/lush.h lush/lush.c \ + lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \ + lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \ + lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \ + 
memory/mem.c memory/mmap.c messages/debug-flag.c \ + messages/messages-sync.c messages/messages-async.c \ + messages/fmt.c gpu/gpu-activity.c gpu/gpu-activity-channel.c \ + gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \ gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \ gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \ gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \ @@ -985,12 +999,14 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \ gpu/gpu-stream-id-map.c gpu/gpu-trace.c \ gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \ gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \ - ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \ - ompt/ompt-defer-write.c ompt/ompt-interface.c \ - ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \ - ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \ - extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \ - syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \ + gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \ + sample-sources/openmp-target.c ompt/ompt-callstack.c \ + ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \ + ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \ + ompt/ompt-region-debug.c ompt/ompt-device-map.c \ + ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \ + extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \ + syscalls/select.c syscalls/sysv_signal.c \ utilities/executable-path.h utilities/executable-path.c \ utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \ utilities/ip-normalized.h utilities/ip-normalized.c \ @@ -1025,12 +1041,13 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \ gpu/nvidia/cupti-activity-translate.c \ gpu/nvidia/cupti-analysis.c gpu/nvidia/cupti-api.c \ gpu/nvidia/cupti-gpu-api.c sample-sources/papi.c \ - sample-sources/papi-c-cupti.c sample-sources/papi-c.c \ - sample-sources/papi-c-extended-info.c 
sample-sources/upc.c \ - unwind/common/backtrace.c unwind/common/unw-throw.c \ - unwind/common/binarytree_uwi.c unwind/common/interval_t.c \ - unwind/common/libunw_intervals.c unwind/common/stack_troll.c \ - unwind/common/uw_hash.c unwind/common/uw_recipe_map.c \ + sample-sources/papi-c-cupti.c sample-sources/papi-c-rocm.c \ + sample-sources/papi-c.c sample-sources/papi-c-extended-info.c \ + sample-sources/upc.c unwind/common/backtrace.c \ + unwind/common/unw-throw.c unwind/common/binarytree_uwi.c \ + unwind/common/interval_t.c unwind/common/libunw_intervals.c \ + unwind/common/stack_troll.c unwind/common/uw_hash.c \ + unwind/common/uw_recipe_map.c \ unwind/generic-libunwind/libunw-unwind.c \ unwind/ppc64/ppc64-unwind.c \ unwind/ppc64/ppc64-unwind-interval.c \ @@ -1053,19 +1070,19 @@ am__libhpcrun_o_SOURCES_DIST = utilities/first_func.c main.h main.c \ unwind/x86-family/manual-intervals/x86-fail-intervals.c \ unwind/x86-family/manual-intervals/x86-pgi-mp_pexit.c \ utilities/last_func.c -@HOST_CPU_PPC_TRUE@am__objects_50 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT) -@HOST_CPU_PPC_FALSE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT) -@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_52 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \ +@HOST_CPU_PPC_TRUE@am__objects_51 = trampoline/common/libhpcrun_o-trampoline_eager.$(OBJEXT) +@HOST_CPU_PPC_FALSE@am__objects_52 = trampoline/common/libhpcrun_o-trampoline_lazy.$(OBJEXT) +@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-event_custom.$(OBJEXT) \ @OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-linux_perf.$(OBJEXT) \ @OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf_event_open.$(OBJEXT) \ @OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf-util.$(OBJEXT) \ @OPT_ENABLE_PERF_EVENT_TRUE@ sample-sources/perf/libhpcrun_o-perf_mmap.$(OBJEXT) \ @OPT_ENABLE_PERF_EVENT_TRUE@ 
sample-sources/perf/libhpcrun_o-perf_skid.$(OBJEXT) -@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_53 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT) -@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT) -@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_55 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT) -@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT) -am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \ +@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_TRUE@am__objects_54 = sample-sources/perf/libhpcrun_o-perfmon-util.$(OBJEXT) +@OPT_ENABLE_PERF_EVENT_TRUE@@OPT_PERFMON_FALSE@am__objects_55 = sample-sources/perf/libhpcrun_o-perfmon-util-dummy.$(OBJEXT) +@OPT_ENABLE_KERNEL_4_3_TRUE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_56 = sample-sources/perf/libhpcrun_o-kernel_blocking.$(OBJEXT) +@OPT_ENABLE_KERNEL_4_3_FALSE@@OPT_ENABLE_PERF_EVENT_TRUE@am__objects_57 = sample-sources/perf/libhpcrun_o-kernel_blocking_stub.$(OBJEXT) +am__objects_58 = utilities/libhpcrun_o-first_func.$(OBJEXT) \ libhpcrun_o-main.$(OBJEXT) libhpcrun_o-disabled.$(OBJEXT) \ libhpcrun_o-closure-registry.$(OBJEXT) \ libhpcrun_o-cct_insert_backtrace.$(OBJEXT) \ @@ -1081,6 +1098,7 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \ libhpcrun_o-sample_event.$(OBJEXT) \ libhpcrun_o-sample_prob.$(OBJEXT) \ libhpcrun_o-sample_sources_all.$(OBJEXT) \ + libhpcrun_o-tool_state.$(OBJEXT) \ sample-sources/blame-shift/libhpcrun_o-blame-shift.$(OBJEXT) \ sample-sources/blame-shift/libhpcrun_o-blame-map.$(OBJEXT) \ sample-sources/blame-shift/libhpcrun_o-directed.$(OBJEXT) \ @@ -1109,6 +1127,7 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \ libhpcrun_o-thread_finalize.$(OBJEXT) \ libhpcrun_o-control-knob.$(OBJEXT) \ libhpcrun_o-device-finalizers.$(OBJEXT) \ + 
libhpcrun_o-gpu-monitors.$(OBJEXT) \ libhpcrun_o-device-initializers.$(OBJEXT) \ libhpcrun_o-module-ignore-map.$(OBJEXT) \ libhpcrun_o-threadmgr.$(OBJEXT) libhpcrun_o-trace.$(OBJEXT) \ @@ -1160,6 +1179,9 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \ gpu/libhpcrun_o-gpu-trace-item.$(OBJEXT) \ gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT) \ gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT) \ + gpu/ompt/libhpcrun_o-ompt-gpu-api.$(OBJEXT) \ + gpu/ompt/libhpcrun_o-ompt-activity-translate.$(OBJEXT) \ + sample-sources/libhpcrun_o-openmp-target.$(OBJEXT) \ ompt/libhpcrun_o-ompt-callstack.$(OBJEXT) \ ompt/libhpcrun_o-ompt-defer.$(OBJEXT) \ ompt/libhpcrun_o-ompt-device.$(OBJEXT) \ @@ -1184,28 +1206,28 @@ am__objects_57 = utilities/libhpcrun_o-first_func.$(OBJEXT) \ utilities/libhpcrun_o-linuxtimer.$(OBJEXT) \ utilities/libhpcrun_o-timer.$(OBJEXT) \ utilities/libhpcrun_o-tokenize.$(OBJEXT) \ - utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_50) \ - $(am__objects_51) $(am__objects_52) $(am__objects_53) \ - $(am__objects_54) $(am__objects_55) $(am__objects_56) -am__objects_58 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \ + utilities/libhpcrun_o-unlink.$(OBJEXT) $(am__objects_51) \ + $(am__objects_52) $(am__objects_53) $(am__objects_54) \ + $(am__objects_55) $(am__objects_56) $(am__objects_57) +am__objects_59 = fnbounds/libhpcrun_o-fnbounds_static.$(OBJEXT) \ libhpcrun_o-custom-init-static.$(OBJEXT) -am__objects_59 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT) -@HOST_CPU_MIPS_TRUE@am__objects_60 = $(am__objects_59) -am__objects_61 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \ +am__objects_60 = unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT) +@HOST_CPU_MIPS_TRUE@am__objects_61 = $(am__objects_60) +am__objects_62 = trampoline/ppc64/libhpcrun_o-ppc64-tramp.$(OBJEXT) \ utilities/arch/ppc64/libhpcrun_o-ppc64-context-pc.$(OBJEXT) -@HOST_CPU_PPC_TRUE@am__objects_62 = $(am__objects_61) -am__objects_63 = \ 
+@HOST_CPU_PPC_TRUE@am__objects_63 = $(am__objects_62) +am__objects_64 = \ trampoline/x86-family/libhpcrun_o-x86-tramp.$(OBJEXT) \ utilities/arch/x86-family/libhpcrun_o-x86-context-pc.$(OBJEXT) -@HOST_CPU_X86_FAMILY_TRUE@am__objects_64 = $(am__objects_63) -am__objects_65 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \ +@HOST_CPU_X86_FAMILY_TRUE@am__objects_65 = $(am__objects_64) +am__objects_66 = trampoline/ia64/libhpcrun_o-ia64-tramp.$(OBJEXT) \ utilities/arch/ia64/libhpcrun_o-ia64-context-pc.$(OBJEXT) -@HOST_CPU_IA64_TRUE@am__objects_66 = $(am__objects_65) -am__objects_67 = \ +@HOST_CPU_IA64_TRUE@am__objects_67 = $(am__objects_66) +am__objects_68 = \ trampoline/aarch64/libhpcrun_o-aarch64-tramp.$(OBJEXT) \ utilities/arch/libunwind/libhpcrun_o-libunwind-context-pc.$(OBJEXT) -@HOST_CPU_AARCH64_TRUE@am__objects_68 = $(am__objects_67) -@OPT_ENABLE_CUPTI_TRUE@am__objects_69 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \ +@HOST_CPU_AARCH64_TRUE@am__objects_69 = $(am__objects_68) +@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = sample-sources/libhpcrun_o-nvidia.$(OBJEXT) \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cubin-hash-map.$(OBJEXT) \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cubin-id-map.$(OBJEXT) \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cubin-symbols.$(OBJEXT) \ @@ -1215,33 +1237,34 @@ am__objects_67 = \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cupti-analysis.$(OBJEXT) \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cupti-api.$(OBJEXT) \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/libhpcrun_o-cupti-gpu-api.$(OBJEXT) -@OPT_ENABLE_CUPTI_TRUE@am__objects_70 = $(am__objects_69) -@OPT_PAPI_CUPTI_TRUE@am__objects_71 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT) -@OPT_PAPI_COMPONENT_FALSE@am__objects_72 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \ -@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_71) -@OPT_PAPI_COMPONENT_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \ +@OPT_ENABLE_CUPTI_TRUE@am__objects_71 = $(am__objects_70) 
+@OPT_PAPI_CUPTI_TRUE@am__objects_72 = sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT) +@OPT_PAPI_ROCM_TRUE@am__objects_73 = sample-sources/libhpcrun_o-papi-c-rocm.$(OBJEXT) +@OPT_PAPI_COMPONENT_FALSE@am__objects_74 = sample-sources/libhpcrun_o-papi.$(OBJEXT) \ +@OPT_PAPI_COMPONENT_FALSE@ $(am__objects_72) $(am__objects_73) +@OPT_PAPI_COMPONENT_TRUE@am__objects_74 = sample-sources/libhpcrun_o-papi-c.$(OBJEXT) \ @OPT_PAPI_COMPONENT_TRUE@ sample-sources/libhpcrun_o-papi-c-extended-info.$(OBJEXT) \ -@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_71) -@OPT_PAPI_STATIC_TRUE@am__objects_73 = $(am__objects_72) -am__objects_74 = sample-sources/libhpcrun_o-upc.$(OBJEXT) -@OPT_ENABLE_UPC_TRUE@am__objects_75 = $(am__objects_74) -am__objects_76 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \ +@OPT_PAPI_COMPONENT_TRUE@ $(am__objects_72) $(am__objects_73) +@OPT_PAPI_STATIC_TRUE@am__objects_75 = $(am__objects_74) +am__objects_76 = sample-sources/libhpcrun_o-upc.$(OBJEXT) +@OPT_ENABLE_UPC_TRUE@am__objects_77 = $(am__objects_76) +am__objects_78 = unwind/common/libhpcrun_o-backtrace.$(OBJEXT) \ unwind/common/libhpcrun_o-unw-throw.$(OBJEXT) -am__objects_77 = $(am__objects_76) \ +am__objects_79 = $(am__objects_78) \ unwind/common/libhpcrun_o-binarytree_uwi.$(OBJEXT) \ unwind/common/libhpcrun_o-interval_t.$(OBJEXT) \ unwind/common/libhpcrun_o-libunw_intervals.$(OBJEXT) \ unwind/common/libhpcrun_o-stack_troll.$(OBJEXT) \ unwind/common/libhpcrun_o-uw_hash.$(OBJEXT) \ unwind/common/libhpcrun_o-uw_recipe_map.$(OBJEXT) -am__objects_78 = $(am__objects_77) \ +am__objects_80 = $(am__objects_79) \ unwind/generic-libunwind/libhpcrun_o-libunw-unwind.$(OBJEXT) \ unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT) -am__objects_79 = $(am__objects_77) \ +am__objects_81 = $(am__objects_79) \ unwind/ppc64/libhpcrun_o-ppc64-unwind.$(OBJEXT) \ unwind/ppc64/libhpcrun_o-ppc64-unwind-interval.$(OBJEXT) \ unwind/common/libhpcrun_o-default_validation_summary.$(OBJEXT) -am__objects_80 = 
$(am__objects_77) \ +am__objects_82 = $(am__objects_79) \ unwind/x86-family/libhpcrun_o-x86-all.$(OBJEXT) \ unwind/x86-family/libhpcrun_o-amd-xop.$(OBJEXT) \ unwind/x86-family/libhpcrun_o-x86-cold-path.$(OBJEXT) \ @@ -1261,14 +1284,14 @@ am__objects_80 = $(am__objects_77) \ unwind/x86-family/manual-intervals/libhpcrun_o-x86-32bit-icc-variant.$(OBJEXT) \ unwind/x86-family/manual-intervals/libhpcrun_o-x86-fail-intervals.$(OBJEXT) \ unwind/x86-family/manual-intervals/libhpcrun_o-x86-pgi-mp_pexit.$(OBJEXT) -@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_81 = $(am__objects_80) -@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_81 = $(am__objects_79) -@UNW_LIBUNW_TRUE@am__objects_81 = $(am__objects_78) -am_libhpcrun_o_OBJECTS = $(am__objects_57) $(am__objects_58) \ - $(am__objects_60) $(am__objects_62) $(am__objects_64) \ - $(am__objects_66) $(am__objects_68) $(am__objects_70) \ - $(am__objects_73) $(am__objects_75) $(am__objects_35) \ - $(am__objects_81) utilities/libhpcrun_o-last_func.$(OBJEXT) +@UNW_LIBUNW_FALSE@@UNW_PPC64_FALSE@@UNW_X86_TRUE@am__objects_83 = $(am__objects_82) +@UNW_LIBUNW_FALSE@@UNW_PPC64_TRUE@am__objects_83 = $(am__objects_81) +@UNW_LIBUNW_TRUE@am__objects_83 = $(am__objects_80) +am_libhpcrun_o_OBJECTS = $(am__objects_58) $(am__objects_59) \ + $(am__objects_61) $(am__objects_63) $(am__objects_65) \ + $(am__objects_67) $(am__objects_69) $(am__objects_71) \ + $(am__objects_75) $(am__objects_77) $(am__objects_36) \ + $(am__objects_83) utilities/libhpcrun_o-last_func.$(OBJEXT) libhpcrun_o_OBJECTS = $(am_libhpcrun_o_OBJECTS) @HOST_CPU_X86_FAMILY_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) @OPT_PAPI_STATIC_TRUE@am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) @@ -1580,6 +1603,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = 
@OPT_IGC_IFLAGS@ @@ -1617,6 +1641,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ @@ -1777,14 +1802,15 @@ bin_SCRIPTS = $(am__append_4) $(am__append_6) pkglibexec_SCRIPTS = $(am__append_1) include_HEADERS = $(am__append_2) pkglib_LIBRARIES = $(am__append_5) -pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_17) \ - $(am__append_18) $(am__append_140) $(am__append_141) -BUILT_SOURCES = $(am__append_22) -CLEANFILES = $(am__append_23) +pkglib_LTLIBRARIES = $(am__append_3) $(am__append_7) $(am__append_18) \ + $(am__append_19) $(am__append_143) $(am__append_144) +BUILT_SOURCES = $(am__append_23) +CLEANFILES = $(am__append_24) @OPT_ENABLE_HPCRUN_DYNAMIC_TRUE@noinst_LTLIBRARIES = libhpcrun.la PAPI_INC_FLGS = @OPT_PAPI_IFLAGS@ PAPI_LD_FLGS = @OPT_PAPI_LDFLAGS@ CUPTI_INC_FLGS = @OPT_CUPTI_IFLAGS@ +ROCM_INC_FLGS = @OPT_ROCM_IFLAGS@ OPENCL_IFLAGS = @OPT_OPENCL_IFLAGS@ CUPTI_LD_FLGS = @OPT_CUPTI_LDFLAGS@ CUPTI_BASE = @OPT_CUPTI@ @@ -1867,16 +1893,16 @@ UNW_MIPS_INCLUDE_DIRS = \ UNW_MIPS_LD_FLAGS = MY_CPP_DEFINES = -D_GNU_SOURCE -DINLINE_FN=1 -DLOCAL_BUILD=1 \ - -D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_19) \ - $(am__append_102) $(am__append_106) $(am__append_108) \ - $(am__append_112) $(am__append_127) $(am__append_131) \ - $(am__append_135) $(am__append_139) + -D__HIP_PLATFORM_HCC__=1 $(am__append_11) $(am__append_20) \ + $(am__append_103) $(am__append_107) $(am__append_109) \ + $(am__append_111) $(am__append_115) $(am__append_130) \ + $(am__append_134) $(am__append_138) $(am__append_142) MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \ closure-registry.c cct_insert_backtrace.c \ cct_backtrace_finalize.c env.c epoch.c files.c \ handling_sample.c hpcrun-initializers.c hpcrun_options.c \ hpcrun_stats.c loadmap.c metrics.c name.c rank.c \ - 
sample_event.c sample_prob.c sample_sources_all.c \ + sample_event.c sample_prob.c sample_sources_all.c tool_state.c \ sample-sources/blame-shift/blame-shift.c \ sample-sources/blame-shift/blame-map.c \ sample-sources/blame-shift/directed.c \ @@ -1891,18 +1917,18 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \ sample_sources_registered.c sample-sources/sample-filters.c \ segv_handler.c start-stop.c term_handler.c thread_data.c \ thread_use.c thread_finalize.c control-knob.c control-knob.h \ - device-finalizers.c device-initializers.c module-ignore-map.c \ - threadmgr.c trace.c weak.c write_data.c cct/cct_bundle.c \ - cct/cct_ctxt.c cct/cct.c cct/cct-node-vector.c cct2metrics.c \ - lush/lush-backtrace.h lush/lush-backtrace.c lush/lush.h \ - lush/lush.c lush/lush-pthread.h lush/lush-pthread.i \ - lush/lush-pthread.c lush/lush-support-rt.h \ - lush/lush-support-rt.c lush/lushi.h lush/lushi-cb.h \ - lush/lushi-cb.c fnbounds/fnbounds_common.c memory/mem.c \ - memory/mmap.c messages/debug-flag.c messages/messages-sync.c \ - messages/messages-async.c messages/fmt.c gpu/gpu-activity.c \ - gpu/gpu-activity-channel.c gpu/gpu-activity-process.c \ - gpu/gpu-application-thread-api.c \ + device-finalizers.c gpu-monitors.c device-initializers.c \ + module-ignore-map.c threadmgr.c trace.c weak.c write_data.c \ + cct/cct_bundle.c cct/cct_ctxt.c cct/cct.c \ + cct/cct-node-vector.c cct2metrics.c lush/lush-backtrace.h \ + lush/lush-backtrace.c lush/lush.h lush/lush.c \ + lush/lush-pthread.h lush/lush-pthread.i lush/lush-pthread.c \ + lush/lush-support-rt.h lush/lush-support-rt.c lush/lushi.h \ + lush/lushi-cb.h lush/lushi-cb.c fnbounds/fnbounds_common.c \ + memory/mem.c memory/mmap.c messages/debug-flag.c \ + messages/messages-sync.c messages/messages-async.c \ + messages/fmt.c gpu/gpu-activity.c gpu/gpu-activity-channel.c \ + gpu/gpu-activity-process.c gpu/gpu-application-thread-api.c \ gpu/gpu-channel-item-allocator.c gpu/gpu-context-id-map.c \ 
gpu/gpu-correlation.c gpu/gpu-correlation-channel.c \ gpu/gpu-correlation-channel-set.c gpu/gpu-correlation-id.c \ @@ -1916,12 +1942,14 @@ MY_BASE_FILES = utilities/first_func.c main.h main.c disabled.c \ gpu/gpu-stream-id-map.c gpu/gpu-trace.c \ gpu/gpu-trace-channel.c gpu/gpu-trace-item.c \ gpu/gpu-trace-channel-set.c gpu/gpu-trace-demultiplexer.c \ - ompt/ompt-callstack.c ompt/ompt-defer.c ompt/ompt-device.c \ - ompt/ompt-defer-write.c ompt/ompt-interface.c \ - ompt/ompt-queues.c ompt/ompt-region.c ompt/ompt-region-debug.c \ - ompt/ompt-device-map.c ompt/ompt-task.c ompt/ompt-thread.c \ - extern-real/dl-iterate.c extern-real/mmap.c syscalls/poll.c \ - syscalls/ppoll.c syscalls/select.c syscalls/sysv_signal.c \ + gpu/ompt/ompt-gpu-api.c gpu/ompt/ompt-activity-translate.c \ + sample-sources/openmp-target.c ompt/ompt-callstack.c \ + ompt/ompt-defer.c ompt/ompt-device.c ompt/ompt-defer-write.c \ + ompt/ompt-interface.c ompt/ompt-queues.c ompt/ompt-region.c \ + ompt/ompt-region-debug.c ompt/ompt-device-map.c \ + ompt/ompt-task.c ompt/ompt-thread.c extern-real/dl-iterate.c \ + extern-real/mmap.c syscalls/poll.c syscalls/ppoll.c \ + syscalls/select.c syscalls/sysv_signal.c \ utilities/executable-path.h utilities/executable-path.c \ utilities/hpcrun-nanotime.h utilities/hpcrun-nanotime.c \ utilities/ip-normalized.h utilities/ip-normalized.c \ @@ -1965,10 +1993,10 @@ MY_AARCH64_FILES = \ utilities/arch/libunwind/libunwind-context-pc.c @OPT_PAPI_COMPONENT_FALSE@MY_PAPI_FILES = sample-sources/papi.c \ -@OPT_PAPI_COMPONENT_FALSE@ $(am__append_16) +@OPT_PAPI_COMPONENT_FALSE@ $(am__append_16) $(am__append_17) @OPT_PAPI_COMPONENT_TRUE@MY_PAPI_FILES = sample-sources/papi-c.c \ @OPT_PAPI_COMPONENT_TRUE@ sample-sources/papi-c-extended-info.c \ -@OPT_PAPI_COMPONENT_TRUE@ $(am__append_16) +@OPT_PAPI_COMPONENT_TRUE@ $(am__append_16) $(am__append_17) @OPT_ENABLE_CUPTI_TRUE@MY_CUPTI_FILES = sample-sources/nvidia.c \ @OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/cubin-hash-map.c \ 
@OPT_ENABLE_CUPTI_TRUE@ gpu/nvidia/cubin-id-map.c \ @@ -1999,9 +2027,11 @@ MY_AARCH64_FILES = \ @OPT_ENABLE_ROCM_TRUE@MY_ROCM_FILES = \ @OPT_ENABLE_ROCM_TRUE@ sample-sources/amd.c \ +@OPT_ENABLE_ROCM_TRUE@ sample-sources/amd-rocprofiler.c \ +@OPT_ENABLE_ROCM_TRUE@ gpu/amd/hip-api.c \ @OPT_ENABLE_ROCM_TRUE@ gpu/amd/roctracer-activity-translate.c \ @OPT_ENABLE_ROCM_TRUE@ gpu/amd/roctracer-api.c \ -@OPT_ENABLE_ROCM_TRUE@ gpu/amd/rocm-debug-api.c \ +@OPT_ENABLE_ROCM_TRUE@ gpu/amd/rocprofiler-api.c \ @OPT_ENABLE_ROCM_TRUE@ gpu/amd/rocm-binary-processing.c @OPT_ENABLE_LEVEL0_TRUE@MY_LEVEL0_FILES = \ @@ -2032,6 +2062,7 @@ MY_INCLUDE_DIRS = \ -I$(HPCFNBOUNDS_INC) \ $(OPT_CUDA_IFLAGS) \ $(OPT_CUPTI_IFLAGS) \ + $(ROCM_INC_FLGS) \ -I$(LIBELF_INC) \ -I$(LIBMONITOR_INC) \ $(GOTCHA_IFLAGS) \ @@ -2053,11 +2084,11 @@ MY_AARCH64_INCLUDE_DIRS = \ -I$(srcdir)/utilities/arch/aarch64 libhpcrun_la_SOURCES = $(MY_BASE_FILES) $(MY_DYNAMIC_FILES) \ - $(am__append_24) $(am__append_25) $(am__append_38) \ - $(am__append_53) $(am__append_71) $(am__append_84) \ - $(am__append_99) $(am__append_103) $(am__append_113) \ - $(am__append_120) $(am__append_124) $(am__append_128) \ - $(am__append_132) $(am__append_136) $(UNW_SOURCE_FILES) \ + $(am__append_25) $(am__append_26) $(am__append_39) \ + $(am__append_54) $(am__append_72) $(am__append_85) \ + $(am__append_100) $(am__append_104) $(am__append_116) \ + $(am__append_123) $(am__append_127) $(am__append_131) \ + $(am__append_135) $(am__append_139) $(UNW_SOURCE_FILES) \ utilities/last_func.c libhpcrun_fake_audit_la_SOURCES = \ audit/fake-auditor.c @@ -2066,9 +2097,9 @@ libhpcrun_audit_la_SOURCES = \ audit/auditor.c libhpcrun_o_SOURCES = $(MY_BASE_FILES) $(MY_STATIC_FILES) \ - $(am__append_26) $(am__append_39) $(am__append_54) \ - $(am__append_72) $(am__append_85) $(am__append_104) \ - $(am__append_109) $(am__append_114) $(am__append_123) \ + $(am__append_27) $(am__append_40) $(am__append_55) \ + $(am__append_73) $(am__append_86) $(am__append_105) \ + 
$(am__append_112) $(am__append_117) $(am__append_126) \ $(UNW_SOURCE_FILES) utilities/last_func.c libhpcrun_wrap_a_SOURCES = \ monitor-exts/openmp.c @@ -2113,12 +2144,12 @@ libhpctoolkit_a_SOURCES = \ # cppflags #----------------------------------------------------------- libhpcrun_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \ - $(am__append_20) $(am__append_27) $(am__append_40) \ - $(am__append_55) $(am__append_73) $(am__append_86) \ - $(am__append_100) $(am__append_105) $(am__append_107) \ - $(am__append_115) $(am__append_118) $(am__append_121) \ - $(am__append_125) $(am__append_129) $(am__append_133) \ - $(am__append_137) $(UNW_INCLUDE_DIRS) + $(am__append_21) $(am__append_28) $(am__append_41) \ + $(am__append_56) $(am__append_74) $(am__append_87) \ + $(am__append_101) $(am__append_106) $(am__append_108) \ + $(am__append_110) $(am__append_118) $(am__append_121) \ + $(am__append_124) $(am__append_128) $(am__append_132) \ + $(am__append_136) $(am__append_140) $(UNW_INCLUDE_DIRS) libhpcrun_fake_audit_la_CPPFLAGS = \ $(MY_CPP_DEFINES) \ $(MY_INCLUDE_DIRS) @@ -2128,51 +2159,51 @@ libhpcrun_audit_la_CPPFLAGS = \ $(MY_INCLUDE_DIRS) libhpcrun_o_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \ - $(MY_INCLUDE_DIRS) $(am__append_21) $(am__append_28) \ - $(am__append_41) $(am__append_56) $(am__append_74) \ - $(am__append_87) $(am__append_110) $(am__append_116) \ - $(am__append_119) $(UNW_INCLUDE_DIRS) + $(MY_INCLUDE_DIRS) $(am__append_22) $(am__append_29) \ + $(am__append_42) $(am__append_57) $(am__append_75) \ + $(am__append_88) $(am__append_113) $(am__append_119) \ + $(am__append_122) $(UNW_INCLUDE_DIRS) libhpcrun_wrap_a_CPPFLAGS = \ -DHPCRUN_STATIC_LINK \ $(MY_CPP_DEFINES) \ $(MY_INCLUDE_DIRS) libhpcrun_ga_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \ - $(am__append_29) $(am__append_42) $(am__append_59) \ - $(am__append_75) $(am__append_88) $(UNW_INCLUDE_DIRS) + $(am__append_30) $(am__append_43) $(am__append_60) \ + $(am__append_76) $(am__append_89) 
$(UNW_INCLUDE_DIRS) libhpcrun_ga_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \ - $(MY_INCLUDE_DIRS) $(am__append_30) $(am__append_43) \ - $(am__append_60) $(am__append_76) $(am__append_89) \ + $(MY_INCLUDE_DIRS) $(am__append_31) $(am__append_44) \ + $(am__append_61) $(am__append_77) $(am__append_90) \ $(UNW_INCLUDE_DIRS) libhpcrun_gprof_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \ - $(am__append_44) $(am__append_61) $(am__append_90) + $(am__append_45) $(am__append_62) $(am__append_91) libhpcrun_gprof_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \ - $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_45) \ - $(am__append_62) $(am__append_91) + $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_46) \ + $(am__append_63) $(am__append_92) libhpcrun_io_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \ - $(am__append_31) $(am__append_46) $(am__append_63) \ - $(am__append_77) $(am__append_92) $(UNW_INCLUDE_DIRS) + $(am__append_32) $(am__append_47) $(am__append_64) \ + $(am__append_78) $(am__append_93) $(UNW_INCLUDE_DIRS) libhpcrun_io_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK $(MY_CPP_DEFINES) \ - $(MY_INCLUDE_DIRS) $(am__append_32) $(am__append_47) \ - $(am__append_64) $(am__append_78) $(am__append_93) \ + $(MY_INCLUDE_DIRS) $(am__append_33) $(am__append_48) \ + $(am__append_65) $(am__append_79) $(am__append_94) \ $(UNW_INCLUDE_DIRS) libhpcrun_memleak_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \ - $(am__append_33) $(am__append_48) $(am__append_65) \ - $(am__append_79) $(am__append_94) $(UNW_INCLUDE_DIRS) + $(am__append_34) $(am__append_49) $(am__append_66) \ + $(am__append_80) $(am__append_95) $(UNW_INCLUDE_DIRS) libhpcrun_memleak_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \ - $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_34) \ - $(am__append_49) $(am__append_66) $(am__append_80) \ - $(am__append_95) $(UNW_INCLUDE_DIRS) + $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_35) \ + $(am__append_50) $(am__append_67) $(am__append_81) \ + $(am__append_96) 
$(UNW_INCLUDE_DIRS) libhpcrun_pthread_la_CPPFLAGS = $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) \ - $(am__append_35) $(am__append_50) $(am__append_67) \ - $(am__append_81) $(am__append_96) $(UNW_INCLUDE_DIRS) + $(am__append_36) $(am__append_51) $(am__append_68) \ + $(am__append_82) $(am__append_97) $(UNW_INCLUDE_DIRS) libhpcrun_pthread_wrap_a_CPPFLAGS = -DHPCRUN_STATIC_LINK \ - $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_36) \ - $(am__append_51) $(am__append_68) $(am__append_82) \ - $(am__append_97) $(UNW_INCLUDE_DIRS) + $(MY_CPP_DEFINES) $(MY_INCLUDE_DIRS) $(am__append_37) \ + $(am__append_52) $(am__append_69) $(am__append_83) \ + $(am__append_98) $(UNW_INCLUDE_DIRS) libhpcrun_mpi_la_CPPFLAGS = $(MY_CPP_DEFINES) -I$(MPI_INC) \ - $(MY_INCLUDE_DIRS) $(am__append_37) $(am__append_52) \ - $(am__append_69) $(am__append_83) $(am__append_98) \ + $(MY_INCLUDE_DIRS) $(am__append_38) $(am__append_53) \ + $(am__append_70) $(am__append_84) $(am__append_99) \ $(UNW_INCLUDE_DIRS) libhpctoolkit_la_CPPFLAGS = \ $(MY_CPP_DEFINES) \ @@ -2188,8 +2219,8 @@ libhpctoolkit_a_CPPFLAGS = \ # cflags #----------------------------------------------------------- libhpcrun_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) \ - $(am__append_122) $(am__append_126) $(am__append_130) \ - $(am__append_134) $(am__append_138) $(GOTCHA_IFLAGS) + $(am__append_125) $(am__append_129) $(am__append_133) \ + $(am__append_137) $(am__append_141) $(GOTCHA_IFLAGS) libhpcrun_o_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) $(PERFMON_CFLAGS) libhpcrun_wrap_a_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) libhpcrun_ga_la_CFLAGS = $(CFLAGS) $(HOST_CFLAGS) @@ -2221,8 +2252,8 @@ OUR_LIBUNWIND_A = $(top_builddir)/src/extern/libunwind/libunwind.a OUR_LZMA_A = $(top_builddir)/src/extern/lzma/liblzma.a libhpcrun_la_LDFLAGS = -Wl,-Bsymbolic -L$(LIBMONITOR_LIB) -lmonitor \ -lpthread -lrt -L$(LIBELF_LIB) -lelf $(PERFMON_LDFLAGS_DYN) \ - $(OPT_ROCM_LDFLAGS) $(am__append_57) $(am__append_101) \ - $(am__append_117) $(GOTCHA_LDFLAGS) 
$(UNW_DYNAMIC_LD_FLAGS) + $(OPT_ROCM_LDFLAGS) $(am__append_58) $(am__append_102) \ + $(am__append_120) $(GOTCHA_LDFLAGS) $(UNW_DYNAMIC_LD_FLAGS) libhpcrun_fake_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl libhpcrun_audit_la_LDFLAGS = -Wl,-Bsymbolic -ldl libhpcrun_ga_la_LDFLAGS = -Wl,-Bsymbolic @@ -2248,9 +2279,9 @@ libhpcrun_la_LIBADD = \ libhpcrun_o_LDADD = $(PROF_LEAN_A) $(SUPPORT_LEAN_A) \ $(PERFMON_LDFLAGS_STAT) $(MBEDTLS_LIBS) $(OUR_LIBUNWIND_A) \ - $(OUR_LZMA_A) $(am__append_58) $(am__append_111) \ + $(OUR_LZMA_A) $(am__append_59) $(am__append_114) \ $(UNW_STATIC_LD_FLAGS) -MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_70) \ +MY_AGENT_INCLUDE_DIRS = $(MY_INCLUDE_DIRS) $(am__append_71) \ $(UNW_INCLUDE_DIRS) @HOST_CPU_AARCH64_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS) @HOST_CPU_PPC_TRUE@libhpcrun_la_CCASFLAGS = $(AM_CCASFLAGS) @@ -2334,7 +2365,7 @@ MY_AGENT_TBB_CFLAGS = \ # and hidden into libhpcrun.o. Other dependencies go into hpclink. # Don't use LDFLAGS for static case. MONITOR_NAMES = -G 'monitor_*' -HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' +HPCRUN_NAMES = -G 'hpcrun_*' -G 'hpctoolkit_*' -G 'OnLoadToolProp' -G 'OnLoad' -G 'OnUnloadTool' MISC_NAMES = -G 'debug_flag_*' -G 'messages_*' -G ompt_start_tool OPENCL_NAMES = -G 'clBuildProgram' -G 'clCreate*' -G 'clEnqueue*' -G 'clSetKernelArg' -G 'cl*Event*' DYN_SYSCALL_LIST = poll ppoll pselect select __sysv_signal @@ -2744,6 +2775,19 @@ gpu/libhpcrun_la-gpu-trace-channel-set.lo: gpu/$(am__dirstamp) \ gpu/$(DEPDIR)/$(am__dirstamp) gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/$(am__dirstamp) \ gpu/$(DEPDIR)/$(am__dirstamp) +gpu/ompt/$(am__dirstamp): + @$(MKDIR_P) gpu/ompt + @: > gpu/ompt/$(am__dirstamp) +gpu/ompt/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) gpu/ompt/$(DEPDIR) + @: > gpu/ompt/$(DEPDIR)/$(am__dirstamp) +gpu/ompt/libhpcrun_la-ompt-gpu-api.lo: gpu/ompt/$(am__dirstamp) \ + gpu/ompt/$(DEPDIR)/$(am__dirstamp) +gpu/ompt/libhpcrun_la-ompt-activity-translate.lo: \ + gpu/ompt/$(am__dirstamp) 
gpu/ompt/$(DEPDIR)/$(am__dirstamp) +sample-sources/libhpcrun_la-openmp-target.lo: \ + sample-sources/$(am__dirstamp) \ + sample-sources/$(DEPDIR)/$(am__dirstamp) ompt/$(am__dirstamp): @$(MKDIR_P) ompt @: > ompt/$(am__dirstamp) @@ -2960,6 +3004,9 @@ sample-sources/libhpcrun_la-papi.lo: sample-sources/$(am__dirstamp) \ sample-sources/libhpcrun_la-papi-c-cupti.lo: \ sample-sources/$(am__dirstamp) \ sample-sources/$(DEPDIR)/$(am__dirstamp) +sample-sources/libhpcrun_la-papi-c-rocm.lo: \ + sample-sources/$(am__dirstamp) \ + sample-sources/$(DEPDIR)/$(am__dirstamp) sample-sources/libhpcrun_la-papi-c.lo: sample-sources/$(am__dirstamp) \ sample-sources/$(DEPDIR)/$(am__dirstamp) sample-sources/libhpcrun_la-papi-c-extended-info.lo: \ @@ -2997,17 +3044,22 @@ sample-sources/libhpcrun_la-upc.lo: sample-sources/$(am__dirstamp) \ sample-sources/$(DEPDIR)/$(am__dirstamp) sample-sources/libhpcrun_la-amd.lo: sample-sources/$(am__dirstamp) \ sample-sources/$(DEPDIR)/$(am__dirstamp) +sample-sources/libhpcrun_la-amd-rocprofiler.lo: \ + sample-sources/$(am__dirstamp) \ + sample-sources/$(DEPDIR)/$(am__dirstamp) gpu/amd/$(am__dirstamp): @$(MKDIR_P) gpu/amd @: > gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) gpu/amd/$(DEPDIR) @: > gpu/amd/$(DEPDIR)/$(am__dirstamp) +gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/$(am__dirstamp) \ + gpu/amd/$(DEPDIR)/$(am__dirstamp) gpu/amd/libhpcrun_la-roctracer-activity-translate.lo: \ gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp) gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/$(am__dirstamp) \ gpu/amd/$(DEPDIR)/$(am__dirstamp) -gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/$(am__dirstamp) \ +gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/$(am__dirstamp) \ gpu/amd/$(DEPDIR)/$(am__dirstamp) gpu/amd/libhpcrun_la-rocm-binary-processing.lo: \ gpu/amd/$(am__dirstamp) gpu/amd/$(DEPDIR)/$(am__dirstamp) @@ -3435,6 +3487,13 @@ gpu/libhpcrun_o-gpu-trace-channel-set.$(OBJEXT): gpu/$(am__dirstamp) \ gpu/$(DEPDIR)/$(am__dirstamp) 
gpu/libhpcrun_o-gpu-trace-demultiplexer.$(OBJEXT): \ gpu/$(am__dirstamp) gpu/$(DEPDIR)/$(am__dirstamp) +gpu/ompt/libhpcrun_o-ompt-gpu-api.$(OBJEXT): gpu/ompt/$(am__dirstamp) \ + gpu/ompt/$(DEPDIR)/$(am__dirstamp) +gpu/ompt/libhpcrun_o-ompt-activity-translate.$(OBJEXT): \ + gpu/ompt/$(am__dirstamp) gpu/ompt/$(DEPDIR)/$(am__dirstamp) +sample-sources/libhpcrun_o-openmp-target.$(OBJEXT): \ + sample-sources/$(am__dirstamp) \ + sample-sources/$(DEPDIR)/$(am__dirstamp) ompt/libhpcrun_o-ompt-callstack.$(OBJEXT): ompt/$(am__dirstamp) \ ompt/$(DEPDIR)/$(am__dirstamp) ompt/libhpcrun_o-ompt-defer.$(OBJEXT): ompt/$(am__dirstamp) \ @@ -3586,6 +3645,9 @@ sample-sources/libhpcrun_o-papi.$(OBJEXT): \ sample-sources/libhpcrun_o-papi-c-cupti.$(OBJEXT): \ sample-sources/$(am__dirstamp) \ sample-sources/$(DEPDIR)/$(am__dirstamp) +sample-sources/libhpcrun_o-papi-c-rocm.$(OBJEXT): \ + sample-sources/$(am__dirstamp) \ + sample-sources/$(DEPDIR)/$(am__dirstamp) sample-sources/libhpcrun_o-papi-c.$(OBJEXT): \ sample-sources/$(am__dirstamp) \ sample-sources/$(DEPDIR)/$(am__dirstamp) @@ -3786,6 +3848,8 @@ mostlyclean-compile: -rm -f gpu/level0/*.lo -rm -f gpu/nvidia/*.$(OBJEXT) -rm -f gpu/nvidia/*.lo + -rm -f gpu/ompt/*.$(OBJEXT) + -rm -f gpu/ompt/*.lo -rm -f gpu/opencl/*.$(OBJEXT) -rm -f gpu/opencl/*.lo -rm -f lush-agents/*.$(OBJEXT) @@ -3858,6 +3922,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-env.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-epoch.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-files.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-gpu-monitors.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-handling_sample.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-hpcrun-initializers.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-hpcrun_options.Plo@am__quote@ @@ -3879,6 
+3944,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-thread_finalize.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-thread_use.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-threadmgr.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-tool_state.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-trace.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-weak.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_la-write_data.Plo@am__quote@ @@ -3895,6 +3961,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-env.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-epoch.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-files.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-gpu-monitors.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-handling_sample.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-hpcrun-initializers.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-hpcrun_options.Po@am__quote@ @@ -3916,6 +3983,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-thread_finalize.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-thread_use.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-threadmgr.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-tool_state.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-trace.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-weak.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libhpcrun_o-write_data.Po@am__quote@ @@ -4001,8 +4069,9 @@ distclean-compile: @AMDEP_TRUE@@am__include@ 
@am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-demultiplexer.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace-item.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/$(DEPDIR)/libhpcrun_o-gpu-trace.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-api.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/instrumentation/$(DEPDIR)/libhpcrun_la-gtpin-correlation-id-map.Plo@am__quote@ @@ -4034,6 +4103,10 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-analysis.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-api.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/nvidia/$(DEPDIR)/libhpcrun_o-cupti-gpu-api.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-activity-translate.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-api.Plo@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@gpu/opencl/$(DEPDIR)/libhpcrun_la-opencl-context-map.Plo@am__quote@ @@ -4098,6 +4171,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_ga_wrap_a-ga-overrides.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_io_la-io-over.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_io_wrap_a-io-over.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-amd.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-common.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-display.Plo@am__quote@ @@ -4113,8 +4187,10 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-omp-idle.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-omp-mutex.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-opencl.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-cupti.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-extended-info.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-papi.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_la-pthread-blame.Plo@am__quote@ @@ -4136,8 +4212,10 @@ distclean-compile: @AMDEP_TRUE@@am__include@ 
@am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-nvidia.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-idle.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-omp-mutex.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-cupti.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-extended-info.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-papi.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@sample-sources/$(DEPDIR)/libhpcrun_o-pthread-blame.Po@am__quote@ @@ -4631,6 +4709,13 @@ libhpcrun_la-sample_sources_all.lo: sample_sources_all.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-sample_sources_all.lo `test -f 'sample_sources_all.c' || echo '$(srcdir)/'`sample_sources_all.c +libhpcrun_la-tool_state.lo: tool_state.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-tool_state.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-tool_state.Tpo -c -o libhpcrun_la-tool_state.lo `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) 
$(DEPDIR)/libhpcrun_la-tool_state.Tpo $(DEPDIR)/libhpcrun_la-tool_state.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='tool_state.c' object='libhpcrun_la-tool_state.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-tool_state.lo `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c + sample-sources/blame-shift/libhpcrun_la-blame-shift.lo: sample-sources/blame-shift/blame-shift.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/blame-shift/libhpcrun_la-blame-shift.lo -MD -MP -MF sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Tpo -c -o sample-sources/blame-shift/libhpcrun_la-blame-shift.lo `test -f 'sample-sources/blame-shift/blame-shift.c' || echo '$(srcdir)/'`sample-sources/blame-shift/blame-shift.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Tpo sample-sources/blame-shift/$(DEPDIR)/libhpcrun_la-blame-shift.Plo @@ -4827,6 +4912,13 @@ libhpcrun_la-device-finalizers.lo: device-finalizers.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-device-finalizers.lo `test -f 'device-finalizers.c' || echo '$(srcdir)/'`device-finalizers.c +libhpcrun_la-gpu-monitors.lo: 
gpu-monitors.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-gpu-monitors.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-gpu-monitors.Tpo -c -o libhpcrun_la-gpu-monitors.lo `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_la-gpu-monitors.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_la-gpu-monitors.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o libhpcrun_la-gpu-monitors.lo `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c + libhpcrun_la-device-initializers.lo: device-initializers.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT libhpcrun_la-device-initializers.lo -MD -MP -MF $(DEPDIR)/libhpcrun_la-device-initializers.Tpo -c -o libhpcrun_la-device-initializers.lo `test -f 'device-initializers.c' || echo '$(srcdir)/'`device-initializers.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_la-device-initializers.Tpo $(DEPDIR)/libhpcrun_la-device-initializers.Plo @@ -5198,6 +5290,27 @@ gpu/libhpcrun_la-gpu-trace-demultiplexer.lo: gpu/gpu-trace-demultiplexer.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) 
$(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_la-gpu-trace-demultiplexer.lo `test -f 'gpu/gpu-trace-demultiplexer.c' || echo '$(srcdir)/'`gpu/gpu-trace-demultiplexer.c +gpu/ompt/libhpcrun_la-ompt-gpu-api.lo: gpu/ompt/ompt-gpu-api.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_la-ompt-gpu-api.lo -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_la-ompt-gpu-api.lo `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-gpu-api.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_la-ompt-gpu-api.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_la-ompt-gpu-api.lo `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c + +gpu/ompt/libhpcrun_la-ompt-activity-translate.lo: gpu/ompt/ompt-activity-translate.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_la-ompt-activity-translate.lo -MD -MP -MF 
gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_la-ompt-activity-translate.lo `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_la-ompt-activity-translate.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_la-ompt-activity-translate.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_la-ompt-activity-translate.lo `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c + +sample-sources/libhpcrun_la-openmp-target.lo: sample-sources/openmp-target.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-openmp-target.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Tpo -c -o sample-sources/libhpcrun_la-openmp-target.lo `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-openmp-target.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_la-openmp-target.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) 
$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-openmp-target.lo `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c + ompt/libhpcrun_la-ompt-callstack.lo: ompt/ompt-callstack.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_la-ompt-callstack.lo -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo -c -o ompt/libhpcrun_la-ompt-callstack.lo `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_la-ompt-callstack.Plo @@ -5548,6 +5661,13 @@ sample-sources/libhpcrun_la-papi-c-cupti.lo: sample-sources/papi-c-cupti.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-papi-c-cupti.lo `test -f 'sample-sources/papi-c-cupti.c' || echo '$(srcdir)/'`sample-sources/papi-c-cupti.c +sample-sources/libhpcrun_la-papi-c-rocm.lo: sample-sources/papi-c-rocm.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-papi-c-rocm.lo -MD -MP 
-MF sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_la-papi-c-rocm.lo `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-papi-c-rocm.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_la-papi-c-rocm.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-papi-c-rocm.lo `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c + sample-sources/libhpcrun_la-papi-c.lo: sample-sources/papi-c.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-papi-c.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Tpo -c -o sample-sources/libhpcrun_la-papi-c.lo `test -f 'sample-sources/papi-c.c' || echo '$(srcdir)/'`sample-sources/papi-c.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-papi-c.Plo @@ -5646,6 +5766,20 @@ sample-sources/libhpcrun_la-amd.lo: sample-sources/amd.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd.lo `test -f 'sample-sources/amd.c' || echo '$(srcdir)/'`sample-sources/amd.c +sample-sources/libhpcrun_la-amd-rocprofiler.lo: sample-sources/amd-rocprofiler.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_la-amd-rocprofiler.lo -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Tpo -c -o sample-sources/libhpcrun_la-amd-rocprofiler.lo `test -f 'sample-sources/amd-rocprofiler.c' || echo '$(srcdir)/'`sample-sources/amd-rocprofiler.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Tpo sample-sources/$(DEPDIR)/libhpcrun_la-amd-rocprofiler.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/amd-rocprofiler.c' object='sample-sources/libhpcrun_la-amd-rocprofiler.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_la-amd-rocprofiler.lo `test -f 'sample-sources/amd-rocprofiler.c' || echo '$(srcdir)/'`sample-sources/amd-rocprofiler.c + +gpu/amd/libhpcrun_la-hip-api.lo: gpu/amd/hip-api.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-hip-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo -c -o gpu/amd/libhpcrun_la-hip-api.lo 
`test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-hip-api.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/amd/hip-api.c' object='gpu/amd/libhpcrun_la-hip-api.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-hip-api.lo `test -f 'gpu/amd/hip-api.c' || echo '$(srcdir)/'`gpu/amd/hip-api.c + gpu/amd/libhpcrun_la-roctracer-activity-translate.lo: gpu/amd/roctracer-activity-translate.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-roctracer-activity-translate.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo -c -o gpu/amd/libhpcrun_la-roctracer-activity-translate.lo `test -f 'gpu/amd/roctracer-activity-translate.c' || echo '$(srcdir)/'`gpu/amd/roctracer-activity-translate.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-roctracer-activity-translate.Plo @@ -5660,12 +5794,12 @@ gpu/amd/libhpcrun_la-roctracer-api.lo: gpu/amd/roctracer-api.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o 
gpu/amd/libhpcrun_la-roctracer-api.lo `test -f 'gpu/amd/roctracer-api.c' || echo '$(srcdir)/'`gpu/amd/roctracer-api.c -gpu/amd/libhpcrun_la-rocm-debug-api.lo: gpu/amd/rocm-debug-api.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-debug-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-debug-api.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/amd/rocm-debug-api.c' object='gpu/amd/libhpcrun_la-rocm-debug-api.lo' libtool=yes @AMDEPBACKSLASH@ +gpu/amd/libhpcrun_la-rocprofiler-api.lo: gpu/amd/rocprofiler-api.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocprofiler-api.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Tpo -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Tpo gpu/amd/$(DEPDIR)/libhpcrun_la-rocprofiler-api.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/amd/rocprofiler-api.c' object='gpu/amd/libhpcrun_la-rocprofiler-api.lo' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile 
$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocm-debug-api.lo `test -f 'gpu/amd/rocm-debug-api.c' || echo '$(srcdir)/'`gpu/amd/rocm-debug-api.c +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -c -o gpu/amd/libhpcrun_la-rocprofiler-api.lo `test -f 'gpu/amd/rocprofiler-api.c' || echo '$(srcdir)/'`gpu/amd/rocprofiler-api.c gpu/amd/libhpcrun_la-rocm-binary-processing.lo: gpu/amd/rocm-binary-processing.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_la_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_la_CFLAGS) $(CFLAGS) -MT gpu/amd/libhpcrun_la-rocm-binary-processing.lo -MD -MP -MF gpu/amd/$(DEPDIR)/libhpcrun_la-rocm-binary-processing.Tpo -c -o gpu/amd/libhpcrun_la-rocm-binary-processing.lo `test -f 'gpu/amd/rocm-binary-processing.c' || echo '$(srcdir)/'`gpu/amd/rocm-binary-processing.c @@ -6367,6 +6501,20 @@ libhpcrun_o-sample_sources_all.obj: sample_sources_all.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-sample_sources_all.obj `if test -f 'sample_sources_all.c'; then $(CYGPATH_W) 'sample_sources_all.c'; else $(CYGPATH_W) '$(srcdir)/sample_sources_all.c'; fi` +libhpcrun_o-tool_state.o: tool_state.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-tool_state.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-tool_state.Tpo -c -o libhpcrun_o-tool_state.o `test 
-f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-tool_state.Tpo $(DEPDIR)/libhpcrun_o-tool_state.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='tool_state.c' object='libhpcrun_o-tool_state.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-tool_state.o `test -f 'tool_state.c' || echo '$(srcdir)/'`tool_state.c + +libhpcrun_o-tool_state.obj: tool_state.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-tool_state.obj -MD -MP -MF $(DEPDIR)/libhpcrun_o-tool_state.Tpo -c -o libhpcrun_o-tool_state.obj `if test -f 'tool_state.c'; then $(CYGPATH_W) 'tool_state.c'; else $(CYGPATH_W) '$(srcdir)/tool_state.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-tool_state.Tpo $(DEPDIR)/libhpcrun_o-tool_state.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='tool_state.c' object='libhpcrun_o-tool_state.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-tool_state.obj `if test -f 'tool_state.c'; then $(CYGPATH_W) 'tool_state.c'; else $(CYGPATH_W) '$(srcdir)/tool_state.c'; fi` + sample-sources/blame-shift/libhpcrun_o-blame-shift.o: sample-sources/blame-shift/blame-shift.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/blame-shift/libhpcrun_o-blame-shift.o -MD -MP -MF 
sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Tpo -c -o sample-sources/blame-shift/libhpcrun_o-blame-shift.o `test -f 'sample-sources/blame-shift/blame-shift.c' || echo '$(srcdir)/'`sample-sources/blame-shift/blame-shift.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Tpo sample-sources/blame-shift/$(DEPDIR)/libhpcrun_o-blame-shift.Po @@ -6759,6 +6907,20 @@ libhpcrun_o-device-finalizers.obj: device-finalizers.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-device-finalizers.obj `if test -f 'device-finalizers.c'; then $(CYGPATH_W) 'device-finalizers.c'; else $(CYGPATH_W) '$(srcdir)/device-finalizers.c'; fi` +libhpcrun_o-gpu-monitors.o: gpu-monitors.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-gpu-monitors.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo -c -o libhpcrun_o-gpu-monitors.o `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_o-gpu-monitors.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_o-gpu-monitors.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-gpu-monitors.o `test -f 'gpu-monitors.c' || echo '$(srcdir)/'`gpu-monitors.c + +libhpcrun_o-gpu-monitors.obj: gpu-monitors.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-gpu-monitors.obj -MD -MP -MF $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo -c -o libhpcrun_o-gpu-monitors.obj `if test -f 'gpu-monitors.c'; then $(CYGPATH_W) 'gpu-monitors.c'; else $(CYGPATH_W) '$(srcdir)/gpu-monitors.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-gpu-monitors.Tpo $(DEPDIR)/libhpcrun_o-gpu-monitors.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu-monitors.c' object='libhpcrun_o-gpu-monitors.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o libhpcrun_o-gpu-monitors.obj `if test -f 'gpu-monitors.c'; then $(CYGPATH_W) 'gpu-monitors.c'; else $(CYGPATH_W) '$(srcdir)/gpu-monitors.c'; fi` + libhpcrun_o-device-initializers.o: device-initializers.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT libhpcrun_o-device-initializers.o -MD -MP -MF $(DEPDIR)/libhpcrun_o-device-initializers.Tpo -c -o libhpcrun_o-device-initializers.o `test -f 'device-initializers.c' || echo '$(srcdir)/'`device-initializers.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libhpcrun_o-device-initializers.Tpo $(DEPDIR)/libhpcrun_o-device-initializers.Po @@ -7501,6 +7663,48 @@ gpu/libhpcrun_o-gpu-trace-demultiplexer.obj: gpu/gpu-trace-demultiplexer.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/libhpcrun_o-gpu-trace-demultiplexer.obj `if test -f 'gpu/gpu-trace-demultiplexer.c'; then $(CYGPATH_W) 'gpu/gpu-trace-demultiplexer.c'; 
else $(CYGPATH_W) '$(srcdir)/gpu/gpu-trace-demultiplexer.c'; fi` +gpu/ompt/libhpcrun_o-ompt-gpu-api.o: gpu/ompt/ompt-gpu-api.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-gpu-api.o -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.o `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_o-ompt-gpu-api.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.o `test -f 'gpu/ompt/ompt-gpu-api.c' || echo '$(srcdir)/'`gpu/ompt/ompt-gpu-api.c + +gpu/ompt/libhpcrun_o-ompt-gpu-api.obj: gpu/ompt/ompt-gpu-api.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-gpu-api.obj -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.obj `if test -f 'gpu/ompt/ompt-gpu-api.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-gpu-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-gpu-api.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-gpu-api.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-gpu-api.c' object='gpu/ompt/libhpcrun_o-ompt-gpu-api.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) 
$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-gpu-api.obj `if test -f 'gpu/ompt/ompt-gpu-api.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-gpu-api.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-gpu-api.c'; fi` + +gpu/ompt/libhpcrun_o-ompt-activity-translate.o: gpu/ompt/ompt-activity-translate.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-activity-translate.o -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.o `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_o-ompt-activity-translate.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.o `test -f 'gpu/ompt/ompt-activity-translate.c' || echo '$(srcdir)/'`gpu/ompt/ompt-activity-translate.c + +gpu/ompt/libhpcrun_o-ompt-activity-translate.obj: gpu/ompt/ompt-activity-translate.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT gpu/ompt/libhpcrun_o-ompt-activity-translate.obj -MD -MP -MF gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo -c -o 
gpu/ompt/libhpcrun_o-ompt-activity-translate.obj `if test -f 'gpu/ompt/ompt-activity-translate.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-activity-translate.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Tpo gpu/ompt/$(DEPDIR)/libhpcrun_o-ompt-activity-translate.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='gpu/ompt/ompt-activity-translate.c' object='gpu/ompt/libhpcrun_o-ompt-activity-translate.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o gpu/ompt/libhpcrun_o-ompt-activity-translate.obj `if test -f 'gpu/ompt/ompt-activity-translate.c'; then $(CYGPATH_W) 'gpu/ompt/ompt-activity-translate.c'; else $(CYGPATH_W) '$(srcdir)/gpu/ompt/ompt-activity-translate.c'; fi` + +sample-sources/libhpcrun_o-openmp-target.o: sample-sources/openmp-target.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-openmp-target.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo -c -o sample-sources/libhpcrun_o-openmp-target.o `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_o-openmp-target.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-openmp-target.o `test -f 'sample-sources/openmp-target.c' || echo '$(srcdir)/'`sample-sources/openmp-target.c + +sample-sources/libhpcrun_o-openmp-target.obj: sample-sources/openmp-target.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-openmp-target.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo -c -o sample-sources/libhpcrun_o-openmp-target.obj `if test -f 'sample-sources/openmp-target.c'; then $(CYGPATH_W) 'sample-sources/openmp-target.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/openmp-target.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-openmp-target.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/openmp-target.c' object='sample-sources/libhpcrun_o-openmp-target.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-openmp-target.obj `if test -f 'sample-sources/openmp-target.c'; then $(CYGPATH_W) 'sample-sources/openmp-target.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/openmp-target.c'; fi` + ompt/libhpcrun_o-ompt-callstack.o: ompt/ompt-callstack.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT ompt/libhpcrun_o-ompt-callstack.o -MD -MP -MF ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo -c -o ompt/libhpcrun_o-ompt-callstack.o `test -f 'ompt/ompt-callstack.c' || echo '$(srcdir)/'`ompt/ompt-callstack.c 
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Tpo ompt/$(DEPDIR)/libhpcrun_o-ompt-callstack.Po @@ -8299,6 +8503,20 @@ sample-sources/libhpcrun_o-papi-c-cupti.obj: sample-sources/papi-c-cupti.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-cupti.obj `if test -f 'sample-sources/papi-c-cupti.c'; then $(CYGPATH_W) 'sample-sources/papi-c-cupti.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-cupti.c'; fi` +sample-sources/libhpcrun_o-papi-c-rocm.o: sample-sources/papi-c-rocm.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c-rocm.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_o-papi-c-rocm.o `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_o-papi-c-rocm.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-rocm.o `test -f 'sample-sources/papi-c-rocm.c' || echo '$(srcdir)/'`sample-sources/papi-c-rocm.c + +sample-sources/libhpcrun_o-papi-c-rocm.obj: sample-sources/papi-c-rocm.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) 
$(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c-rocm.obj -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo -c -o sample-sources/libhpcrun_o-papi-c-rocm.obj `if test -f 'sample-sources/papi-c-rocm.c'; then $(CYGPATH_W) 'sample-sources/papi-c-rocm.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-rocm.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c-rocm.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sample-sources/papi-c-rocm.c' object='sample-sources/libhpcrun_o-papi-c-rocm.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -c -o sample-sources/libhpcrun_o-papi-c-rocm.obj `if test -f 'sample-sources/papi-c-rocm.c'; then $(CYGPATH_W) 'sample-sources/papi-c-rocm.c'; else $(CYGPATH_W) '$(srcdir)/sample-sources/papi-c-rocm.c'; fi` + sample-sources/libhpcrun_o-papi-c.o: sample-sources/papi-c.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libhpcrun_o_CPPFLAGS) $(CPPFLAGS) $(libhpcrun_o_CFLAGS) $(CFLAGS) -MT sample-sources/libhpcrun_o-papi-c.o -MD -MP -MF sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Tpo -c -o sample-sources/libhpcrun_o-papi-c.o `test -f 'sample-sources/papi-c.c' || echo '$(srcdir)/'`sample-sources/papi-c.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Tpo sample-sources/$(DEPDIR)/libhpcrun_o-papi-c.Po @@ -8817,6 +9035,7 @@ clean-libtool: -rm -rf gpu/instrumentation/.libs gpu/instrumentation/_libs -rm -rf gpu/level0/.libs gpu/level0/_libs -rm -rf gpu/nvidia/.libs gpu/nvidia/_libs + -rm -rf gpu/ompt/.libs gpu/ompt/_libs -rm -rf gpu/opencl/.libs gpu/opencl/_libs -rm 
-rf lush/.libs lush/_libs -rm -rf lush-agents/.libs lush-agents/_libs @@ -9079,6 +9298,8 @@ distclean-generic: -rm -f gpu/level0/$(am__dirstamp) -rm -f gpu/nvidia/$(DEPDIR)/$(am__dirstamp) -rm -f gpu/nvidia/$(am__dirstamp) + -rm -f gpu/ompt/$(DEPDIR)/$(am__dirstamp) + -rm -f gpu/ompt/$(am__dirstamp) -rm -f gpu/opencl/$(DEPDIR)/$(am__dirstamp) -rm -f gpu/opencl/$(am__dirstamp) -rm -f lush-agents/$(DEPDIR)/$(am__dirstamp) @@ -9146,7 +9367,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ clean-pkglibLTLIBRARIES mostlyclean-am distclean: distclean-recursive - -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR) + -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/ompt/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) 
trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags @@ -9195,7 +9416,7 @@ install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-recursive - -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR) + -rm -rf ./$(DEPDIR) audit/$(DEPDIR) cct/$(DEPDIR) dlmopen/$(DEPDIR) extern-real/$(DEPDIR) fnbounds/$(DEPDIR) gpu/$(DEPDIR) gpu/amd/$(DEPDIR) gpu/instrumentation/$(DEPDIR) gpu/level0/$(DEPDIR) gpu/nvidia/$(DEPDIR) gpu/ompt/$(DEPDIR) gpu/opencl/$(DEPDIR) lush-agents/$(DEPDIR) lush/$(DEPDIR) memory/$(DEPDIR) messages/$(DEPDIR) monitor-exts/$(DEPDIR) ompt/$(DEPDIR) os/linux/$(DEPDIR) sample-sources/$(DEPDIR) sample-sources/blame-shift/$(DEPDIR) sample-sources/perf/$(DEPDIR) syscalls/$(DEPDIR) trampoline/aarch64/$(DEPDIR) trampoline/common/$(DEPDIR) 
trampoline/x86-family/$(DEPDIR) unwind/common/$(DEPDIR) unwind/generic-libunwind/$(DEPDIR) unwind/ppc64/$(DEPDIR) unwind/x86-family/$(DEPDIR) unwind/x86-family/manual-intervals/$(DEPDIR) utilities/$(DEPDIR) utilities/arch/ia64/$(DEPDIR) utilities/arch/libunwind/$(DEPDIR) utilities/arch/ppc64/$(DEPDIR) utilities/arch/x86-family/$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic diff --git a/src/tool/hpcrun/fnbounds/fnbounds_client.c b/src/tool/hpcrun/fnbounds/fnbounds_client.c index a2b4bdf951..dfdb1f7129 100644 --- a/src/tool/hpcrun/fnbounds/fnbounds_client.c +++ b/src/tool/hpcrun/fnbounds/fnbounds_client.c @@ -454,7 +454,7 @@ launch_server(void) struct { int sendfd[2], recvfd[2]; } fds; - bool sampling_is_running; + bool sampling_is_running = false; pid_t child_pid; // already running @@ -472,11 +472,13 @@ launch_server(void) return -1; } - // some sample sources need to be stopped in the parent, or else - // they cause problems in the child. - sampling_is_running = SAMPLE_SOURCES(started); - if (sampling_is_running) { - SAMPLE_SOURCES(stop); + if (hpcrun_is_initialized()){ + // some sample sources need to be stopped in the parent, or else + // they cause problems in the child. + sampling_is_running = SAMPLE_SOURCES(started); + if (sampling_is_running) { + SAMPLE_SOURCES(stop); + } } // Give up a bit of our stack for the child shim. It doesn't need much. diff --git a/src/tool/hpcrun/gpu-monitors.c b/src/tool/hpcrun/gpu-monitors.c new file mode 100644 index 0000000000..5b03c70503 --- /dev/null +++ b/src/tool/hpcrun/gpu-monitors.c @@ -0,0 +1,40 @@ +// +// Created by dejan on 7/15/20. 
+// + +#include "gpu-monitors.h" +#include "hpcrun-malloc.h" + + +static __thread gpu_monitor_node_t *gpu_monitor_list = NULL; + +void +gpu_monitor_register( gpu_monitor_node_t node) +{ + gpu_monitor_node_t* new_node = hpcrun_malloc(sizeof(gpu_monitor_node_t)); + new_node->ci = node.ci; + new_node->enter_fn = node.enter_fn; + new_node->exit_fn = node.exit_fn; + new_node->next = gpu_monitor_list; + gpu_monitor_list = new_node; +} + + +void +gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type) +{ + gpu_monitor_node_t *node = gpu_monitor_list; + + if (type == gpu_monitor_type_enter){ + while (node != NULL) { + node->enter_fn(node->ci, cct_node); + node = node->next; + } + } + else if (type == gpu_monitor_type_exit){ + while (node != NULL) { + node->exit_fn(node->ci); + node = node->next; + } + } +} diff --git a/src/tool/hpcrun/gpu-monitors.h b/src/tool/hpcrun/gpu-monitors.h new file mode 100644 index 0000000000..7fd2c0d334 --- /dev/null +++ b/src/tool/hpcrun/gpu-monitors.h @@ -0,0 +1,35 @@ +// +// Created by dejan on 7/15/20. 
+// + +#ifndef HPCTOOLKIT_GPU_MONITORS_H +#define HPCTOOLKIT_GPU_MONITORS_H + +#include +#include + + + +typedef enum { + gpu_monitor_type_enter, + gpu_monitor_type_exit +} gpu_monitor_type_t; + + +typedef void (*gpu_monitor_enter_fn_t)(papi_component_info_t *ci, cct_node_t *cct_node); +typedef void (*gpu_monitor_exit_fn_t)(papi_component_info_t *ci); + + +typedef struct gpu_monitor_node_t { + struct gpu_monitor_node_t * next; + papi_component_info_t *ci; + gpu_monitor_enter_fn_t enter_fn; + gpu_monitor_exit_fn_t exit_fn; +} gpu_monitor_node_t; + + +extern void gpu_monitor_register(gpu_monitor_node_t node); +extern void gpu_monitors_apply(cct_node_t *cct_node, gpu_monitor_type_t type); + + +#endif //HPCTOOLKIT_GPU_MONITORS_H diff --git a/src/tool/hpcrun/gpu/amd/hip-api.c b/src/tool/hpcrun/gpu/amd/hip-api.c new file mode 100644 index 0000000000..be0d916b83 --- /dev/null +++ b/src/tool/hpcrun/gpu/amd/hip-api.c @@ -0,0 +1,251 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2020, Rice University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. 
+// +// ******************************************************* EndRiceCopyright * + +//*************************************************************************** +// +// File: +// hip-api.c +// +// Purpose: +// wrapper around AMD HIP layer +// +//*************************************************************************** + + +//***************************************************************************** +// system include files +//***************************************************************************** + +#include +#include +#include // memset + +// #include +#include + +//***************************************************************************** +// local include files +//***************************************************************************** + +#include +#include + +#include "hip-api.h" + + + +//***************************************************************************** +// macros +//***************************************************************************** + +#define HIP_FN_NAME(f) DYN_FN_NAME(f) + +#define HIP_FN(fn, args) \ + static hipError_t (*HIP_FN_NAME(fn)) args + +#define HPCRUN_HIP_API_CALL(fn, args) \ +{ \ + hipError_t error_result = HIP_FN_NAME(fn) args; \ + if (error_result != hipSuccess) { \ + ETMSG(CUDA, "hip api %s returned %d", #fn, (int) error_result); \ + exit(-1); \ + } \ +} + +#define FORALL_HIP_ROUTINES(macro) \ + macro(hipDeviceSynchronize) \ + macro(hipDeviceGetAttribute) \ + macro(hipCtxGetCurrent) + +//****************************************************************************** +// static data +//****************************************************************************** + +#ifndef HPCRUN_STATIC_LINK +HIP_FN +( + hipDeviceSynchronize, +( void ) +); + +HIP_FN +( + hipDeviceGetAttribute, + ( + int *pi, + hipDeviceAttribute_t attrib, + int dev + ) +); + +HIP_FN +( + hipCtxGetCurrent, + ( + hipCtx_t *ctx + ) +); + +#endif + +//****************************************************************************** +// 
private operations +//****************************************************************************** +//TODO: Copied from cuda-api.c - check if works for hip +#ifndef HPCRUN_STATIC_LINK +static int +hip_device_sm_blocks_query +( + int major, + int minor +) +{ + switch(major) { + case 7: + case 6: + return 32; + default: + // TODO(Keren): add more devices + return 8; + } +} +#endif + + +//****************************************************************************** +// interface operations +//****************************************************************************** + +int +hip_bind +( +void +) +{ +#ifndef HPCRUN_STATIC_LINK + // dynamic libraries only availabile in non-static case + CHK_DLOPEN(hip, "libamdhip64.so", RTLD_NOW | RTLD_GLOBAL); + +#define HIP_BIND(fn) \ + CHK_DLSYM(hip, fn); + + FORALL_HIP_ROUTINES(HIP_BIND) +#undef CUPTI_BIND + + return 0; +#else + return -1; +#endif // ! HPCRUN_STATIC_LINK +} + +int +hip_context +( + hipCtx_t *ctx +) +{ +#ifndef HPCRUN_STATIC_LINK + HPCRUN_HIP_API_CALL(hipCtxGetCurrent, (ctx)); + return 0; +#else + return -1; +#endif +} + +int +hip_device_property_query +( + int device_id, + hip_device_property_t *property +) +{ +#ifndef HPCRUN_STATIC_LINK + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&property->sm_count, hipDeviceAttributeMultiprocessorCount, device_id)); + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&property->sm_clock_rate, hipDeviceAttributeClockRate, device_id)); + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&property->sm_shared_memory, + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, device_id)); + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&property->sm_registers, + hipDeviceAttributeMaxRegistersPerBlock, device_id));//CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&property->sm_threads, hipDeviceAttributeMaxThreadsPerMultiProcessor, + device_id)); + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&property->num_threads_per_warp, 
hipDeviceAttributeWarpSize, + device_id)); + + int major = 0, minor = 0; + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&major, hipDeviceAttributeComputeCapabilityMajor, device_id)); + + HPCRUN_HIP_API_CALL(hipDeviceGetAttribute, + (&minor, hipDeviceAttributeComputeCapabilityMinor, device_id)); + + property->sm_blocks = hip_device_sm_blocks_query(major, minor); + + return 0; +#else + return -1; +#endif +} + +int +hip_dev_sync +() +{ +#ifndef HPCRUN_STATIC_LINK + HPCRUN_HIP_API_CALL(hipDeviceSynchronize, () ); + return 0; +#else + return -1; +#endif +} diff --git a/src/tool/hpcrun/gpu/amd/hip-api.h b/src/tool/hpcrun/gpu/amd/hip-api.h new file mode 100644 index 0000000000..5d21ac5d6f --- /dev/null +++ b/src/tool/hpcrun/gpu/amd/hip-api.h @@ -0,0 +1,112 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2020, Rice University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. +// +// ******************************************************* EndRiceCopyright * + +//*************************************************************************** +// +// File: +// cuda-api.h +// +// Purpose: +// interface definitions for wrapper around AMD HIP layer +// +//*************************************************************************** + +#ifndef hip_api_h +#define hip_api_h + + + +//***************************************************************************** +// rocm includes +//***************************************************************************** + +// #include +#include + + + +//***************************************************************************** +// interface operations +//***************************************************************************** + +typedef struct hip_device_property { + int sm_count; + int sm_clock_rate; + int sm_shared_memory; + int sm_registers; + int sm_threads; + int sm_blocks; + int num_threads_per_warp; +} hip_device_property_t; + + 
+//***************************************************************************** +// interface operations +//***************************************************************************** + +// returns 0 on success +int +hip_bind +( + void +); + +// returns 0 on success +int +hip_context +( + hipCtx_t *ctx +); + +// returns 0 on success +int +hip_device_property_query +( + int device_id, + hip_device_property_t *property +); + +int +hip_dev_sync(); + +#endif //hip_api_h diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c index 1044368644..e66672bff3 100644 --- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c +++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.c @@ -58,8 +58,7 @@ //****************************************************************************** #include - -#include "rocm-debug-api.h" +#include #include "rocm-binary-processing.h" #include #include @@ -96,6 +95,12 @@ typedef struct amd_gpu_binary { amd_gpu_binary_t* binary_list = NULL; +// A spin lock to serialize two AMD GPU binary opertionas: +// 1. parse and add a code object to the binary list +// 2. look up a function name from the the binary list +static spinlock_t rocm_binary_list_lock; + + //****************************************************************************** // private operations //****************************************************************************** @@ -313,55 +318,33 @@ file_uri_exists return 0; } -static int +static void parse_amd_gpu_binary ( - void + const char* uri ) { - // rocm debug api library creates a new thread through std::thread. - // This breaks automatic thread ignoring code because we only check - // the caller of pthread_create. So, we manually ignore the new thread. 
- monitor_disable_new_threads(); - - rocm_debug_api_init(); - size_t code_object_count; - rocm_debug_api_query_code_object(&code_object_count); - - for (size_t i = 0; i < code_object_count; ++i) { - char* uri = rocm_debug_api_query_uri(i); - PRINT("uri %d, %s\n", i, uri); - - // Handle file URIs - if (strncmp(uri, "file://", strlen("file://")) == 0) { - if (file_uri_exists(uri)) continue; - - // Handle a new AMD GPU binary - amd_gpu_binary_t* bin = (amd_gpu_binary_t*) malloc(sizeof(amd_gpu_binary_t)); - bin->uri = strdup(uri); - bin->next = binary_list; - binary_list = bin; - - // Parse URI to extract the binary - parse_amd_gpu_binary_uri(uri, bin); - - // Parse the ELF symbol table - elf_version(EV_CURRENT); - Elf *elf = elf_memory(bin->buf, bin->size); - if (elf != 0) { - construct_amd_gpu_symbols(elf, &(bin->function_table)); - elf_end(elf); - } + // Handle file URIs + if (strncmp(uri, "file://", strlen("file://")) == 0) { + if (file_uri_exists(uri)) return; + + // Handle a new AMD GPU binary + amd_gpu_binary_t* bin = (amd_gpu_binary_t*) malloc(sizeof(amd_gpu_binary_t)); + bin->uri = strdup(uri); + bin->next = binary_list; + binary_list = bin; + + // Parse URI to extract the binary + parse_amd_gpu_binary_uri(uri, bin); + + // Parse the ELF symbol table + elf_version(EV_CURRENT); + Elf *elf = elf_memory(bin->buf, bin->size); + if (elf != 0) { + construct_amd_gpu_symbols(elf, &(bin->function_table)); + elf_end(elf); } } - - rocm_debug_api_fini(); - - // Now we are done with the rocm debug api. - // we enable tracing threads - monitor_enable_new_threads(); - - return 0; } // TODO: @@ -408,19 +391,31 @@ rocm_binary_function_lookup ) { // TODO: - // 1. Handle multi-threaded case. Currently, this function is called when the first - // HIP kernel launch is done. So multiple threads can enter this concurrently. - // 2. Currently we support multiple GPU binaries, but assume that kernel is unique + // 1. 
Currently we support multiple GPU binaries, but assume that kernel is unique // across GPU binaries. - if (binary_list == NULL) { - if (parse_amd_gpu_binary() < 0) { - // Allocate a placeholder binary - binary_list = (amd_gpu_binary_t*)malloc(sizeof(amd_gpu_binary_t)); - binary_list->next = NULL; - binary_list->function_table.size = 0; - } - } + spinlock_lock(&rocm_binary_list_lock); ip_normalized_t nip = lookup_amd_function(kernel_name); PRINT("HIP launch kernel %s, lm_ip %lx\n", kernel_name, nip.lm_ip); + spinlock_unlock(&rocm_binary_list_lock); return nip; } + +void +rocm_binary_uri_add +( + const char* uri +) +{ + spinlock_lock(&rocm_binary_list_lock); + parse_amd_gpu_binary(uri); + spinlock_unlock(&rocm_binary_list_lock); +} + +void +rocm_binary_uri_list_init +( + void +) +{ + spinlock_init(&rocm_binary_list_lock); +} diff --git a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h index 9300ffa710..0fa592e823 100644 --- a/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h +++ b/src/tool/hpcrun/gpu/amd/rocm-binary-processing.h @@ -60,4 +60,16 @@ rocm_binary_function_lookup const char* kernel_name ); +void +rocm_binary_uri_add +( + const char* uri +); + +void +rocm_binary_uri_list_init +( + void +); + #endif diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c b/src/tool/hpcrun/gpu/amd/rocm-debug-api.c deleted file mode 100644 index b20b664f9b..0000000000 --- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.c +++ /dev/null @@ -1,312 +0,0 @@ -// -*-Mode: C++;-*- // technically C99 - -// * BeginRiceCopyright ***************************************************** -// -// -------------------------------------------------------------------------- -// Part of HPCToolkit (hpctoolkit.org) -// -// Information about sources of support for research and development of -// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. 
-// -------------------------------------------------------------------------- -// -// Copyright ((c)) 2002-2022, Rice University -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * Neither the name of Rice University (RICE) nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// This software is provided by RICE and contributors "as is" and any -// express or implied warranties, including, but not limited to, the -// implied warranties of merchantability and fitness for a particular -// purpose are disclaimed. In no event shall RICE or contributors be -// liable for any direct, indirect, incidental, special, exemplary, or -// consequential damages (including, but not limited to, procurement of -// substitute goods or services; loss of use, data, or profits; or -// business interruption) however caused and on any theory of liability, -// whether in contract, strict liability, or tort (including negligence -// or otherwise) arising in any way out of the use of this software, even -// if advised of the possibility of such damage. 
-// -// ******************************************************* EndRiceCopyright * - -//****************************************************************************** -// system includes -//****************************************************************************** - -#include "amd-dbgapi.h" - -#include -#include - -//****************************************************************************** -// local includes -//****************************************************************************** - -#include "rocm-debug-api.h" - -#include -#include -#include - -//****************************************************************************** -// macros -//****************************************************************************** - -#define FORALL_ROCM_DEBUG_ROUTINES(macro) \ - macro(amd_dbgapi_initialize) \ - macro(amd_dbgapi_process_attach) \ - macro(amd_dbgapi_process_detach) \ - macro(amd_dbgapi_process_code_object_list) \ - macro(amd_dbgapi_code_object_get_info) - - -#define ROCM_DEBUG_FN_NAME(f) DYN_FN_NAME(f) - -#define ROCM_DEBUG_FN(fn, args) \ - static amd_dbgapi_status_t (*ROCM_DEBUG_FN_NAME(fn)) args - -#define HPCRUN_ROCM_DEBUG_CALL(fn, args) \ -{ \ - amd_dbgapi_status_t ret = ROCM_DEBUG_FN_NAME(fn) args; \ - check_rocm_debug_status(ret, __LINE__); \ -} - -//****************************************************************************** -// debug print -//****************************************************************************** - -#define DEBUG 0 - -#include "hpcrun/gpu/gpu-print.h" - -//****************************************************************************** -// local variables -//****************************************************************************** - -static amd_dbgapi_callbacks_t callbacks; -static amd_dbgapi_process_id_t self; -static amd_dbgapi_code_object_id_t *code_objects_id; - -//---------------------------------------------------------- -// rocm debug api function pointers for late binding 
-//---------------------------------------------------------- - -ROCM_DEBUG_FN -( - amd_dbgapi_initialize, - ( - amd_dbgapi_callbacks_t* - ) -); - -ROCM_DEBUG_FN -( - amd_dbgapi_process_attach, - ( - amd_dbgapi_client_process_id_t, - amd_dbgapi_process_id_t * - ) -); - -ROCM_DEBUG_FN -( - amd_dbgapi_process_detach, - ( - amd_dbgapi_process_id_t - ) -); - -ROCM_DEBUG_FN -( - amd_dbgapi_process_code_object_list, - ( - amd_dbgapi_process_id_t, - size_t *, - amd_dbgapi_code_object_id_t **, - amd_dbgapi_changed_t * - ) -); - -ROCM_DEBUG_FN -( - amd_dbgapi_code_object_get_info, - ( - amd_dbgapi_code_object_id_t, - amd_dbgapi_code_object_info_t, - size_t, - void* - ) -); - -//****************************************************************************** -// private operations -//****************************************************************************** - -static amd_dbgapi_status_t -hpcrun_self_process -( - amd_dbgapi_client_process_id_t cp, - amd_dbgapi_os_process_id_t *os_pid -) -{ - *os_pid = getpid(); - return AMD_DBGAPI_STATUS_SUCCESS; -} - -static amd_dbgapi_status_t -hpcrun_insert_breakpoint -( - amd_dbgapi_client_process_id_t client_process_id, - amd_dbgapi_global_address_t address, - amd_dbgapi_breakpoint_id_t breakpoint_id -) -{ - return AMD_DBGAPI_STATUS_SUCCESS; -} - -static amd_dbgapi_status_t -hpcrun_remove_breakpoint -( - amd_dbgapi_client_process_id_t client_process_id, - amd_dbgapi_breakpoint_id_t breakpoint_id -) -{ - return AMD_DBGAPI_STATUS_SUCCESS; -} - -static void -hpcrun_log_message -( - amd_dbgapi_log_level_t level, - const char *message -) -{ - PRINT("%s\n", message); -} - -static void -check_rocm_debug_status -( - amd_dbgapi_status_t ret, - int lineNo -) -{ - if (ret == AMD_DBGAPI_STATUS_SUCCESS) { - return; - } - -#define CHECK_RET(x) case x: { PRINT("%s", #x); break; } - switch(ret) { - CHECK_RET(AMD_DBGAPI_STATUS_FATAL) - CHECK_RET(AMD_DBGAPI_STATUS_ERROR_NOT_INITIALIZED) - CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_PROCESS_ID) - 
CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT) - CHECK_RET(AMD_DBGAPI_STATUS_ERROR_CLIENT_CALLBACK) - CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_CODE_OBJECT_ID) - CHECK_RET(AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT_COMPATIBILITY) - default: - PRINT("unknown rocm debug return value"); - break; - } - -#undef CHECK_RET - - PRINT(" at line %d\n", lineNo); -} - -//****************************************************************************** -// interface operations -//****************************************************************************** - -int -rocm_debug_api_bind -( - void -) -{ - // This disable HIP's deferred code object loading. - // We can remove this when we start to use HSA API tracing - setenv("HIP_ENABLE_DEFERRED_LOADING", "0", 1); - -#ifndef HPCRUN_STATIC_LINK - // dynamic libraries only availabile in non-static case - hpcrun_force_dlopen(true); - CHK_DLOPEN(rocm_debug, "librocm-dbgapi.so", RTLD_NOW | RTLD_GLOBAL); - hpcrun_force_dlopen(false); - -#define ROCM_DEBUG_BIND(fn) \ - CHK_DLSYM(rocm_debug, fn); - - FORALL_ROCM_DEBUG_ROUTINES(ROCM_DEBUG_BIND); - -#undef ROCM_DEBUG_BIND - return DYNAMIC_BINDING_STATUS_OK; -#else - return DYNAMIC_BINDING_STATUS_ERROR; -#endif // ! 
HPCRUN_STATIC_LINK -} - -void -rocm_debug_api_init -( - void -) -{ - // Fill in call back functions for rocm debug api - callbacks.allocate_memory = malloc; - callbacks.deallocate_memory = free; - callbacks.get_os_pid = hpcrun_self_process; - callbacks.insert_breakpoint = hpcrun_insert_breakpoint; - callbacks.remove_breakpoint = hpcrun_remove_breakpoint; - callbacks.log_message = hpcrun_log_message; - - HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_initialize, (&callbacks)); - HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_attach, - ((amd_dbgapi_client_process_id_t)(&self), &self)); -} - -void -rocm_debug_api_fini -( - void -) -{ - HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_detach, (self)); -} - -void -rocm_debug_api_query_code_object -( - size_t* code_object_count_ptr -) -{ - HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_process_code_object_list, - (self, code_object_count_ptr, &code_objects_id, NULL)); - PRINT("code object count %u\n", *code_object_count_ptr); -} - -char* -rocm_debug_api_query_uri -( - size_t code_object_index -) -{ - char* uri; - HPCRUN_ROCM_DEBUG_CALL(amd_dbgapi_code_object_get_info, - (code_objects_id[code_object_index], - AMD_DBGAPI_CODE_OBJECT_INFO_URI_NAME, - sizeof(char*), (void*)(&uri))); - return uri; -} diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.c b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c new file mode 100644 index 0000000000..9e3daac034 --- /dev/null +++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.c @@ -0,0 +1,674 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2021, Rice University +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. 
+// +// ******************************************************* EndRiceCopyright * + +//****************************************************************************** +// local includes +//****************************************************************************** + +#include "rocprofiler-api.h" +#include "rocm-binary-processing.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#define DEBUG 0 + +#include "hpcrun/gpu/gpu-print.h" +//****************************************************************************** +// macros +//****************************************************************************** + + +#define PUBLIC_API __attribute__((visibility("default"))) + +#define FORALL_ROCPROFILER_ROUTINES(macro) \ + macro(rocprofiler_open) \ + macro(rocprofiler_close) \ + macro(rocprofiler_get_metrics) \ + macro(rocprofiler_set_queue_callbacks) \ + macro(rocprofiler_start_queue_callbacks) \ + macro(rocprofiler_stop_queue_callbacks) \ + macro(rocprofiler_remove_queue_callbacks) \ + macro(rocprofiler_iterate_info) \ + macro(rocprofiler_group_get_data) \ + macro(rocprofiler_get_group) + + + +#define ROCPROFILER_FN_NAME(f) DYN_FN_NAME(f) + +#define ROCPROFILER_FN(fn, args) \ + static hsa_status_t (*ROCPROFILER_FN_NAME(fn)) args + +#define HPCRUN_ROCPROFILER_CALL(fn, args) \ +{ \ + hsa_status_t status = ROCPROFILER_FN_NAME(fn) args; \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* error_string = NULL; \ + rocprofiler_error_string(&error_string); \ + fprintf(stderr, "ERROR: %s\n", error_string); \ + abort(); \ + } \ +} + +typedef struct { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +} hpcrun_amd_counter_data_t; + +//****************************************************************************** +// local variables 
+//****************************************************************************** + +// Currently we serialize kernel execution when collecting counters. +// So we have one global correlation id, counter data storage, +// and one variable indicating whether the processing is finished or not +static hpcrun_amd_counter_data_t counter_data; +static uint64_t rocprofiler_correlation_id; +static volatile int context_callback_finish; + +static bool rocprofiler_initialized = false; + +// total number of counters supported by rocprofiler, +// an array of their string names, and an array of their description +static int total_counters = 0; +static const char** counter_name = NULL; +static const char** counter_description = NULL; + +// the list of counters specified at the command line +static int *is_specified_by_user = NULL; +static int total_requested = 0; +static rocprofiler_feature_t* rocprofiler_input = NULL; +static const char** requested_counter_name = NULL; +static const char** requested_counter_description = NULL; + +// A spin lock to serialize GPU kernels +static spinlock_t kernel_lock; + +//---------------------------------------------------------- +// rocprofiler function pointers for late binding +//---------------------------------------------------------- + +ROCPROFILER_FN +( + rocprofiler_open, + ( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in/out] profiling feature array + uint32_t feature_count, // profiling feature count + rocprofiler_t** context, // [out] profiling context handle + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties // profiler properties + ) +); + +ROCPROFILER_FN +( + rocprofiler_close, + ( + rocprofiler_t* context // [in] profiling context + ) +); + +ROCPROFILER_FN +( + rocprofiler_get_metrics, + ( + rocprofiler_t* context // [in/out] profiling context + ) +); + +ROCPROFILER_FN +( + rocprofiler_set_queue_callbacks, + ( + rocprofiler_queue_callbacks_t callbacks, // callbacks + void* 
data + ) +); + +ROCPROFILER_FN +( + rocprofiler_start_queue_callbacks, + ( + void + ) +); + +ROCPROFILER_FN +( + rocprofiler_stop_queue_callbacks, + ( + void + ) +); + +ROCPROFILER_FN +( + rocprofiler_remove_queue_callbacks, + ( + void + ) +); + +ROCPROFILER_FN +( + rocprofiler_iterate_info, + ( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data + ) +); + +ROCPROFILER_FN +( + rocprofiler_group_get_data, + ( + rocprofiler_group_t* group // [in/out] profiling group + ) +); + +ROCPROFILER_FN +( + rocprofiler_get_group, + ( + rocprofiler_t* context, // [in/out] profiling context, + // will be returned as + // a part of the group structure + uint32_t index, // [in] group index + rocprofiler_group_t* group // [out] profiling group + ) +); + +//****************************************************************************** +// private operations +//****************************************************************************** + +static const char * +rocprofiler_path +( + void +) +{ + const char *path = "librocprofiler64.so"; + + return path; +} + +static void +translate_rocprofiler_output +( + gpu_activity_t* ga +) +{ + // Translate counter results stored in rocprofiler_feature_t + // to hpcrun's gpu_activity_t data structure + rocprofiler_feature_t** features = counter_data.group.features; + unsigned feature_count = counter_data.group.feature_count; + + ga->kind = GPU_ACTIVITY_COUNTER; + ga->details.counters.correlation_id = rocprofiler_correlation_id; + ga->details.counters.total_counters = feature_count; + + // This function should be called by rocprofiler thread, + // which is not monitored. So, this function will not be called + // inside a signal handler and we can call malloc. + // The memory is freed when we attribute this gpu_activity_t. 
+ ga->details.counters.values = (uint64_t*) malloc(sizeof(uint64_t) * feature_count); + + // rocprofiler should pass metric results in the same order + // that we pass metrics as input to rocprofiler + for (unsigned i = 0; i < feature_count; ++i) { + const rocprofiler_feature_t* p = features[i]; + ga->details.counters.values[i] = p->data.result_int64; + } +} + +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +static bool +rocprofiler_context_handler +( + rocprofiler_group_t group, + void* arg +) +{ + hpcrun_thread_init_mem_pool_once(0, NULL, false, true); + + // This wait-loop is taken from rocprofiler example. + // It is strange that the rocprofiler thread will have to + // wait for subscriber callback to finish. + volatile bool valid = counter_data.valid; + while (!valid) { + sched_yield(); + valid = counter_data.valid; + } + + if (counter_data.group.context == NULL) { + EMSG("error: AMD group->context = NULL"); + } + if (counter_data.group.feature_count > 0) { + HPCRUN_ROCPROFILER_CALL(rocprofiler_group_get_data, (&counter_data.group)); + HPCRUN_ROCPROFILER_CALL(rocprofiler_get_metrics, (counter_data.group.context)); + } + + gpu_activity_t ga; + memset(&ga, 0, sizeof(gpu_activity_t)); + cstack_ptr_set(&(ga.next), 0); + + translate_rocprofiler_output(&ga); + + // Consume the correlation channel for rocprofiler + gpu_monitoring_thread_activities_ready_with_idx(ROCPROFILER_CHANNEL_IDX); + if (gpu_correlation_id_map_lookup(rocprofiler_correlation_id) == NULL) { + gpu_correlation_id_map_insert(rocprofiler_correlation_id, rocprofiler_correlation_id); + } + gpu_activity_process(&ga); + + context_callback_finish = 1; + return false; +} + +static hsa_status_t +rocprofiler_dispatch_callback +( + const rocprofiler_callback_data_t* callback_data, + void* arg, + rocprofiler_group_t* group +) { + if (total_requested == 0) return HSA_STATUS_SUCCESS; + + // Passed tool data + hsa_agent_t agent = 
callback_data->agent; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + rocprofiler_t* context = NULL; + rocprofiler_properties_t properties = {}; + properties.handler = rocprofiler_context_handler; + properties.handler_arg = NULL; + + counter_data.valid = false; + HPCRUN_ROCPROFILER_CALL(rocprofiler_open, (agent, rocprofiler_input, total_requested, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties)); + + + // Get group[0] + HPCRUN_ROCPROFILER_CALL(rocprofiler_get_group, (context, 0, group)); + + // Fill profiling context entry + counter_data.agent = agent; + counter_data.group = *group; + counter_data.data = *callback_data; + counter_data.valid = true; + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t +total_counter_accumulator +( + const rocprofiler_info_data_t info, + void *data +) +{ + total_counters += 1; + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t +counter_info_accumulator +( + const rocprofiler_info_data_t info, + void *data +) +{ + if (getenv("HPCRUN_PRINT_ROCPROFILER_COUNTER_DETAILS")) { + printf("Enter counter_info_accumulator\n"); + printf("\tname %s\n", info.metric.name); + printf("\tinstances %d\n", info.metric.instances); + printf("\texpr %s\n", info.metric.expr); + printf("\tblock name %s\n", info.metric.block_name); + printf("\tblock_counters %d\n", info.metric.block_counters); + } + counter_name[total_counters] = strdup(info.metric.name); + counter_description[total_counters] = strdup(info.metric.description); + total_counters += 1; + return HSA_STATUS_SUCCESS; +} + +static void +initialize_counter_information +( + +) +{ + // First we iterate over all counters to get the total + HPCRUN_ROCPROFILER_CALL(rocprofiler_iterate_info, + (NULL, ROCPROFILER_INFO_KIND_METRIC, total_counter_accumulator, NULL)); + + // Allocate infomation array + counter_name = (const char**) malloc(total_counters * sizeof(const char*)); + counter_description = (const char**) malloc(total_counters * sizeof(const char*)); + + // Fill in 
name and description string for each counter + total_counters = 0; + HPCRUN_ROCPROFILER_CALL(rocprofiler_iterate_info, + (NULL, ROCPROFILER_INFO_KIND_METRIC, counter_info_accumulator, NULL)); + + // Allocate an array to record whether a counter is asked by the user + is_specified_by_user = (int*) malloc(total_counters * sizeof(int)); + memset(is_specified_by_user, 0, total_counters * sizeof(int)); +} + +// This function should be implemented in roctracer-api.c, +// but due to c++ism in AMD software, I can only include rocprofiler header +// filers in one .o +static void +roctracer_codeobj_callback +( + uint32_t domain, + uint32_t cid, + const void* data, + void* arg +) +{ + const hsa_evt_data_t* evt_data = (const hsa_evt_data_t*)(data); + const char* uri = evt_data->codeobj.uri; + rocm_binary_uri_add(uri); + PRINT("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n", + domain, + cid, + evt_data->codeobj.load_base, + evt_data->codeobj.load_size, + evt_data->codeobj.load_delta, + uri); + free((void*)uri); +} + +//****************************************************************************** +// AMD hidden interface operations +//****************************************************************************** + +// This is necessary for rocprofiler callback to work +extern PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings){ + // Enable hsa interception for getting code object URIs + settings->hsa_intercepting = 1; +} + +extern PUBLIC_API void OnUnloadTool() { + // Must be provided. 
Otherwise rocprofiler + // will refuse to work +} + +//****************************************************************************** +// interface operations +//****************************************************************************** + + +void +rocprofiler_start_kernel +( + uint64_t cor +) +{ + spinlock_lock(&kernel_lock); + rocprofiler_correlation_id = cor; + // We will only allow the critical section + // to finish after we get rocprofiler results + context_callback_finish = 0; + HPCRUN_ROCPROFILER_CALL(rocprofiler_start_queue_callbacks, ()); +} + + +void rocprofiler_stop_kernel(){ + HPCRUN_ROCPROFILER_CALL(rocprofiler_stop_queue_callbacks, ()); + spinlock_unlock(&kernel_lock); +} + + +void +rocprofiler_init +( + void +) +{ + if (rocprofiler_initialized) { + return; + } + // Ensure librocprofiler64.so is loaded + // and initialize all rocprofiler API function pointers + rocprofiler_initialized = true; + +#ifndef HPCRUN_STATIC_LINK + // We usually bind GPU vendor library in finalize_event_list. 
+ // But here we must do early binding to query supported list of counters + if (rocprofiler_bind() != DYNAMIC_BINDING_STATUS_OK) { + EEMSG("hpcrun: unable to bind to AMD rocprofiler library %s\n", dlerror()); + monitor_real_exit(-1); + } +#endif + + rocprofiler_queue_callbacks_t callbacks_ptrs = {}; + callbacks_ptrs.dispatch = rocprofiler_dispatch_callback; + HPCRUN_ROCPROFILER_CALL(rocprofiler_set_queue_callbacks, (callbacks_ptrs, NULL)); + + initialize_counter_information(); + + // Initialize the spin lock used to serialize GPU kernel launches + spinlock_init(&kernel_lock); + return; +} + + +void +rocprofiler_fini +( + void *args, + int how +) +{ + HPCRUN_ROCPROFILER_CALL(rocprofiler_remove_queue_callbacks, ()); + return; +} + + + +int +rocprofiler_bind +( + void +) +{ +#ifndef HPCRUN_STATIC_LINK + // dynamic libraries only availabile in non-static case + hpcrun_force_dlopen(true); + CHK_DLOPEN(rocprofiler, rocprofiler_path(), RTLD_NOW | RTLD_GLOBAL); + hpcrun_force_dlopen(false); + +#define ROCPROFILER_BIND(fn) \ + CHK_DLSYM(rocprofiler, fn); + + FORALL_ROCPROFILER_ROUTINES(ROCPROFILER_BIND); + +#undef ROCPROFILER_BIND + + hpcrun_force_dlopen(true); + //if (getenv("HPCRUN_LIST_EVENT")) { + CHK_DLOPEN(hsa, "libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + hsa_init(); + //} + hpcrun_force_dlopen(false); + + return DYNAMIC_BINDING_STATUS_OK; +#else + return DYNAMIC_BINDING_STATUS_ERROR; +#endif // ! 
HPCRUN_STATIC_LINK +} + +void +rocprofiler_wait_context_callback +( + void +) +{ + // The rocprofiler monitoring thread will set + // context_callback_finish to 1 after it finishes processing + // rocprofiler data + while (context_callback_finish == 0); +} + +int +rocprofiler_total_counters +( + void +) +{ + return total_counters; +} + +const char* +rocprofiler_counter_name +( + int idx +) +{ + if (idx < 0 || idx >= total_counters || counter_name == NULL) return NULL; + return counter_name[idx]; +} + +const char* +rocprofiler_counter_description +( + int idx +) +{ + if (idx < 0 || idx >= total_counters || counter_description == NULL) return NULL; + return counter_description[idx]; +} + +int +rocprofiler_match_event +( + const char* ev_str +) +{ + for (int i = 0; i < total_counters; i++) { + if (strcmp(ev_str, counter_name[i]) == 0) { + is_specified_by_user[i] = 1; + return 1; + } + } + return 0; +} + +void +rocprofiler_finalize_event_list +( +) +{ + for (int i = 0; i < total_counters; i++) { + if (is_specified_by_user[i] == 1) { + total_requested += 1; + } + } + + rocprofiler_input = (rocprofiler_feature_t*) malloc(sizeof(rocprofiler_feature_t) * total_requested); + memset(rocprofiler_input, 0, total_requested * sizeof(rocprofiler_feature_t)); + + requested_counter_name = (const char**) malloc(sizeof(const char*) * total_requested); + requested_counter_description = (const char**) malloc(sizeof(const char*) * total_requested); + + int cur_id = 0; + for (int i = 0; i < total_counters; i++) { + if (is_specified_by_user[i] == 1) { + rocprofiler_input[cur_id].kind = ROCPROFILER_FEATURE_KIND_METRIC; + rocprofiler_input[cur_id].name = counter_name[i]; + requested_counter_name[cur_id] = counter_name[i]; + requested_counter_description[cur_id] = counter_description[i]; + cur_id += 1; + } + } + + gpu_metrics_GPU_CTR_enable(total_requested, requested_counter_name, requested_counter_description); +} + +void +rocprofiler_uri_setup +( + void +) +{ + // Ask roctracer to set up 
code object URI callbacks + // TODO: this really should be implemented in roctracer-api.c, + // however, due to an AMD header file that is not fully C compatible, + // I can only include rocprofiler header file in one source file. + rocm_binary_uri_list_init(); + roctracer_enable_op_callback( + ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, roctracer_codeobj_callback, NULL + ); +} diff --git a/src/tool/hpcrun/gpu/amd/rocprofiler-api.h b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h new file mode 100644 index 0000000000..267db702c0 --- /dev/null +++ b/src/tool/hpcrun/gpu/amd/rocprofiler-api.h @@ -0,0 +1,136 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2022, Rice University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. +// +// ******************************************************* EndRiceCopyright * + +#ifndef rocprofiler_api_h +#define rocprofiler_api_h + +//****************************************************************************** +// macro definitions +//****************************************************************************** + +#define ROCTRACER_CHANNEL_IDX 0 +#define ROCPROFILER_CHANNEL_IDX 1 + +//****************************************************************************** +// interface operations +//****************************************************************************** + +void +rocprofiler_start_kernel +( + uint64_t +); + + +void +rocprofiler_stop_kernel +( + void +); + + +void +rocprofiler_init +( + void +); + + +void +rocprofiler_fini +( + void *args, + int how +); + + +int +rocprofiler_bind +( + void +); + +void +rocprofiler_wait_context_callback +( + void +); + +int +rocprofiler_total_counters +( + void +); + +const char* +rocprofiler_counter_name +( + int +); + +const char* +rocprofiler_counter_description +( + int +); + +int +rocprofiler_match_event +( + const char* +); + +void +rocprofiler_finalize_event_list +( + void +); + +void +rocprofiler_uri_setup +( + void +); + + +#endif diff --git 
a/src/tool/hpcrun/gpu/amd/roctracer-api.c b/src/tool/hpcrun/gpu/amd/roctracer-api.c index d02a081d27..db45e6e407 100644 --- a/src/tool/hpcrun/gpu/amd/roctracer-api.c +++ b/src/tool/hpcrun/gpu/amd/roctracer-api.c @@ -47,11 +47,15 @@ #include "roctracer-api.h" #include "roctracer-activity-translate.h" -#include "rocm-debug-api.h" + +#include "hip-api.h" #include "rocm-binary-processing.h" +#include "tool_state.h" #include +#include + #include #include #include @@ -66,11 +70,18 @@ #include + +#include "rocprofiler-api.h" + //****************************************************************************** // macros //****************************************************************************** -#define FORALL_ROCTRACER_ROUTINES(macro) \ +#define DEBUG 0 +#include + + +#define FORALL_ROCTRACER_ROUTINES(macro) \ macro(roctracer_open_pool_expl) \ macro(roctracer_flush_activity_expl) \ macro(roctracer_activity_push_external_correlation_id) \ @@ -81,7 +92,6 @@ macro(roctracer_disable_domain_activity) \ macro(roctracer_set_properties) - #define ROCTRACER_FN_NAME(f) DYN_FN_NAME(f) #define ROCTRACER_FN(fn, args) \ @@ -89,10 +99,10 @@ #define HPCRUN_ROCTRACER_CALL(fn, args) \ { \ - roctracer_status_t status = ROCTRACER_FN_NAME(fn) args; \ - if (status != ROCTRACER_STATUS_SUCCESS) { \ + roctracer_status_t status = ROCTRACER_FN_NAME(fn) args; \ + if (status != ROCTRACER_STATUS_SUCCESS) { \ /* use roctracer_error_string() */ \ - } \ + } \ } typedef const char* (*hip_kernel_name_fnt)(const hipFunction_t f); @@ -109,6 +119,11 @@ typedef const char* (*hip_kernel_name_ref_fnt)(const void* hostFunction, hipStre static hip_kernel_name_fnt hip_kernel_name_fn; static hip_kernel_name_ref_fnt hip_kernel_name_ref_fn; +// If we collect counters for GPU kernels, +// we will serilize kernel executions. 
+// Hopefully, AMD tool support will improve this the future +static bool collect_counter = false; + //---------------------------------------------------------- // roctracer function pointers for late binding //---------------------------------------------------------- @@ -230,35 +245,35 @@ roctracer_kernel_data_set { case HIP_API_ID_hipModuleLaunchKernel: entry_data->kernel.blockSharedMemory = - data->args.hipModuleLaunchKernel.sharedMemBytes; + data->args.hipModuleLaunchKernel.sharedMemBytes; entry_data->kernel.blockThreads = - data->args.hipModuleLaunchKernel.blockDimX * - data->args.hipModuleLaunchKernel.blockDimY * - data->args.hipModuleLaunchKernel.blockDimZ; + data->args.hipModuleLaunchKernel.blockDimX * + data->args.hipModuleLaunchKernel.blockDimY * + data->args.hipModuleLaunchKernel.blockDimZ; break; case HIP_API_ID_hipLaunchCooperativeKernel: entry_data->kernel.blockSharedMemory = - data->args.hipLaunchCooperativeKernel.sharedMemBytes; + data->args.hipLaunchCooperativeKernel.sharedMemBytes; entry_data->kernel.blockThreads = - data->args.hipLaunchCooperativeKernel.blockDimX.x * - data->args.hipLaunchCooperativeKernel.blockDimX.y * - data->args.hipLaunchCooperativeKernel.blockDimX.z; + data->args.hipLaunchCooperativeKernel.blockDimX.x * + data->args.hipLaunchCooperativeKernel.blockDimX.y * + data->args.hipLaunchCooperativeKernel.blockDimX.z; break; case HIP_API_ID_hipHccModuleLaunchKernel: entry_data->kernel.blockSharedMemory = - data->args.hipHccModuleLaunchKernel.sharedMemBytes; + data->args.hipHccModuleLaunchKernel.sharedMemBytes; entry_data->kernel.blockThreads = - (data->args.hipHccModuleLaunchKernel.globalWorkSizeX * - data->args.hipHccModuleLaunchKernel.globalWorkSizeY * - data->args.hipHccModuleLaunchKernel.globalWorkSizeZ) + - (data->args.hipHccModuleLaunchKernel.localWorkSizeX * - data->args.hipHccModuleLaunchKernel.localWorkSizeY * - data->args.hipHccModuleLaunchKernel.localWorkSizeZ); + (data->args.hipHccModuleLaunchKernel.globalWorkSizeX * + 
data->args.hipHccModuleLaunchKernel.globalWorkSizeY * + data->args.hipHccModuleLaunchKernel.globalWorkSizeZ) + + (data->args.hipHccModuleLaunchKernel.localWorkSizeX * + data->args.hipHccModuleLaunchKernel.localWorkSizeY * + data->args.hipHccModuleLaunchKernel.localWorkSizeZ); break; } } @@ -284,7 +299,7 @@ ensure_kernel_ip_present // is already present if (hpcrun_cct_children(kernel_ph) == NULL) { cct_node_t *kernel = - hpcrun_cct_insert_ip_norm(kernel_ph, kernel_ip); + hpcrun_cct_insert_ip_norm(kernel_ph, kernel_ip, true); hpcrun_cct_retain(kernel); } } @@ -298,11 +313,18 @@ roctracer_subscriber_callback void* arg ) { + if (is_tool_active()) { +// TMSG(ROCM, "PAPI correlation callback"); +// gpu_correlation_channel_produce(PAPI_CORR_ID, NULL, 0); + return; + } + gpu_op_placeholder_flags_t gpu_op_placeholder_flags = 0; bool is_valid_op = false; bool is_kernel_op = false; const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); - const char* kernel_name = NULL; + const char* kernel_name = NULL; + hipStream_t kernel_stream = 0; switch (callback_id) { case HIP_API_ID_hipMemcpy: @@ -329,7 +351,7 @@ roctracer_subscriber_callback case HIP_API_ID_hipMemcpyDtoHAsync: case HIP_API_ID_hipMemcpyParam2D: gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, - gpu_placeholder_type_copy); + gpu_placeholder_type_copy); is_valid_op = true; break; @@ -342,7 +364,7 @@ roctracer_subscriber_callback case HIP_API_ID_hipMalloc3D: case HIP_API_ID_hipExtMallocWithFlags: gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, - gpu_placeholder_type_alloc); + gpu_placeholder_type_alloc); is_valid_op = true; break; @@ -355,14 +377,14 @@ roctracer_subscriber_callback case HIP_API_ID_hipMemsetAsync: case HIP_API_ID_hipMemsetD32Async: gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, - gpu_placeholder_type_memset); + gpu_placeholder_type_memset); is_valid_op = true; break; case HIP_API_ID_hipFree: case HIP_API_ID_hipFreeArray: 
gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, - gpu_placeholder_type_delete); + gpu_placeholder_type_delete); is_valid_op = true; break; @@ -377,6 +399,9 @@ roctracer_subscriber_callback is_valid_op = true; is_kernel_op = true; kernel_name = hip_kernel_name_fn(data->args.hipModuleLaunchKernel.f); + if (collect_counter) { + kernel_stream = data->args.hipModuleLaunchKernel.stream; + } break; } case HIP_API_ID_hipLaunchKernel: { @@ -386,8 +411,11 @@ roctracer_subscriber_callback gpu_placeholder_type_trace); is_valid_op = true; is_kernel_op = true; - kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address, + kernel_name = hip_kernel_name_ref_fn(data->args.hipLaunchKernel.function_address, data->args.hipLaunchKernel.stream); + if (collect_counter) { + kernel_stream = data->args.hipLaunchKernel.stream; + } break; } case HIP_API_ID_hipCtxSynchronize: @@ -395,7 +423,7 @@ roctracer_subscriber_callback case HIP_API_ID_hipDeviceSynchronize: case HIP_API_ID_hipEventSynchronize: gpu_op_placeholder_flags_set(&gpu_op_placeholder_flags, - gpu_placeholder_type_sync); + gpu_placeholder_type_sync); is_valid_op = true; break; default: @@ -408,8 +436,9 @@ roctracer_subscriber_callback if (data->phase == ACTIVITY_API_PHASE_ENTER) { uint64_t correlation_id = data->correlation_id; + uint64_t rocprofiler_correlation_id = 0; cct_node_t *api_node = - gpu_application_thread_correlation_callback(correlation_id); + gpu_application_thread_correlation_callback(correlation_id); gpu_op_ccts_t gpu_op_ccts; hpcrun_safe_enter(); @@ -423,17 +452,39 @@ roctracer_subscriber_callback cct_node_t *trace_ph = gpu_op_ccts_get(&gpu_op_ccts, gpu_placeholder_type_trace); ensure_kernel_ip_present(trace_ph, kernel_ip); + + if (collect_counter) { + rocprofiler_correlation_id = correlation_id; + rocprofiler_start_kernel(rocprofiler_correlation_id); + } } hpcrun_safe_exit(); - - gpu_activity_channel_consume(gpu_metrics_attribute); + 
gpu_activity_channel_consume_with_idx(ROCTRACER_CHANNEL_IDX, gpu_metrics_attribute); + if (collect_counter) { + gpu_activity_channel_consume_with_idx(ROCPROFILER_CHANNEL_IDX, gpu_metrics_attribute); + } // Generate notification entry uint64_t cpu_submit_time = hpcrun_nanotime(); - gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time); + //gpu_monitors_apply(api_node, gpu_monitor_type_enter); + + gpu_correlation_channel_produce_with_idx(ROCTRACER_CHANNEL_IDX, correlation_id, &gpu_op_ccts, cpu_submit_time); + if (collect_counter && is_kernel_op && kernel_name != NULL) { + gpu_correlation_channel_produce_with_idx(ROCPROFILER_CHANNEL_IDX, rocprofiler_correlation_id, &gpu_op_ccts, cpu_submit_time); + } + + }else if (data->phase == ACTIVITY_API_PHASE_EXIT){ + if (is_kernel_op && collect_counter) { + //gpu_monitors_apply(NULL, gpu_monitor_type_exit); + hipStreamSynchronize(kernel_stream); + rocprofiler_wait_context_callback(); + rocprofiler_stop_kernel(); + } } + + } @@ -443,7 +494,7 @@ roctracer_buffer_completion_notify void ) { - gpu_monitoring_thread_activities_ready(); + gpu_monitoring_thread_activities_ready_with_idx(ROCTRACER_CHANNEL_IDX); } @@ -457,7 +508,7 @@ roctracer_activity_process roctracer_activity_translate(&gpu_activity, roctracer_record); if (gpu_correlation_id_map_lookup(roctracer_record->correlation_id) == NULL) { gpu_correlation_id_map_insert(roctracer_record->correlation_id, - roctracer_record->correlation_id); + roctracer_record->correlation_id); } gpu_activity_process(&gpu_activity); } @@ -493,8 +544,6 @@ roctracer_path return path; } - - //****************************************************************************** // interface operations //****************************************************************************** @@ -509,10 +558,6 @@ roctracer_bind // More details: https://github.com/ROCm-Developer-Tools/roctracer/issues/22 setenv("HSA_ENABLE_INTERRUPT", "0", 1); - if (rocm_debug_api_bind() != 
DYNAMIC_BINDING_STATUS_OK) { - return DYNAMIC_BINDING_STATUS_ERROR; - } - #ifndef HPCRUN_STATIC_LINK // dynamic libraries only availabile in non-static case hpcrun_force_dlopen(true); @@ -572,6 +617,9 @@ roctracer_init HPCRUN_ROCTRACER_CALL(roctracer_enable_domain_callback, (ACTIVITY_DOMAIN_KFD_API, roctracer_subscriber_callback, NULL)); // Enable rocTX HPCRUN_ROCTRACER_CALL(roctracer_enable_domain_callback, (ACTIVITY_DOMAIN_ROCTX, roctracer_subscriber_callback, NULL)); + + // Prepare getting URI + rocprofiler_uri_setup(); } void @@ -604,3 +652,11 @@ roctracer_fini roctracer_flush(args, how); } +void +roctracer_enable_counter_collection +( + void +) +{ + collect_counter = true; +} diff --git a/src/tool/hpcrun/gpu/amd/roctracer-api.h b/src/tool/hpcrun/gpu/amd/roctracer-api.h index af3a381849..db8462e205 100644 --- a/src/tool/hpcrun/gpu/amd/roctracer-api.h +++ b/src/tool/hpcrun/gpu/amd/roctracer-api.h @@ -79,6 +79,10 @@ roctracer_bind void ); - +void +roctracer_enable_counter_collection +( + void +); #endif diff --git a/src/tool/hpcrun/gpu/gpu-activity-channel.c b/src/tool/hpcrun/gpu/gpu-activity-channel.c index 12d1e0a755..b1386b6b01 100644 --- a/src/tool/hpcrun/gpu/gpu-activity-channel.c +++ b/src/tool/hpcrun/gpu/gpu-activity-channel.c @@ -50,6 +50,7 @@ #include "gpu-activity.h" #include "gpu-activity-channel.h" #include "gpu-channel-item-allocator.h" +#include "gpu-channel-common.h" //****************************************************************************** @@ -96,7 +97,7 @@ typedef struct gpu_activity_channel_t { // local data //****************************************************************************** -static __thread gpu_activity_channel_t *gpu_activity_channel = NULL; +static __thread gpu_activity_channel_t *gpu_activity_channels[GPU_CHANNEL_TOTAL]; @@ -134,11 +135,20 @@ gpu_activity_channel_get void ) { - if (gpu_activity_channel == NULL) { - gpu_activity_channel = gpu_activity_channel_alloc(); + return gpu_activity_channel_get_with_idx(0); +} + 
+gpu_activity_channel_t * +gpu_activity_channel_get_with_idx +( + int idx +) +{ + if (gpu_activity_channels[idx] == NULL) { + gpu_activity_channels[idx] = gpu_activity_channel_alloc(); } - return gpu_activity_channel; + return gpu_activity_channels[idx]; } @@ -164,7 +174,17 @@ gpu_activity_channel_consume gpu_activity_attribute_fn_t aa_fn ) { - gpu_activity_channel_t *channel = gpu_activity_channel_get(); + return gpu_activity_channel_consume_with_idx(0, aa_fn); +} + +void +gpu_activity_channel_consume_with_idx +( + int idx, + gpu_activity_attribute_fn_t aa_fn +) +{ + gpu_activity_channel_t *channel = gpu_activity_channel_get_with_idx(idx); // steal elements previously enqueued by the producer channel_steal(channel, bichannel_direction_forward); diff --git a/src/tool/hpcrun/gpu/gpu-activity-channel.h b/src/tool/hpcrun/gpu/gpu-activity-channel.h index 4565b797a6..e9a994c0e6 100644 --- a/src/tool/hpcrun/gpu/gpu-activity-channel.h +++ b/src/tool/hpcrun/gpu/gpu-activity-channel.h @@ -51,6 +51,7 @@ #include #include "gpu-activity.h" +#include "gpu-channel-common.h" //****************************************************************************** @@ -74,6 +75,13 @@ gpu_activity_channel_get ); +gpu_activity_channel_t * +gpu_activity_channel_get_with_idx +( + int +); + + void gpu_activity_channel_produce ( @@ -89,5 +97,12 @@ gpu_activity_channel_consume ); +void +gpu_activity_channel_consume_with_idx +( + int idx, + gpu_activity_attribute_fn_t aa_fn +); + #endif diff --git a/src/tool/hpcrun/gpu/gpu-activity-process.c b/src/tool/hpcrun/gpu/gpu-activity-process.c index 47a6dc4288..29947bfd1c 100644 --- a/src/tool/hpcrun/gpu/gpu-activity-process.c +++ b/src/tool/hpcrun/gpu/gpu-activity-process.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,11 @@ attribute_activity gpu_activity_channel_t *channel = gpu_host_correlation_map_entry_channel_get(hc); activity->cct_node = cct_node; + + PRINT("attributing activity to %p time = 
[%lu,%lu)\n", + cct_node, activity->details.interval.start, + activity->details.interval.end); + gpu_activity_channel_produce(channel, activity); } @@ -150,7 +156,7 @@ gpu_memcpy_process gpu_activity_t *activity ) { - uint32_t correlation_id = activity->details.memcpy.correlation_id; + uint64_t correlation_id = activity->details.memcpy.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); if (cid_map_entry != NULL) { @@ -194,9 +200,9 @@ gpu_memcpy_process } gpu_correlation_id_map_delete(correlation_id); } else { - PRINT("Memcpy copy correlation_id %u cannot be found\n", correlation_id); + PRINT("Memcpy copy correlation_id 0x%lx cannot be found\n", correlation_id); } - PRINT("Memcpy copy CorrelationId %u\n", correlation_id); + PRINT("Memcpy copy correlation_id 0x%lx\n", correlation_id); PRINT("Memcpy copy kind %u\n", activity->details.memcpy.copyKind); PRINT("Memcpy copy bytes %lu\n", activity->details.memcpy.bytes); } @@ -217,7 +223,7 @@ gpu_sample_process gpu_activity_t* sample ) { - uint32_t correlation_id = sample->details.pc_sampling.correlation_id; + uint64_t correlation_id = sample->details.pc_sampling.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); @@ -254,7 +260,7 @@ gpu_sample_process PRINT("host_map_entry %lu not found\n", external_id); } } else { - PRINT("correlation_id_map_entry %u not found\n", correlation_id); + PRINT("correlation_id_map_entry %lu not found\n", correlation_id); } } @@ -265,7 +271,7 @@ gpu_sampling_info_process gpu_activity_t *sri ) { - uint32_t correlation_id = sri->details.pc_sampling_info.correlation_id; + uint64_t correlation_id = sri->details.pc_sampling_info.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); if (cid_map_entry != NULL) { @@ -305,7 +311,8 @@ gpu_correlation_process if (gpu_correlation_id_map_lookup(gpu_correlation_id) == NULL) { 
gpu_correlation_id_map_insert(gpu_correlation_id, host_correlation_id); } else { - gpu_correlation_id_map_external_id_replace(gpu_correlation_id, host_correlation_id); + gpu_correlation_id_map_external_id_replace(gpu_correlation_id, + host_correlation_id); } PRINT("Correlation: native_correlation %u --> host_correlation %lu\n", gpu_correlation_id, host_correlation_id); @@ -318,7 +325,7 @@ gpu_memset_process gpu_activity_t *activity ) { - uint32_t correlation_id = activity->details.memset.correlation_id; + uint64_t correlation_id = activity->details.memset.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); if (cid_map_entry != NULL) { @@ -344,7 +351,7 @@ gpu_memset_process } gpu_correlation_id_map_delete(correlation_id); } - PRINT("Memset CorrelationId %u\n", correlation_id); + PRINT("Memset correlation_id 0x%lx\n", correlation_id); PRINT("Memset kind %u\n", activity->details.memset.memKind); PRINT("Memset bytes %lu\n", activity->details.memset.bytes); } @@ -356,7 +363,8 @@ gpu_function_process gpu_activity_t *activity ) { - gpu_function_id_map_insert(activity->details.function.function_id, activity->details.function.pc); + gpu_function_id_map_insert(activity->details.function.function_id, + activity->details.function.pc); PRINT("Function id %u\n", activity->details.function.function_id); } @@ -367,7 +375,7 @@ gpu_kernel_process gpu_activity_t *activity ) { - uint32_t correlation_id = activity->details.kernel.correlation_id; + uint64_t correlation_id = activity->details.kernel.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); @@ -417,11 +425,12 @@ gpu_kernel_process attribute_activity(host_op_entry, activity, kernel_node); } } else { - PRINT("Kernel execution correlation_id %u cannot be found\n", correlation_id); + PRINT("Kernel execution correlation_id 0x%lx cannot be found\n", + correlation_id); } PRINT("Kernel execution deviceId %u\n", 
activity->details.kernel.device_id); - PRINT("Kernel execution CorrelationId %u\n", correlation_id); + PRINT("Kernel execution correlation_id 0x%lx\n", correlation_id); } @@ -461,15 +470,17 @@ gpu_synchronization_process gpu_activity_t *activity ) { - uint32_t correlation_id = activity->details.synchronization.correlation_id; + uint64_t correlation_id = activity->details.synchronization.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); if (cid_map_entry != NULL) { uint64_t external_id = gpu_correlation_id_map_entry_external_id_get(cid_map_entry); + gpu_host_correlation_map_entry_t *host_op_entry = gpu_host_correlation_map_lookup(external_id); - if (host_op_entry != NULL) { + if (host_op_entry != NULL && external_id != IGNORE_CORR_ID) { + cct_node_t *host_op_node = gpu_host_correlation_map_entry_op_cct_get(host_op_entry, gpu_placeholder_type_sync); @@ -500,7 +511,8 @@ gpu_synchronization_process case GPU_SYNC_EVENT: { // Find the corresponding stream that records the event - gpu_event_id_map_entry_t *event_id_entry = gpu_event_id_map_lookup(event_id); + gpu_event_id_map_entry_t *event_id_entry = + gpu_event_id_map_lookup(event_id); if (event_id_entry != NULL) { context_id = gpu_event_id_map_entry_context_id_get(event_id_entry); stream_id = gpu_event_id_map_entry_stream_id_get(event_id_entry); @@ -511,7 +523,8 @@ gpu_synchronization_process } default: // invalid - PRINT("Invalid synchronization %u\n", correlation_id); + PRINT("Synchronization correlation_id 0x%lx cannot be found\n", + correlation_id); } } // TODO(Keren): handle event synchronization @@ -520,7 +533,7 @@ gpu_synchronization_process } gpu_correlation_id_map_delete(correlation_id); } - PRINT("Synchronization CorrelationId %u\n", correlation_id); + PRINT("Synchronization correlation_id 0x%lx\n", correlation_id); } @@ -530,7 +543,7 @@ gpu_cdpkernel_process gpu_activity_t *activity ) { - uint32_t correlation_id = 
activity->details.cdpkernel.correlation_id; + uint64_t correlation_id = activity->details.cdpkernel.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); if (cid_map_entry != NULL) { @@ -557,7 +570,7 @@ gpu_cdpkernel_process } gpu_correlation_id_map_delete(correlation_id); } - PRINT("Cdp Kernel CorrelationId %u\n", correlation_id); + PRINT("Cdp Kernel correlation_id 0x%lx\n", correlation_id); } @@ -575,6 +588,22 @@ gpu_event_process PRINT("GPU event %u\n", event_id); } +static gpu_placeholder_type_t +gpu_memory_placeholder +( + gpu_activity_t *activity +) +{ + gpu_mem_op_t mem_op = activity->details.memory.mem_op;; + switch(mem_op) { + case GPU_MEM_OP_ALLOC: return gpu_placeholder_type_alloc; + case GPU_MEM_OP_DELETE: return gpu_placeholder_type_delete; + default: + assert(0); + } + return gpu_placeholder_type_alloc; +} + static void gpu_memory_process @@ -582,7 +611,7 @@ gpu_memory_process gpu_activity_t *activity ) { - uint32_t correlation_id = activity->details.memory.correlation_id; + uint64_t correlation_id = activity->details.memory.correlation_id; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); if (cid_map_entry != NULL) { @@ -591,19 +620,29 @@ gpu_memory_process gpu_host_correlation_map_entry_t *host_op_entry = gpu_host_correlation_map_lookup(external_id); if (host_op_entry != NULL) { - gpu_placeholder_type_t ph = gpu_placeholder_type_alloc; + gpu_placeholder_type_t ph = gpu_memory_placeholder(activity); cct_node_t *host_op_node = gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph); assert(host_op_node != NULL); // Memory allocation does not always happen on the device // Do not send it to trace channels + + gpu_trace_item_t entry_trace; + trace_item_set(&entry_trace, activity, host_op_entry, host_op_node); + + gpu_context_stream_trace + (activity->details.memory.device_id, + activity->details.memory.context_id, + 
activity->details.memory.stream_id, + &entry_trace); + attribute_activity(host_op_entry, activity, host_op_node); } gpu_correlation_id_map_delete(correlation_id); } else { - PRINT("Memory correlation_id %u cannot be found\n", correlation_id); + PRINT("Memory correlation_id 0x%lx cannot be found\n", correlation_id); } - PRINT("Memory CorrelationId %u\n", correlation_id); + PRINT("Memory correlation_id 0x%lx\n", correlation_id); PRINT("Memory kind %u\n", activity->details.memory.memKind); PRINT("Memory bytes %lu\n", activity->details.memory.bytes); } @@ -615,7 +654,7 @@ gpu_instruction_process gpu_activity_t *activity ) { - uint32_t correlation_id = activity->details.instruction.correlation_id; + uint64_t correlation_id = activity->details.instruction.correlation_id; ip_normalized_t pc = activity->details.instruction.pc; gpu_correlation_id_map_entry_t *cid_map_entry = gpu_correlation_id_map_lookup(correlation_id); @@ -633,7 +672,49 @@ gpu_instruction_process attribute_activity(host_op_entry, activity, func_ins); } } - PRINT("Instruction correlation_id %u\n", correlation_id); + PRINT("Instruction correlation_id 0x%lx\n", correlation_id); +} + +static void +gpu_counter_process +( + gpu_activity_t *activity +) +{ + uint32_t correlation_id = activity->details.counters.correlation_id; + gpu_correlation_id_map_entry_t *cid_map_entry = + gpu_correlation_id_map_lookup(correlation_id); + if (cid_map_entry != NULL) { + uint64_t external_id = + gpu_correlation_id_map_entry_external_id_get(cid_map_entry); + gpu_host_correlation_map_entry_t *host_op_entry = + gpu_host_correlation_map_lookup(external_id); + if (host_op_entry != NULL) { + gpu_placeholder_type_t ph = gpu_placeholder_type_kernel; + cct_node_t *host_op_node = + gpu_host_correlation_map_entry_op_cct_get(host_op_entry, ph); + assert(host_op_node != NULL); + + cct_node_t *func_node = hpcrun_cct_children(host_op_node); // only child + cct_node_t *kernel_node; + if (func_node == NULL) { + kernel_node = host_op_node; + } 
else { + cct_addr_t *addr = hpcrun_cct_addr(func_node); + kernel_node = hpcrun_cct_insert_ip_norm(host_op_node, addr->ip_norm, true); + } + // Memory allocation does not always happen on the device + // Do not send it to trace channels + attribute_activity(host_op_entry, activity, kernel_node); + } + gpu_correlation_id_map_delete(correlation_id); + } else { + PRINT("Counter correlation_id %u cannot be found\n", correlation_id); + } + PRINT("Counter CorrelationId %u\n", correlation_id); + PRINT("Counter cycles %lu\n", activity->details.counters.cycles); + PRINT("Counter l2 cache hit %lu\n", activity->details.counters.l2_cache_hit); + PRINT("Counter l2 cache miss %lu\n", activity->details.counters.l2_cache_miss); } @@ -713,6 +794,10 @@ gpu_activity_process gpu_event_process(ga); break; + case GPU_ACTIVITY_COUNTER: + gpu_counter_process(ga); + break; + case GPU_ACTIVITY_MEMCPY2: default: gpu_unknown_process(ga); diff --git a/src/tool/hpcrun/gpu/gpu-activity.c b/src/tool/hpcrun/gpu/gpu-activity.c index 0a640c8636..ba595e187f 100644 --- a/src/tool/hpcrun/gpu/gpu-activity.c +++ b/src/tool/hpcrun/gpu/gpu-activity.c @@ -56,6 +56,9 @@ #include "gpu-activity.h" #include "gpu-channel-item-allocator.h" + +#define DEBUG 0 + #include "gpu-print.h" @@ -66,9 +69,6 @@ #define UNIT_TEST 0 -#define DEBUG 0 - - #define FORALL_OPENCL_KINDS(macro) \ macro(GPU_ACTIVITY_UNKNOWN) \ macro(GPU_ACTIVITY_KERNEL) \ @@ -160,6 +160,8 @@ gpu_interval_set { interval->start = start; interval->end = end; + PRINT("gpu interval: [%lu, %lu) delta = %ld\n", interval->start, + interval->end, interval->end - interval->start); } diff --git a/src/tool/hpcrun/gpu/gpu-activity.h b/src/tool/hpcrun/gpu/gpu-activity.h index 786c4da451..7cf388de90 100644 --- a/src/tool/hpcrun/gpu/gpu-activity.h +++ b/src/tool/hpcrun/gpu/gpu-activity.h @@ -95,7 +95,8 @@ typedef enum { GPU_ACTIVITY_EXTERNAL_CORRELATION = 14, GPU_ACTIVITY_EVENT = 15, GPU_ACTIVITY_FUNCTION = 16, - GPU_ACTIVITY_FLUSH = 17 + GPU_ACTIVITY_FLUSH = 17, + 
GPU_ACTIVITY_COUNTER = 18 } gpu_activity_kind_t; @@ -179,9 +180,16 @@ typedef enum { } gpu_mem_type_t; +typedef enum { + GPU_MEM_OP_ALLOC = 0, + GPU_MEM_OP_DELETE = 1, + GPU_MEM_OP_UNKNOWN = 2 +} gpu_mem_op_t; + + // pc sampling typedef struct gpu_pc_sampling_t { - uint32_t correlation_id; + uint64_t correlation_id; ip_normalized_t pc; uint32_t samples; uint32_t latencySamples; @@ -190,7 +198,7 @@ typedef struct gpu_pc_sampling_t { typedef struct gpu_pc_sampling_info_t { - uint32_t correlation_id; + uint64_t correlation_id; uint64_t droppedSamples; uint64_t samplingPeriodInCycles; uint64_t totalSamples; @@ -217,7 +225,7 @@ typedef struct gpu_memcpy_t { uint64_t end; uint64_t bytes; uint64_t submit_time; - uint32_t correlation_id; + uint64_t correlation_id; uint32_t device_id; uint32_t context_id; uint32_t stream_id; @@ -231,10 +239,12 @@ typedef struct gpu_memory_t { uint64_t start; uint64_t end; uint64_t bytes; - uint32_t correlation_id; + uint64_t correlation_id; uint32_t device_id; uint32_t context_id; + uint32_t stream_id; gpu_mem_type_t memKind; + gpu_mem_op_t mem_op; } gpu_memory_t; @@ -243,7 +253,7 @@ typedef struct gpu_memset_t { uint64_t start; uint64_t end; uint64_t bytes; - uint32_t correlation_id; + uint64_t correlation_id; uint32_t device_id; uint32_t context_id; uint32_t stream_id; @@ -256,7 +266,7 @@ typedef struct gpu_kernel_t { uint64_t start; uint64_t end; uint64_t submit_time; - uint32_t correlation_id; + uint64_t correlation_id; uint32_t device_id; uint32_t context_id; uint32_t stream_id; @@ -282,7 +292,7 @@ typedef struct gpu_kernel_block_t { typedef struct gpu_cdpkernel_t { uint64_t start; uint64_t end; - uint32_t correlation_id; + uint64_t correlation_id; uint32_t device_id; uint32_t context_id; uint32_t stream_id; @@ -303,7 +313,7 @@ typedef struct gpu_event_t { typedef struct gpu_global_access_t { - uint32_t correlation_id; + uint64_t correlation_id; ip_normalized_t pc; uint64_t l2_transactions; uint64_t theoreticalL2Transactions; @@ -313,7 
+323,7 @@ typedef struct gpu_global_access_t { typedef struct gpu_local_access_t { - uint32_t correlation_id; + uint64_t correlation_id; ip_normalized_t pc; uint64_t sharedTransactions; uint64_t theoreticalSharedTransactions; @@ -323,7 +333,7 @@ typedef struct gpu_local_access_t { typedef struct gpu_branch_t { - uint32_t correlation_id; + uint64_t correlation_id; ip_normalized_t pc; uint32_t diverged; uint32_t executed; @@ -333,7 +343,7 @@ typedef struct gpu_branch_t { typedef struct gpu_synchronization_t { uint64_t start; uint64_t end; - uint32_t correlation_id; + uint64_t correlation_id; uint32_t context_id; uint32_t stream_id; uint32_t event_id; @@ -342,10 +352,19 @@ typedef struct gpu_synchronization_t { typedef struct gpu_host_correlation_t { - uint32_t correlation_id; + uint64_t correlation_id; uint64_t host_correlation_id; } gpu_host_correlation_t; +typedef struct gpu_counter_t { + uint32_t correlation_id; + int total_counters; + // The function that creates the structure should + // be responsible for allocating memory. + // The function that attributes the structure should + // be responsible for deallocating the memory. 
+ uint64_t* values; +} gpu_counter_t; // a type that can be used to access start and end times // for a subset of activity kinds including kernel execution, @@ -357,7 +376,7 @@ typedef struct gpu_interval_t { typedef struct gpu_instruction_t { - uint32_t correlation_id; + uint64_t correlation_id; ip_normalized_t pc; } gpu_instruction_t; @@ -383,6 +402,7 @@ typedef struct gpu_activity_details_t { gpu_synchronization_t synchronization; gpu_host_correlation_t correlation; gpu_flush_t flush; + gpu_counter_t counters; /* Access short cut for activitiy fields shared by multiple kinds */ diff --git a/src/tool/hpcrun/gpu/gpu-application-thread-api.c b/src/tool/hpcrun/gpu/gpu-application-thread-api.c index 7d3e6b3f9f..1bc5b20c70 100644 --- a/src/tool/hpcrun/gpu/gpu-application-thread-api.c +++ b/src/tool/hpcrun/gpu/gpu-application-thread-api.c @@ -118,7 +118,6 @@ gpu_application_thread_correlation_callback } } - // skip procedure frames in libhpcrun while (libhpcrun_id != 0 && node_addr->ip_norm.lm_id == libhpcrun_id) { node = hpcrun_cct_parent(node); diff --git a/src/tool/hpcrun/gpu/gpu-channel-common.h b/src/tool/hpcrun/gpu/gpu-channel-common.h new file mode 100644 index 0000000000..b2396dd1a7 --- /dev/null +++ b/src/tool/hpcrun/gpu/gpu-channel-common.h @@ -0,0 +1,24 @@ +#ifndef GPU_CHANNEL_COMMON_H +#define GPU_CHANNEL_COMMON_H + +// GPU_CHANNEL_TOTAL specifies the total number +// of correlation and activity channels an application +// thread will create. +// This is created for supporting AMD GPUs, +// where roctracer and rocprofiler will each create +// one monitoring thread. +// As the implementation of the channel is one-producer-one-consumer, +// we need an array of correlation and +// activity channels for each application thread. +// For platforms where there is just one monitoring +// thread, such as NVIDIA, the implementation maintains +// backward compatibility, where we will just use +// the first channel pair.
+// Implementation wise, channel operations without _with_idx suffix +// represent old operations and will use channel 0 +// Channel operations with _with_idx suffix requires a channel +// index to specify which channel to operate with + +#define GPU_CHANNEL_TOTAL 2 + +#endif diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c index f7f2d95a93..5557818cac 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c +++ b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.c @@ -51,6 +51,7 @@ #include "gpu-correlation-channel.h" #include "gpu-correlation-channel-set.h" +#include "gpu-channel-common.h" @@ -99,7 +100,7 @@ typed_stack_declare_type(gpu_correlation_channel_ptr_t); static typed_stack_elem_ptr(gpu_correlation_channel_ptr_t) -gpu_correlation_channel_stack; +gpu_correlation_channel_stacks[GPU_CHANNEL_TOTAL]; @@ -128,12 +129,13 @@ channel_forone static void -gpu_correlation_channel_set_forall +gpu_correlation_channel_set_forall_with_idx ( + int idx, gpu_correlation_channel_fn_t channel_fn ) { - channel_stack_forall(&gpu_correlation_channel_stack, channel_forone, + channel_stack_forall(&gpu_correlation_channel_stacks[idx], channel_forone, channel_fn); } @@ -143,8 +145,9 @@ gpu_correlation_channel_set_forall //****************************************************************************** void -gpu_correlation_channel_set_insert +gpu_correlation_channel_set_insert_with_idx ( + int idx, gpu_correlation_channel_t *channel ) { @@ -157,15 +160,15 @@ gpu_correlation_channel_set_insert channel_stack_elem_ptr_set(e, 0); // clear the entry's next ptr // add the entry to the channel stack - channel_stack_push(&gpu_correlation_channel_stack, e); + channel_stack_push(&gpu_correlation_channel_stacks[idx], e); } void -gpu_correlation_channel_set_consume +gpu_correlation_channel_set_consume_with_idx ( - void + int idx ) { - gpu_correlation_channel_set_forall(gpu_correlation_channel_consume); + 
gpu_correlation_channel_set_forall_with_idx(idx, gpu_correlation_channel_consume); } diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h index 5eac5a7d5a..091ba7394c 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h +++ b/src/tool/hpcrun/gpu/gpu-correlation-channel-set.h @@ -70,16 +70,17 @@ typedef void (*gpu_correlation_channel_fn_t) //****************************************************************************** void -gpu_correlation_channel_set_insert +gpu_correlation_channel_set_insert_with_idx ( + int idx, gpu_correlation_channel_t *channel ); void -gpu_correlation_channel_set_consume +gpu_correlation_channel_set_consume_with_idx ( - void + int idx ); diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel.c b/src/tool/hpcrun/gpu/gpu-correlation-channel.c index 47a8345554..cf855b1c54 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-channel.c +++ b/src/tool/hpcrun/gpu/gpu-correlation-channel.c @@ -71,7 +71,7 @@ #define typed_bichannel(x) gpu_correlation_channel_t #define typed_stack_elem(x) gpu_correlation_t -// define macros that simplify use of correlation channel API +// define macros that simplify use of correlation channel API #define channel_init \ typed_bichannel_init(gpu_correlation_t) @@ -100,7 +100,7 @@ typedef struct gpu_correlation_channel_t { // local data //****************************************************************************** -static __thread gpu_correlation_channel_t *gpu_correlation_channel = NULL; +static __thread gpu_correlation_channel_t *gpu_correlation_channels[GPU_CHANNEL_TOTAL]; @@ -113,37 +113,35 @@ typed_bichannel_impl(gpu_correlation_t) static gpu_correlation_channel_t * -gpu_correlation_channel_alloc +gpu_correlation_channel_alloc_with_idx ( - void + int idx ) { - gpu_correlation_channel_t *c = + gpu_correlation_channel_t *c = hpcrun_malloc_safe(sizeof(gpu_correlation_channel_t)); channel_init(c); - gpu_correlation_channel_set_insert(c); + 
gpu_correlation_channel_set_insert_with_idx(idx, c); return c; } static gpu_correlation_channel_t * -gpu_correlation_channel_get +gpu_correlation_channel_get_with_idx ( - void + int idx ) { - if (gpu_correlation_channel == NULL) { - gpu_correlation_channel = gpu_correlation_channel_alloc(); + if (gpu_correlation_channels[idx] == NULL) { + gpu_correlation_channels[idx] = gpu_correlation_channel_alloc_with_idx(idx); } - return gpu_correlation_channel; + return gpu_correlation_channels[idx]; } - - //****************************************************************************** // interface functions //****************************************************************************** @@ -156,8 +154,21 @@ gpu_correlation_channel_produce uint64_t cpu_submit_time ) { - gpu_correlation_channel_t *corr_channel = gpu_correlation_channel_get(); - gpu_activity_channel_t *activity_channel = gpu_activity_channel_get(); + // Relaying parameters with index 0 + gpu_correlation_channel_produce_with_idx(0, host_correlation_id, gpu_op_ccts, cpu_submit_time); +} + +void +gpu_correlation_channel_produce_with_idx +( + int idx, + uint64_t host_correlation_id, + gpu_op_ccts_t *gpu_op_ccts, + uint64_t cpu_submit_time +) +{ + gpu_correlation_channel_t *corr_channel = gpu_correlation_channel_get_with_idx(idx); + gpu_activity_channel_t *activity_channel = gpu_activity_channel_get_with_idx(idx); gpu_correlation_t *c = gpu_correlation_alloc(corr_channel); @@ -167,7 +178,6 @@ gpu_correlation_channel_produce channel_push(corr_channel, bichannel_direction_forward, c); } - void gpu_correlation_channel_consume ( @@ -204,7 +214,7 @@ gpu_correlation_channel_consume void *hpcrun_malloc_safe ( size_t s -) +) { return malloc(s); } @@ -214,7 +224,7 @@ gpu_activity_channel_t * gpu_activity_channel_get ( void -) +) { return (gpu_activity_channel_t *) 0x5000; } @@ -223,7 +233,7 @@ gpu_activity_channel_get int main ( - int argc, + int argc, char **argv ) { diff --git a/src/tool/hpcrun/gpu/gpu-correlation-channel.h 
b/src/tool/hpcrun/gpu/gpu-correlation-channel.h index 33fcc0185e..5e321d6730 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-channel.h +++ b/src/tool/hpcrun/gpu/gpu-correlation-channel.h @@ -50,8 +50,7 @@ //****************************************************************************** #include "gpu-correlation.h" - - +#include "gpu-channel-common.h" //****************************************************************************** // type declarations @@ -67,7 +66,7 @@ typedef struct gpu_op_ccts_t gpu_op_ccts_t; // interface operations //****************************************************************************** -// produce into a channel that my thread created +// produce into the first channel that my thread created void gpu_correlation_channel_produce ( @@ -76,6 +75,16 @@ gpu_correlation_channel_produce uint64_t cpu_submit_time ); +// produce into a specified channel (with idx) that my thread created +// when idx == 0, this function is equivalent to gpu_correlation_channel_produce +void +gpu_correlation_channel_produce_with_idx +( + int idx, + uint64_t host_correlation_id, + gpu_op_ccts_t *gpu_ccts, + uint64_t cpu_submit_time +); // consume from a channel that another thread created void diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c index 90ba4a0470..473640b811 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.c +++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.c @@ -109,7 +109,7 @@ typedef struct typed_splay_node(correlation_id) { uint32_t device_id; uint64_t start; uint64_t end; -} typed_splay_node(correlation_id); +} typed_splay_node(correlation_id); @@ -119,9 +119,9 @@ typedef struct typed_splay_node(correlation_id) { // local data //****************************************************************************** -static gpu_correlation_id_map_entry_t *map_root = NULL; +static __thread gpu_correlation_id_map_entry_t *map_root = NULL; -static gpu_correlation_id_map_entry_t *free_list = NULL; 
+static __thread gpu_correlation_id_map_entry_t *free_list = NULL; @@ -142,13 +142,13 @@ gpu_correlation_id_map_entry_alloc() static gpu_correlation_id_map_entry_t * gpu_correlation_id_map_entry_new ( - uint32_t gpu_correlation_id, + uint64_t gpu_correlation_id, uint64_t host_correlation_id ) { gpu_correlation_id_map_entry_t *e = gpu_correlation_id_map_entry_alloc(); - memset(e, 0, sizeof(gpu_correlation_id_map_entry_t)); + memset(e, 0, sizeof(gpu_correlation_id_map_entry_t)); e->gpu_correlation_id = gpu_correlation_id; e->host_correlation_id = host_correlation_id; @@ -165,13 +165,13 @@ gpu_correlation_id_map_entry_new gpu_correlation_id_map_entry_t * gpu_correlation_id_map_lookup ( - uint32_t gpu_correlation_id + uint64_t gpu_correlation_id ) { uint64_t correlation_id = gpu_correlation_id; gpu_correlation_id_map_entry_t *result = st_lookup(&map_root, correlation_id); - PRINT("correlation_id map lookup: id=0x%lx (record %p)\n", + PRINT("correlation_id map lookup: id=0x%lx (record %p)\n", correlation_id, result); return result; @@ -181,21 +181,21 @@ gpu_correlation_id_map_lookup void gpu_correlation_id_map_insert ( - uint32_t gpu_correlation_id, + uint64_t gpu_correlation_id, uint64_t host_correlation_id ) { - if (st_lookup(&map_root, gpu_correlation_id)) { + if (st_lookup(&map_root, gpu_correlation_id)) { // fatal error: correlation_id already present; a // correlation should be inserted only once. 
assert(0); } else { - gpu_correlation_id_map_entry_t *entry = + gpu_correlation_id_map_entry_t *entry = gpu_correlation_id_map_entry_new(gpu_correlation_id, host_correlation_id); st_insert(&map_root, entry); - PRINT("correlation_id_map insert: correlation_id=0x%lx external_id=%ld (entry=%p)\n", + PRINT("correlation_id_map insert: correlation_id=0x%lx external_id=%ld (entry=%p)\n", gpu_correlation_id, host_correlation_id, entry); } } @@ -205,7 +205,7 @@ gpu_correlation_id_map_insert void gpu_correlation_id_map_external_id_replace ( - uint32_t gpu_correlation_id, + uint64_t gpu_correlation_id, uint64_t host_correlation_id ) { @@ -221,7 +221,7 @@ gpu_correlation_id_map_external_id_replace void gpu_correlation_id_map_delete ( - uint32_t gpu_correlation_id + uint64_t gpu_correlation_id ) { gpu_correlation_id_map_entry_t *node = st_delete(&map_root, gpu_correlation_id); @@ -232,7 +232,7 @@ gpu_correlation_id_map_delete void gpu_correlation_id_map_kernel_update ( - uint32_t gpu_correlation_id, + uint64_t gpu_correlation_id, uint32_t device_id, uint64_t start, uint64_t end @@ -280,7 +280,7 @@ gpu_correlation_id_map_entry_end_get } -uint32_t +uint64_t gpu_correlation_id_map_entry_device_id_get ( gpu_correlation_id_map_entry_t *entry diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id-map.h b/src/tool/hpcrun/gpu/gpu-correlation-id-map.h index 1ba5b2a5b0..8a0340ed81 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-id-map.h +++ b/src/tool/hpcrun/gpu/gpu-correlation-id-map.h @@ -71,14 +71,14 @@ typedef struct cct_node_t cct_node_t; gpu_correlation_id_map_entry_t * gpu_correlation_id_map_lookup ( - uint32_t gpu_correlation_id + uint64_t gpu_correlation_id ); void gpu_correlation_id_map_insert ( - uint32_t gpu_correlation_id, + uint64_t gpu_correlation_id, uint64_t host_correlation_id ); @@ -86,14 +86,14 @@ gpu_correlation_id_map_insert void gpu_correlation_id_map_delete ( - uint32_t gpu_correlation_id + uint64_t gpu_correlation_id ); void 
gpu_correlation_id_map_external_id_replace ( - uint32_t gpu_correlation_id, + uint64_t gpu_correlation_id, uint64_t host_correlation_id ); @@ -101,7 +101,7 @@ gpu_correlation_id_map_external_id_replace void gpu_correlation_id_map_kernel_update ( - uint32_t correlation_id, + uint64_t correlation_id, uint32_t device_id, uint64_t start, uint64_t end @@ -129,7 +129,7 @@ gpu_correlation_id_map_entry_end_get ); -uint32_t +uint64_t gpu_correlation_id_map_entry_device_id_get ( gpu_correlation_id_map_entry_t *entry diff --git a/src/tool/hpcrun/gpu/gpu-correlation-id.h b/src/tool/hpcrun/gpu/gpu-correlation-id.h index 59a138e2b9..0f24428696 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation-id.h +++ b/src/tool/hpcrun/gpu/gpu-correlation-id.h @@ -52,7 +52,8 @@ #include - +//we use this for our activity that should be ignored +#define IGNORE_CORR_ID (~0ULL) //****************************************************************************** // interface operations diff --git a/src/tool/hpcrun/gpu/gpu-correlation.c b/src/tool/hpcrun/gpu/gpu-correlation.c index bed6e41d76..ebc91cab25 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation.c +++ b/src/tool/hpcrun/gpu/gpu-correlation.c @@ -103,7 +103,7 @@ gpu_correlation_produce { PRINT("Produce correlation id 0x%lx\n", host_correlation_id); c->host_correlation_id = host_correlation_id; - c->gpu_op_ccts = *gpu_op_ccts; + if (gpu_op_ccts) c->gpu_op_ccts = *gpu_op_ccts; c->activity_channel = activity_channel; c->cpu_submit_time = cpu_submit_time; } diff --git a/src/tool/hpcrun/gpu/gpu-correlation.h b/src/tool/hpcrun/gpu/gpu-correlation.h index d9ecce8262..7b83680add 100644 --- a/src/tool/hpcrun/gpu/gpu-correlation.h +++ b/src/tool/hpcrun/gpu/gpu-correlation.h @@ -61,7 +61,6 @@ #define UNIT_TEST_CORRELATION_HEADER 0 - //****************************************************************************** // forward type declarations //****************************************************************************** diff --git 
a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c index fdd8edb583..f2a3c28cca 100644 --- a/src/tool/hpcrun/gpu/gpu-host-correlation-map.c +++ b/src/tool/hpcrun/gpu/gpu-host-correlation-map.c @@ -125,11 +125,11 @@ typedef struct typed_splay_node(host_correlation) { // local data //****************************************************************************** -static gpu_host_correlation_map_entry_t *map_root = NULL; +static __thread gpu_host_correlation_map_entry_t *map_root = NULL; -static gpu_host_correlation_map_entry_t *free_list = NULL; +static __thread gpu_host_correlation_map_entry_t *free_list = NULL; -static bool allow_replace = false; +static __thread bool allow_replace = false; //****************************************************************************** // private operations @@ -200,7 +200,8 @@ gpu_host_correlation_map_lookup { gpu_host_correlation_map_entry_t *result = st_lookup(&map_root, host_correlation_id); - PRINT("host_correlation_map lookup: id=0x%lx (entry %p)\n", host_correlation_id, result); + PRINT("host_correlation_map lookup: id=0x%lx (entry %p) (&map_root=%p) tid=%llu\n", + host_correlation_id, result, &map_root, (uint64_t) pthread_self()); return result; } @@ -234,8 +235,9 @@ gpu_host_correlation_map_insert st_insert(&map_root, entry); PRINT("host_correlation_map insert: correlation_id=0x%lx " - "activity_channel=%p (entry=%p)\n", - host_correlation_id, activity_channel, entry); + "activity_channel=%p (entry=%p) (&map_root=%p) tid=%llu\n", + host_correlation_id, activity_channel, entry, &map_root, + (uint64_t) pthread_self()); } } diff --git a/src/tool/hpcrun/gpu/gpu-metrics.c b/src/tool/hpcrun/gpu/gpu-metrics.c index 84e81657fa..139bfddcb8 100644 --- a/src/tool/hpcrun/gpu/gpu-metrics.c +++ b/src/tool/hpcrun/gpu/gpu-metrics.c @@ -84,7 +84,9 @@ macro(GPU_INST, 9) \ macro(GTIMES, 10) \ macro(KINFO, 12) \ - macro(GSAMP, 13) + macro(GSAMP, 13) \ + macro(GXFER, 14) \ + macro(CTR, 3) #define 
FORALL_METRIC_KINDS(macro) \ @@ -203,11 +205,14 @@ name ## _metric_kind // local variables //***************************************************************************** -FORALL_METRIC_KINDS(INITIALIZE_METRIC_KINDS) +FORALL_METRIC_KINDS(INITIALIZE_METRIC_KINDS); -FORALL_INDEXED_METRIC_KINDS(INITIALIZE_INDEXED_METRIC) +FORALL_INDEXED_METRIC_KINDS(INITIALIZE_INDEXED_METRIC); -FORALL_SCALAR_METRIC_KINDS(INITIALIZE_SCALAR_METRIC_KIND) +FORALL_SCALAR_METRIC_KINDS(INITIALIZE_SCALAR_METRIC_KIND); + +static kind_info_t* GPU_COUNTER_METRIC_KIND_INFO = NULL; +static int* gpu_counter_hpcrun_metric_id_array = NULL; static const unsigned int MAX_CHAR_FORMULA = 32; @@ -592,6 +597,59 @@ gpu_metrics_attribute_branch b->executed); } +static void +gpu_metrics_attribute_counter +( + gpu_activity_t *activity +) +{ + gpu_counter_t * c = &(activity->details.counters); + cct_node_t *cct_node = activity->cct_node; + + metric_data_list_t *metrics = + hpcrun_reify_metric_set(cct_node,gpu_counter_hpcrun_metric_id_array[0]); + + for (int i = 0; i < c->total_counters; ++i) { + gpu_metrics_attribute_metric_int(metrics, gpu_counter_hpcrun_metric_id_array[i], c->values[i]); + } + + free(c->values); +} + +static void +gpu_metrics_attribute_link +( +gpu_activity_t *activity +) +{ + + printf("Attrubute NVLINK not implemented\n\n"); +// gpu_link_t *m = &(activity->details.memcpy); +// cct_node_t *cct_node = activity->cct_node; + +// metric_data_list_t *metrics = +// hpcrun_reify_metric_set(cct_node, METRIC_ID(GPU_KINFO_STMEM_ACUMU)); +// +// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT), +// m->staticSharedMemory); +// +// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_RCV), +// m->dynamicSharedMemory); +// +// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XMIT_TP), +// m->localMemoryTotal); +// +// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XRCV_TP), +// m->activeWarpsPerSM); +// +// gpu_metrics_attribute_metric_int(metrics, 
METRIC_ID(GPU_XFER_XMIT_COUNT), +// m->activeWarpsPerSM); +// +// gpu_metrics_attribute_metric_int(metrics, METRIC_ID(GPU_XFER_XRCV_COUNT), +// m->activeWarpsPerSM); + + +} //****************************************************************************** // interface operations @@ -652,6 +710,9 @@ gpu_metrics_attribute gpu_metrics_attribute_branch(activity); break; + case GPU_ACTIVITY_COUNTER: + gpu_metrics_attribute_counter(activity); + break; default: break; } @@ -898,3 +959,42 @@ gpu_metrics_GPU_INST_STALL_enable FINALIZE_METRIC_KIND(); } + +void +gpu_metrics_GPU_CTR_enable +( + int total, + const char** counter_name, + const char** counter_desc +) +{ + gpu_counter_hpcrun_metric_id_array = (int*) malloc(sizeof(int) * total); + + GPU_COUNTER_METRIC_KIND_INFO = hpcrun_metrics_new_kind(); + + for (int i = 0; i < total; ++i) { + gpu_counter_hpcrun_metric_id_array[i] = hpcrun_set_new_metric_desc_and_period( + GPU_COUNTER_METRIC_KIND_INFO, counter_name[i], counter_desc[i], + MetricFlags_ValFmt_Int, 1, metric_property_none + ); + } + + hpcrun_close_kind(GPU_COUNTER_METRIC_KIND_INFO); +} + + +void +gpu_metrics_GXFER_enable +( +void +) +{ +//#undef CURRENT_METRIC +//#define CURRENT_METRIC GXFER + + //INITIALIZE_METRIC_KIND(); + + //FORALL_GXFER(INITIALIZE_SCALAR_METRIC_INT) + + //FINALIZE_METRIC_KIND(); +} diff --git a/src/tool/hpcrun/gpu/gpu-metrics.h b/src/tool/hpcrun/gpu/gpu-metrics.h index 6e05548fb1..b997487198 100644 --- a/src/tool/hpcrun/gpu/gpu-metrics.h +++ b/src/tool/hpcrun/gpu/gpu-metrics.h @@ -89,6 +89,15 @@ typedef enum { } gpu_lmem_ops_t; +typedef enum { +GPU_XFER_XMIT = 0, +GPU_XFER_XRCV = 1, +GPU_XFER_XMIT_TP = 2, +GPU_XFER_XRCV_TP = 3, +GPU_XFER_XMIT_COUNT = 4, +GPU_XFER_XRCV_COUNT = 5 +} gpu_xfer_ops_t; + //-------------------------------------------------------------------------- // indexed metrics @@ -329,7 +338,6 @@ typedef enum { "GPU kernel: launch count") \ macro("GKER:OCC_THR", GPU_KINFO_OCCUPANCY_THR, \ "GPU kernel: theoretical occupancy (FGP_ACT 
// gpu transfer (link) information
//
// Applies 'macro' once per link-transfer metric with three arguments:
// the metric name, its index in gpu_xfer_ops_t, and a human-readable
// description.
#define FORALL_GXFER(macro)					\
  macro("GXFER:XMIT (B)", GPU_XFER_XMIT,			\
	"GPU link total data transmitted")			\
  macro("GXFER:XRCV (B)", GPU_XFER_XRCV,			\
	"GPU link total data received")				\
  macro("GXFER:XMIT_TP (GB)", GPU_XFER_XMIT_TP,			\
	"GPU link total transmit throughput")			\
  macro("GXFER:XRCV_TP (GB)", GPU_XFER_XRCV_TP,			\
	"GPU link total received throughput")			\
  macro("GXFER:XMIT_COUNT", GPU_XFER_XMIT_COUNT,		\
	"GPU link launch count transmitted")			\
  macro("GXFER:XRCV_COUNT", GPU_XFER_XRCV_COUNT,		\
	"GPU link launch count received")
+void +gpu_metrics_GPU_CTR_enable +( + int, + const char**, + const char** +); + + + //-------------------------------------------------- // attribute GPU measurements to an application // thread's calling context tree diff --git a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c index 1c4a937374..361262069b 100644 --- a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c +++ b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.c @@ -60,6 +60,15 @@ gpu_monitoring_thread_activities_ready void ) { - gpu_correlation_channel_set_consume(); + gpu_correlation_channel_set_consume_with_idx(0); +} + +void +gpu_monitoring_thread_activities_ready_with_idx +( + int idx +) +{ + gpu_correlation_channel_set_consume_with_idx(idx); } diff --git a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h index 881667601e..c3d02d4c82 100644 --- a/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h +++ b/src/tool/hpcrun/gpu/gpu-monitoring-thread-api.h @@ -57,5 +57,12 @@ gpu_monitoring_thread_activities_ready ); +void +gpu_monitoring_thread_activities_ready_with_idx +( + int idx +); + + #endif diff --git a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c index 9254575f5f..cba3cef38b 100644 --- a/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c +++ b/src/tool/hpcrun/gpu/gpu-operation-multiplexer.c @@ -93,6 +93,7 @@ gpu_init_operation_channel(){ } +// OpenCL Monitoring thread static void * gpu_operation_record ( @@ -133,9 +134,11 @@ gpu_operation_multiplexer_create gpu_operation_channel_set_alloc(max_completion_cb_threads); - // You are the first to create monitor thread + monitor_disable_new_threads(); + // Create monitor thread pthread_create(&thread, NULL, (pthread_start_routine_t) gpu_operation_record, NULL); + monitor_enable_new_threads(); } diff --git a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c index 
ea6827159f..33a9079000 100644 --- a/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c +++ b/src/tool/hpcrun/gpu/gpu-trace-demultiplexer.c @@ -54,6 +54,8 @@ #include "gpu-trace-demultiplexer.h" #include "gpu-print.h" +#include + //****************************************************************************** // type declarations @@ -96,8 +98,11 @@ gpu_trace_channel_set_create new_channel_set->channel_set_ptr = gpu_trace_channel_set_alloc(streams_per_thread); atomic_store(&new_channel_set->channel_index, 0); + monitor_disable_new_threads(); + // Create tracing thread pthread_create(&new_channel_set->thread, NULL, (pthread_start_routine_t) gpu_trace_record, new_channel_set); + monitor_enable_new_threads(); return new_channel_set; } diff --git a/src/tool/hpcrun/gpu/gpu-trace.c b/src/tool/hpcrun/gpu/gpu-trace.c index bdf00173b7..d548dae439 100644 --- a/src/tool/hpcrun/gpu/gpu-trace.c +++ b/src/tool/hpcrun/gpu/gpu-trace.c @@ -195,19 +195,6 @@ gpu_trace_cct_insert_context } -static uint64_t -gpu_trace_time -( - uint64_t gpu_time -) -{ - // return time in ns - uint64_t time = gpu_time; - - return time; -} - - static void gpu_trace_stream_append ( @@ -371,6 +358,7 @@ gpu_trace_fini } +// Tracing thread void * gpu_trace_record ( @@ -380,6 +368,7 @@ gpu_trace_record gpu_trace_channel_set_t *channel_set = (gpu_trace_channel_set_t *) args; hpcrun_thread_init_mem_pool_once(0, NULL, false, true); + atomic_fetch_add(&active_streams_counter, 1); while (!atomic_load(&stop_trace_flag)) { //getting data from a trace channel @@ -408,9 +397,6 @@ gpu_trace_create monitor_disable_new_threads(); trace->thread = gpu_trace_demultiplexer_push(trace->trace_channel); - atomic_fetch_add(&active_streams_counter, 1); - - monitor_enable_new_threads(); return trace; } @@ -450,8 +436,8 @@ consume_one_trace_item cct_node_t *leaf = gpu_trace_cct_insert_context(td, call_path); - uint64_t start = gpu_trace_time(start_time); - uint64_t end = gpu_trace_time(end_time); + uint64_t start = start_time; + uint64_t 
end = end_time; stream_start_set(start_time); @@ -480,8 +466,11 @@ consume_one_trace_item if (append) { gpu_trace_stream_append(td, leaf, start); - gpu_trace_stream_append(td, no_activity, end + 1); - PRINT("%p Append trace activity [%lu, %lu]\n", td, start, end); + // note: adding 1 to end makes sense. however, with AMD OMPT, this + // causes adjacent events to share a timestamp. so, don't add 1. + gpu_trace_stream_append(td, no_activity, end); + + PRINT("%p Append trace activity [%lu, %lu)\n", td, start, end); } } diff --git a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c index a4e93c46ac..0b9ddea71a 100644 --- a/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c +++ b/src/tool/hpcrun/gpu/instrumentation/gtpin-instrumentation.c @@ -243,7 +243,6 @@ writeBinary } } - static size_t computeHash ( @@ -264,7 +263,6 @@ computeHash return used; } - static void computeBinaryHash ( diff --git a/src/tool/hpcrun/gpu/nvidia/cuda-api.c b/src/tool/hpcrun/gpu/nvidia/cuda-api.c index 8240f9cab5..30a5dc035a 100644 --- a/src/tool/hpcrun/gpu/nvidia/cuda-api.c +++ b/src/tool/hpcrun/gpu/nvidia/cuda-api.c @@ -185,6 +185,7 @@ CUDA_RUNTIME_FN // private operations //****************************************************************************** + int cuda_bind ( @@ -291,6 +292,7 @@ cuda_runtime_version // interface operations //****************************************************************************** + int cuda_context ( diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c b/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c index 170f35d20a..406b043747 100644 --- a/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c +++ b/src/tool/hpcrun/gpu/nvidia/cupti-activity-translate.c @@ -587,7 +587,7 @@ cupti_activity_translate case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: convert_pcsampling_record_info - (ga, (CUpti_ActivityPCSamplingRecordInfo *)activity); + (ga, 
(CUpti_ActivityPCSamplingRecordInfo *)activity); break; case CUPTI_ACTIVITY_KIND_MEMCPY2: diff --git a/src/tool/hpcrun/gpu/nvidia/cupti-api.c b/src/tool/hpcrun/gpu/nvidia/cupti-api.c index 8358ab161b..0cccbe4122 100644 --- a/src/tool/hpcrun/gpu/nvidia/cupti-api.c +++ b/src/tool/hpcrun/gpu/nvidia/cupti-api.c @@ -162,6 +162,7 @@ flush_alarm_handler(int sig, siginfo_t* siginfo, void* context) #include #include // hpcrun_force_dlopen #include +#include #include #include @@ -177,20 +178,31 @@ flush_alarm_handler(int sig, siginfo_t* siginfo, void* context) #include +#include + #include "cuda-api.h" #include "cupti-api.h" #include "cupti-gpu-api.h" #include "cubin-hash-map.h" #include "cubin-id-map.h" +#include "tool_state.h" + +//#include "sample_sources_all.h" //****************************************************************************** // macros //****************************************************************************** -#define CUPTI_LIBRARY_LOCATION "lib64/libcupti.so" -#define CUPTI_PATH_FROM_CUDA "extras/CUPTI/" + +#define DEBUG 0 +#include + + +#define CUPTI_LIBRARY_LOCATION "/lib64/libcupti.so" +#define CUPTI_PATH_FROM_CUDA "extras/CUPTI" + #define HPCRUN_CUPTI_ACTIVITY_BUFFER_SIZE (16 * 1024 * 1024) #define HPCRUN_CUPTI_ACTIVITY_BUFFER_ALIGNMENT (8) @@ -853,6 +865,23 @@ ensure_kernel_ip_present } +static void +cupti_gpu_monitors_apply_enter(cct_node_t *cct_node) +{ + cupti_correlation_id_push(IGNORE_CORR_ID); + gpu_monitors_apply( cct_node, gpu_monitor_type_enter); + cupti_correlation_id_pop(); +} + + +static void +cupti_gpu_monitors_apply_exit() +{ + cupti_correlation_id_push(IGNORE_CORR_ID); + gpu_monitors_apply( NULL, gpu_monitor_type_exit); + cupti_correlation_id_pop(); +} + static void cupti_subscriber_callback ( @@ -862,6 +891,11 @@ cupti_subscriber_callback const void *cb_info ) { + + if (is_tool_active()) { + return; + } + if (domain == CUPTI_CB_DOMAIN_RESOURCE) { const CUpti_ResourceData *rd = (const CUpti_ResourceData *) cb_info; if (cb_id == 
CUPTI_CBID_RESOURCE_MODULE_LOADED) { @@ -889,6 +923,7 @@ cupti_subscriber_callback cupti_stop_flag_set(); const CUpti_CallbackData *cd = (const CUpti_CallbackData *) cb_info; + PRINT("\nDriver API: -----------------%s\n", cd->functionName ); bool ompt_runtime_api_flag = ompt_runtime_status_get(); @@ -1043,11 +1078,15 @@ cupti_subscriber_callback default: break; } - bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags, - gpu_placeholder_type_kernel); + + bool is_kernel_op = gpu_op_placeholder_flags_is_set(gpu_op_placeholder_flags,gpu_placeholder_type_kernel); + +// PRINT("DRIVER: is_valid_op = %d \t is_kernel = %d \t cupti_runtime_api_flag = %d \t ompt_runtime_api_flag = %d | callback_site = %d\n", +// is_valid_op, is_kernel_op, cupti_runtime_api_flag, ompt_runtime_api_flag, cd->callbackSite); + // If we have a valid operation and is not in the interval of a cuda/ompt runtime api if (is_valid_op && !cupti_runtime_api_flag && !ompt_runtime_api_flag) { - if (cd->callbackSite == CUPTI_API_ENTER) { + if (cd->callbackSite == CUPTI_API_ENTER) { // A driver API cannot be implemented by other driver APIs, so we get an id // and unwind when the API is entered @@ -1073,19 +1112,26 @@ cupti_subscriber_callback ensure_kernel_ip_present(trace_ph, kernel_ip); } + hpcrun_safe_exit(); // Generate notification entry uint64_t cpu_submit_time = hpcrun_nanotime(); - gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, + + + cupti_gpu_monitors_apply_enter(api_node); + + gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time); TMSG(CUPTI_TRACE, "Driver push externalId %lu (cb_id = %u)", correlation_id, cb_id); } else if (cd->callbackSite == CUPTI_API_EXIT) { + cupti_gpu_monitors_apply_exit(); + uint64_t correlation_id __attribute__((unused)); // not used if PRINT omitted correlation_id = cupti_correlation_id_pop(); TMSG(CUPTI_TRACE, "Driver pop externalId %lu (cb_id = %u)", correlation_id, cb_id); - } + } } else if (is_kernel_op && 
cupti_runtime_api_flag && cd->callbackSite == CUPTI_API_ENTER) { if (cupti_kernel_ph != NULL) { @@ -1106,6 +1152,7 @@ cupti_subscriber_callback cupti_stop_flag_set(); const CUpti_CallbackData *cd = (const CUpti_CallbackData *)cb_info; + PRINT("\nRuntime API: -----------------%s\n", cd->functionName ); bool is_valid_op = false; bool is_kernel_op __attribute__((unused)) = false; // used only by PRINT when debugging @@ -1200,12 +1247,17 @@ cupti_subscriber_callback default: break; } + +// PRINT("RUNTIME: is_valid_op = %d \t is_kernel = %d \t cupti_runtime_api_flag = %d \t ompt_runtime_api_flag = %d | callback_site = %d\n", +// is_valid_op, is_kernel_op, cupti_runtime_api_flag, ompt_runtime_status_get(), cd->callbackSite); + if (is_valid_op) { if (cd->callbackSite == CUPTI_API_ENTER) { // Enter a CUDA runtime api cupti_runtime_api_flag_set(); uint64_t correlation_id = gpu_correlation_id(); cupti_correlation_id_push(correlation_id); + // We should make notification records in the api enter callback. // A runtime API must be implemented by driver APIs. 
// Though unlikely in most cases, @@ -1226,11 +1278,16 @@ cupti_subscriber_callback // Generate notification entry uint64_t cpu_submit_time = hpcrun_nanotime(); - gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, + + cupti_gpu_monitors_apply_enter(cupti_kernel_ph); + + gpu_correlation_channel_produce(correlation_id, &gpu_op_ccts, cpu_submit_time); TMSG(CUPTI_TRACE, "Runtime push externalId %lu (cb_id = %u)", correlation_id, cb_id); } else if (cd->callbackSite == CUPTI_API_EXIT) { + + cupti_gpu_monitors_apply_exit(); // Exit an CUDA runtime api cupti_runtime_api_flag_unset(); @@ -1348,7 +1405,7 @@ cupti_buffer_completion_callback do { status = cupti_buffer_cursor_advance(buffer, validSize, &cupti_activity); if (status) { - cupti_activity_process(cupti_activity); + cupti_activity_process(cupti_activity); ++processed; } } while (status); diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c new file mode 100644 index 0000000000..4f89e2c37c --- /dev/null +++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.c @@ -0,0 +1,309 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2021, Rice University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. +// +// ******************************************************* EndRiceCopyright * + + +//****************************************************************************** +// Description: +// Read fields from a ompt_record_ompt_t and assign to a +// GPU-independent gpu_activity_t. +// +// This interface is only used by the CUPTI GPU monitoring thread. +// It is thread-safe as long as it does not access details structures +// shared by worker threads. 
+//****************************************************************************** + +//****************************************************************************** +// local includes +//****************************************************************************** + +#include +#include +#include +#include +#include +#include +#include + + +#include "ompt-activity-translate.h" + + +//****************************************************************************** +// macros +//****************************************************************************** + + + + +//****************************************************************************** +// private operations +//****************************************************************************** + +static void +convert_unknown +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + ga->kind = GPU_ACTIVITY_UNKNOWN; + *cid_ptr = 0; +} + + +static void +convert_ptrop +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + ga->kind = GPU_ACTIVITY_UNKNOWN; + *cid_ptr = 0; +} + + +static void +convert_target +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + ompt_record_target_t *t __attribute__((unused)) = &r->record.target; + + ga->kind = GPU_ACTIVITY_UNKNOWN; + *cid_ptr = 0; +} + + +static void +convert_memory +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + gpu_mem_op_t mem_op, + uint64_t *cid_ptr +) +{ + ompt_record_target_data_op_t *d = &r->record.target_data_op; + + ga->kind = GPU_ACTIVITY_MEMORY; + ga->details.memory.memKind = GPU_MEM_UNKNOWN; + ga->details.memory.correlation_id = d->host_op_id; + ga->details.memory.mem_op = mem_op; + *cid_ptr = d->host_op_id; + + ga->details.memory.bytes = d->bytes; +} + + +static void +convert_alloc +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + convert_memory(ga, r, GPU_MEM_OP_ALLOC, cid_ptr); +} + + +static void +convert_delete +( + gpu_activity_t *ga, + ompt_record_ompt_t 
*r, + uint64_t *cid_ptr +) +{ + convert_memory(ga, r, GPU_MEM_OP_DELETE, cid_ptr); +} + + +static gpu_memcpy_type_t +convert_memcpy_type +( + ompt_target_data_op_t kind +) +{ + switch (kind) { + case ompt_target_data_transfer_to_device_async: + case ompt_target_data_transfer_to_device: + return GPU_MEMCPY_H2D; + + case ompt_target_data_transfer_from_device_async: + case ompt_target_data_transfer_from_device: + return GPU_MEMCPY_D2H; + + default: + return GPU_MEMCPY_UNK; + } +} + + +static void +convert_memcpy +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + ompt_record_target_data_op_t *d = &r->record.target_data_op; + + ga->kind = GPU_ACTIVITY_MEMCPY; + + ga->details.memcpy.correlation_id = d->host_op_id; + *cid_ptr = d->host_op_id; + + ga->details.memcpy.bytes = d->bytes; + ga->details.memcpy.copyKind = convert_memcpy_type(d->optype); +} + + +static void +convert_target_data_op +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + ompt_record_target_data_op_t *d = &r->record.target_data_op; + + switch(d->optype) { + + case ompt_target_data_transfer_to_device: + case ompt_target_data_transfer_from_device: + convert_memcpy(ga, r, cid_ptr); + break; + + case ompt_target_data_alloc_async: + case ompt_target_data_alloc: + convert_alloc(ga, r, cid_ptr); + break; + + case ompt_target_data_delete_async: + case ompt_target_data_delete: + convert_delete(ga, r, cid_ptr); + break; + + case ompt_target_data_associate: + case ompt_target_data_disassociate: + convert_ptrop(ga, r, cid_ptr); + break; + + default: + convert_unknown(ga, r, cid_ptr); + break; + } + + gpu_interval_set(&ga->details.interval, r->time, d->end_time); +} + + +void +convert_target_submit +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + ompt_record_target_kernel_t *k = &r->record.target_kernel; + + ga->kind = GPU_ACTIVITY_KERNEL; + ga->details.kernel.correlation_id = k->host_op_id; + *cid_ptr = k->host_op_id; + + 
gpu_interval_set(&ga->details.interval, r->time, k->end_time); +} + + + +//****************************************************************************** +// interface operations +//****************************************************************************** + +void +ompt_activity_translate +( + gpu_activity_t *ga, + ompt_record_ompt_t *r, + uint64_t *cid_ptr +) +{ + memset(ga, 0, sizeof(gpu_activity_t)); + switch (r->type) { + + case ompt_callback_target: + case ompt_callback_target_emi: + + convert_target(ga,r, cid_ptr); + break; + + case ompt_callback_target_data_op: + case ompt_callback_target_data_op_emi: + + convert_target_data_op(ga,r, cid_ptr); + break; + + case ompt_callback_target_submit: + case ompt_callback_target_submit_emi: + + convert_target_submit(ga,r, cid_ptr); + break; + + default: + convert_unknown(ga, r, cid_ptr); + break; + } + + + cstack_ptr_set(&(ga->next), 0); +} diff --git a/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h new file mode 100644 index 0000000000..30dedb5c01 --- /dev/null +++ b/src/tool/hpcrun/gpu/ompt/ompt-activity-translate.h @@ -0,0 +1,79 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2021, Rice University +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. 
+// +// ******************************************************* EndRiceCopyright * + +#ifndef ompt_activity_translate_h +#define ompt_activity_translate_h + + +//****************************************************************************** +// OpenMP includes +//****************************************************************************** + +#include + + + +//****************************************************************************** +// type declarations +//****************************************************************************** + +typedef struct gpu_activity_t gpu_activity_t; +typedef struct cct_node_t cct_node_t; + + + +//****************************************************************************** +// interface operations +//****************************************************************************** + +void +ompt_activity_translate +( + gpu_activity_t *entry, + ompt_record_ompt_t *record, + uint64_t *cid_ptr +); + + + +#endif diff --git a/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c new file mode 100644 index 0000000000..e1dae062ec --- /dev/null +++ b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.c @@ -0,0 +1,86 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2021, Rice University +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. 
+// +// ******************************************************* EndRiceCopyright * + +//****************************************************************************** +// local includes +//****************************************************************************** + +#include + +#include +#include +#include + +#include "ompt-gpu-api.h" +#include "ompt-activity-translate.h" + + + +//****************************************************************************** +// interface operations +//****************************************************************************** + +void +ompt_buffer_completion_notify +( + void +) +{ + gpu_monitoring_thread_activities_ready(); +} + + +void +ompt_activity_process +( + ompt_record_ompt_t *record +) +{ + gpu_activity_t gpu_activity; + uint64_t correlation_id; + ompt_activity_translate(&gpu_activity, record, &correlation_id); + if (gpu_correlation_id_map_lookup(correlation_id) == NULL) { + gpu_correlation_id_map_insert(correlation_id, correlation_id); + } + gpu_activity_process(&gpu_activity); +} diff --git a/src/tool/hpcrun/gpu/amd/rocm-debug-api.h b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h similarity index 85% rename from src/tool/hpcrun/gpu/amd/rocm-debug-api.h rename to src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h index 9ffacea2a8..cca8cee7af 100644 --- a/src/tool/hpcrun/gpu/amd/rocm-debug-api.h +++ b/src/tool/hpcrun/gpu/ompt/ompt-gpu-api.h @@ -9,7 +9,7 @@ // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. // -------------------------------------------------------------------------- // -// Copyright ((c)) 2002-2022, Rice University +// Copyright ((c)) 2002-2021, Rice University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -41,41 +41,36 @@ // // ******************************************************* EndRiceCopyright * -#ifndef rocm_debug_api_h -#define rocm_debug_api_h +#ifndef ompt_gpu_api_h +#define ompt_gpu_api_h + + //****************************************************************************** -// interface operations +// OpenMP includes //****************************************************************************** -int -rocm_debug_api_bind -( - void -); +#include + -void -rocm_debug_api_init -( - void -); + +//****************************************************************************** +// interface operations +//****************************************************************************** void -rocm_debug_api_fini +ompt_buffer_completion_notify ( - void + void ); + void -rocm_debug_api_query_code_object +ompt_activity_process ( - size_t* code_obejct_count_ptr + ompt_record_ompt_t *record ); -char* -rocm_debug_api_query_uri -( - size_t code_object_index -); + #endif diff --git a/src/tool/hpcrun/gpu/opencl/opencl-api.c b/src/tool/hpcrun/gpu/opencl/opencl-api.c index 91df87455a..98689be8e7 100644 --- a/src/tool/hpcrun/gpu/opencl/opencl-api.c +++ b/src/tool/hpcrun/gpu/opencl/opencl-api.c @@ -476,10 +476,10 @@ opencl_operation_multiplexer_push gpu_activity.details.correlation.host_correlation_id = correlation_id; gpu_operation_multiplexer_push(obj->details.initiator_channel, NULL, &gpu_activity); - + // The actual entry opencl_activity_translate(&gpu_activity, obj, interval); - gpu_operation_multiplexer_push(obj->details.initiator_channel, + gpu_operation_multiplexer_push(obj->details.initiator_channel, obj->pending_operations, &gpu_activity); } @@ -1250,6 +1250,441 @@ opencl_api_thread_finalize } +cl_program +clCreateProgramWithSource +( + cl_context context, + cl_uint count, + const char** strings, + const size_t* lengths, + cl_int* errcode_ret +) +{ + ETMSG(OPENCL, "inside 
clCreateProgramWithSource_wrapper"); + +#if 0 + if (strings != NULL && lengths != NULL) { + FILE *f_ptr; + for (int i = 0; i < (int)count; i++) { + // what if a single file has multiple kernels? + // we need to add logic to get filenames by reading the strings contents + char fileno = '0' + (i + 1); // right now we are naming the files as index numbers + + // TO-DO: AARON using malloc instead of hpcrun_malloc gives extra garbage characters in file name + char *filename = (char *)hpcrun_malloc(sizeof(fileno) + 1); + *filename = fileno + '\0'; + f_ptr = fopen(filename, "w"); + fwrite(strings[i], lengths[i], 1, f_ptr); + } + fclose(f_ptr); + } +#endif + + return HPCRUN_OPENCL_CALL(clCreateProgramWithSource, (context, count, strings, lengths, errcode_ret)); +} + +#ifdef OPT_ENABLE_IGC +// one downside of this appproach is that we may override the callback provided by user +cl_int +clBuildProgram +( + cl_program program, + cl_uint num_devices, + const cl_device_id* device_list, + const char* options, + void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), + void* user_data +) +{ + ETMSG(OPENCL, "inside clBuildProgram_wrapper"); + // XXX(Aaron): Caution, what's the maximum length of options? + int len_options = options == NULL ? 
0 : strlen(options); + int len_flag = strlen(LINE_TABLE_FLAG); + char *options_with_debug_flags = (char *)malloc((len_options + len_flag + 1) * sizeof(char)); + memset(options_with_debug_flags, 0, (len_options + len_flag + 1)); + if (len_options != 0) { + strncat(options_with_debug_flags, options, len_options); + } + strcat(options_with_debug_flags, LINE_TABLE_FLAG); + cl_int ret = HPCRUN_OPENCL_CALL(clBuildProgram, (program, num_devices, device_list, options_with_debug_flags, clBuildProgramCallback, user_data)); + free(options_with_debug_flags); + return ret; +} +#endif // OPT_ENABLE_IGC + + +cl_command_queue +clCreateCommandQueue +( + cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int *errcode_ret +) +{ + // enabling profiling + properties |= (cl_command_queue_properties)CL_QUEUE_PROFILING_ENABLE; + + cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueue, (context, device, + properties,errcode_ret)); + + uint32_t context_id = opencl_cl_context_map_update((uint64_t)context); + opencl_cl_queue_map_update((uint64_t)queue, context_id); + + return queue; +} + + +cl_command_queue +clCreateCommandQueueWithProperties +( + cl_context context, + cl_device_id device, + const cl_queue_properties* properties, + cl_int* errcode_ret +) +{ + cl_queue_properties *queue_properties = (cl_queue_properties *)properties; + if (properties == NULL) { + queue_properties = (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * 3); + queue_properties[0] = CL_QUEUE_PROPERTIES; + queue_properties[1] = CL_QUEUE_PROFILING_ENABLE; + queue_properties[2] = 0; + } else { + int queue_props_id = -1; + int props_count = 0; + while (properties[props_count] != 0) { + if (properties[props_count] == CL_QUEUE_PROPERTIES) { + queue_props_id = props_count; + ++props_count; + } else if (properties[props_count] == CL_QUEUE_SIZE) { + ++props_count; + } + ++props_count; + } + + if (queue_props_id >= 0 && queue_props_id + 1 < props_count) { + 
queue_properties = (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * (props_count + 1)); + for (int i = 0; i < props_count; ++i) { + queue_properties[i] = properties[i]; + } + // We do have a queue property entry, just enable profiling + queue_properties[queue_props_id + 1] |= CL_QUEUE_PROFILING_ENABLE; + queue_properties[props_count] = 0; + } else { + // We do not have a queue property entry, need to allocate a queue property entry and set up + queue_properties = (cl_queue_properties *)malloc(sizeof(cl_queue_properties) * (props_count + 3)); + for (int i = 0; i < props_count; ++i) { + queue_properties[i] = properties[i]; + } + queue_properties[props_count] = CL_QUEUE_PROPERTIES; + queue_properties[props_count + 1] = CL_QUEUE_PROFILING_ENABLE; + queue_properties[props_count + 2] = 0; + } + } + cl_command_queue queue = HPCRUN_OPENCL_CALL(clCreateCommandQueueWithProperties, (context, device, queue_properties, errcode_ret)); + if (queue_properties != NULL) { + // The property is created by us + free(queue_properties); + } + + uint32_t context_id = opencl_cl_context_map_update((uint64_t)context); + opencl_cl_queue_map_update((uint64_t)queue, context_id); + return queue; +} + + +cl_int +clEnqueueNDRangeKernel +( + cl_command_queue command_queue, + cl_kernel ocl_kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event +) +{ + opencl_object_t *kernel_info = opencl_malloc_kind(GPU_ACTIVITY_KERNEL); + INITIALIZE_CALLBACK_INFO(initializeKernelCallBackInfo, kernel_info, (kernel_info, command_queue)) + + opencl_subscriber_callback(kernel_info); + + cl_event *eventp = NULL; + SET_EVENT_POINTER(eventp, event, kernel_info) + + cl_int return_status = + HPCRUN_OPENCL_CALL(clEnqueueNDRangeKernel, (command_queue, ocl_kernel, work_dim, + global_work_offset, global_work_size, local_work_size, + 
num_events_in_wait_list, event_wait_list, eventp)); + + ETMSG(OPENCL, "Registering callback for kind: Kernel. " + "Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id); + + clSetEventCallback_wrapper(*eventp, CL_COMPLETE, + &opencl_activity_completion_callback, kernel_info); + return return_status; +} + + +// this is a simplified version of clEnqueueNDRangeKernel, TODO: check if code duplication can be avoided +cl_int +clEnqueueTask +( + cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event +) +{ + opencl_object_t *kernel_info = opencl_malloc_kind(GPU_ACTIVITY_KERNEL); + INITIALIZE_CALLBACK_INFO(initializeKernelCallBackInfo, kernel_info, (kernel_info, command_queue)) + + opencl_subscriber_callback(kernel_info); + + cl_event *eventp = NULL; + SET_EVENT_POINTER(eventp, event, kernel_info); + + cl_int return_status = + HPCRUN_OPENCL_CALL(clEnqueueTask, (command_queue, kernel, + num_events_in_wait_list, event_wait_list, eventp)); + + ETMSG(OPENCL, "Registering callback for kind: Kernel. 
" + "Correlation id: %"PRIu64 "", kernel_info->details.ker_cb.correlation_id); + + clSetEventCallback_wrapper(*eventp, CL_COMPLETE, + &opencl_activity_completion_callback, kernel_info); + return return_status; +} + + +cl_int +clEnqueueReadBuffer +( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t cb, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event +) +{ + ETMSG(OPENCL, "inside clEnqueueReadBuffer wrapper"); + + opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY); + INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_D2H, cb, command_queue)) + + opencl_subscriber_callback(cpy_info); + + cl_event *eventp = NULL; + SET_EVENT_POINTER(eventp, event, cpy_info); + + cl_int return_status = + HPCRUN_OPENCL_CALL(clEnqueueReadBuffer, + (command_queue, buffer, blocking_read, offset, + cb, ptr, num_events_in_wait_list, event_wait_list, eventp)); + + ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. " + "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id); + ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", + (long)cb); + + + clSetEventCallback_wrapper(*eventp, CL_COMPLETE, + &opencl_activity_completion_callback, cpy_info); + + return return_status; +} + + +cl_int +clEnqueueWriteBuffer +( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t cb, + const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event +) +{ + ETMSG(OPENCL, "inside clEnqueueWriteBuffer wrapper. 
cl_mem buffer: %p", buffer); + opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY); + INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_H2D, cb, command_queue)) + + opencl_subscriber_callback(cpy_info); + + cl_event *eventp = NULL; + SET_EVENT_POINTER(eventp, event, cpy_info); + + cl_int return_status = + HPCRUN_OPENCL_CALL(clEnqueueWriteBuffer, + (command_queue, buffer, blocking_write, offset, cb, ptr, + num_events_in_wait_list, event_wait_list, eventp)); + + ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. " + "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id); + ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", + (long)cb); + + clSetEventCallback_wrapper(*eventp, CL_COMPLETE, + &opencl_activity_completion_callback, + (void*) cpy_info); + + return return_status; +} + + +void* +clEnqueueMapBuffer +( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event, + cl_int* errcode_ret +) +{ + ETMSG(OPENCL, "inside clEnqueueMapBuffer wrapper"); + + opencl_object_t *cpy_info = opencl_malloc_kind(GPU_ACTIVITY_MEMCPY); + if (map_flags == CL_MAP_READ) { + INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_D2H, size, command_queue)); + } else { + //map_flags == CL_MAP_WRITE || map_flags == CL_MAP_WRITE_INVALIDATE_REGION + INITIALIZE_CALLBACK_INFO(initializeMemcpyCallBackInfo, cpy_info, (cpy_info, GPU_MEMCPY_H2D, size, command_queue)); + } + + opencl_subscriber_callback(cpy_info); + + cl_event *eventp = NULL; + SET_EVENT_POINTER(eventp, event, cpy_info); + + void *map_ptr = + HPCRUN_OPENCL_CALL(clEnqueueMapBuffer, + (command_queue, buffer, blocking_map, map_flags, offset, + size, num_events_in_wait_list, event_wait_list, eventp, errcode_ret)); + + if (map_flags == 
CL_MAP_READ) { + ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: D2H. " + "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id); + ETMSG(OPENCL, "%d(bytes) of data being transferred from device to host", + (long)size); + } else { + ETMSG(OPENCL, "Registering callback for kind MEMCPY, type: H2D. " + "Correlation id: %"PRIu64 "", cpy_info->details.cpy_cb.correlation_id); + ETMSG(OPENCL, "%d(bytes) of data being transferred from host to device", + (long)size); + } + + clSetEventCallback_wrapper(*eventp, CL_COMPLETE, + &opencl_activity_completion_callback, cpy_info); + + return map_ptr; +} + + +cl_mem +clCreateBuffer +( + cl_context context, + cl_mem_flags flags, + size_t size, + void* host_ptr, + cl_int* errcode_ret +) +{ + ETMSG(OPENCL, "clCreateBuffer flags: %u, size: %"PRIu64 "", flags, size); + + opencl_object_t *mem_info = opencl_malloc_kind(GPU_ACTIVITY_MEMORY); + INITIALIZE_CALLBACK_INFO(initializeMemoryCallBackInfo, mem_info, (mem_info, flags, size)) + + opencl_subscriber_callback(mem_info); + + gpu_interval_t interval; + interval.start = CPU_NANOTIME(); + cl_mem buffer = + HPCRUN_OPENCL_CALL(clCreateBuffer, (context, flags, size, host_ptr, errcode_ret)); + interval.end = CPU_NANOTIME(); + + opencl_operation_multiplexer_push(interval, mem_info, mem_info->details.mem_cb.correlation_id); + + opencl_free(mem_info); + + return buffer; +} + + +cl_int +clSetKernelArg +( + cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void* arg_value +) +{ + return HPCRUN_OPENCL_CALL(clSetKernelArg, (kernel, arg_index, arg_size, arg_value)); +} + + +void +opencl_instrumentation_enable +( + void +) +{ + instrumentation = true; +} + + +void +opencl_api_thread_finalize +( + void *args +) +{ + if (opencl_api_flag) { + // If I have invoked any opencl api, I have to attribute all my activities to my ccts + opencl_api_flag = false; + + atomic_bool wait; + atomic_store(&wait, true); + gpu_activity_t gpu_activity; + memset(&gpu_activity, 0, 
sizeof(gpu_activity_t)); + + gpu_activity.kind = GPU_ACTIVITY_FLUSH; + gpu_activity.details.flush.wait = &wait; + gpu_operation_multiplexer_push(gpu_activity_channel_get(), NULL, &gpu_activity); + + // Wait until operations are drained + // Operation channel is FIFO + while (atomic_load(&wait)) {} + + // Wait until my activities are drained + opencl_wait_for_self_pending_operations(); + + // Now I can attribute activities + gpu_application_thread_process_activities(); + } +} + + void opencl_api_process_finalize ( diff --git a/src/tool/hpcrun/loadmap.c b/src/tool/hpcrun/loadmap.c index 6491d70029..a3b2d463f6 100644 --- a/src/tool/hpcrun/loadmap.c +++ b/src/tool/hpcrun/loadmap.c @@ -145,7 +145,7 @@ hpcrun_dso_make(const char* name, void** table, TMSG(DSO," hpcrun_dso_make for module %s", name); int namelen = strlen(name) + 1; - x->name = (char*) hpcrun_malloc(namelen); + x->name = (char*) malloc(namelen); strcpy(x->name, name); x->table = table; diff --git a/src/tool/hpcrun/main.c b/src/tool/hpcrun/main.c index ef3f54720a..b229e51531 100644 --- a/src/tool/hpcrun/main.c +++ b/src/tool/hpcrun/main.c @@ -78,6 +78,7 @@ #include #include +#include #include "main.h" @@ -221,7 +222,6 @@ bool hpcrun_no_unwind = false; *****************************************************************************/ static __thread bool hpcrun_thread_suppress_sample = true; - //*************************************************************************** // local variables //*************************************************************************** @@ -244,6 +244,9 @@ static hpcrun_aux_cleanup_t * hpcrun_aux_cleanup_free_list_head = NULL; static char execname[PATH_MAX] = {'\0'}; static int monitor_fini_process_how = 0; +static atomic_int ms_init_started = ATOMIC_VAR_INIT(0); +static atomic_int ms_init_completed = ATOMIC_VAR_INIT(0); + //*************************************************************************** // Interface functions for suppressing samples @@ -422,7 +425,7 @@ 
abort_timeout_handler(int sig, siginfo_t* siginfo, void* context) static void hpcrun_set_abort_timeout() { - static process_index = 0; + static int process_index = 0; char *abort_timeout = getenv("HPCRUN_ABORT_TIMEOUT"); @@ -799,12 +802,13 @@ hpcrun_thread_init(int id, local_thread_data_t* local_thread_data, bool has_trac epoch_t* epoch = TD_GET(core_profile_trace_data.epoch); - if (! hpcrun_thread_suppress_sample) { - // handle event sets for sample sources - SAMPLE_SOURCES(gen_event_set,lush_metrics); + if (! hpcrun_thread_suppress_sample ) { // sample sources take thread specific action prior to start (often is a 'registration' action); SAMPLE_SOURCES(thread_init_action); + // handle event sets for sample sources + SAMPLE_SOURCES(gen_event_set,lush_metrics); + // start the sample sources SAMPLE_SOURCES(start); @@ -889,19 +893,19 @@ hpcrun_wait() //*************************************************************************** -// process control (via libmonitor) +// hpcrun initialization ( process control via libmonitor) //*************************************************************************** +static +void hpcrun_prepare_measurement_subsystem(bool is_child); + void* monitor_init_process(int *argc, char **argv, void* data) { - char* process_name; + const char* process_name; hpcrun_thread_suppress_sample = false; - fork_data_t* fork_data = (fork_data_t*) data; - bool is_child = data && fork_data->is_child; - hpcrun_wait(); #ifndef HPCRUN_STATIC_LINK @@ -925,6 +929,8 @@ monitor_init_process(int *argc, char **argv, void* data) copy_execname(process_name); hpcrun_files_set_executable(process_name); + TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name); + // We initialize the load map and fnbounds before registering sample source. 
// This is because sample source init (such as PAPI) may dlopen other libraries, // which will trigger our library monitoring code and fnbound queries @@ -938,6 +944,10 @@ monitor_init_process(int *argc, char **argv, void* data) // We need to initialize messages related functions and set up measurement directory, // so that we can write vdso and prevent fnbounds print messages to the terminal. messages_init(); + + fork_data_t* fork_data = (fork_data_t*) data; + bool is_child = data && fork_data->is_child; + if (!hpcrun_get_disabled()) { hpcrun_files_set_directory(); messages_logfile_create(); @@ -960,53 +970,73 @@ monitor_init_process(int *argc, char **argv, void* data) auditor_exports->mainlib_connected(get_saved_vdso_path()); #endif } + + if (is_child){ + hpcrun_prepare_measurement_subsystem(is_child); + } - hpcrun_registered_sources_init(); + return data; +} - hpcrun_do_custom_init(); +void +monitor_at_main() +{ + bool is_child = false; + hpcrun_prepare_measurement_subsystem(is_child); +} - // for debugging, limit the life of the execution with an alarm. - char* life = getenv("HPCRUN_LIFETIME"); - if (life != NULL){ - int seconds = atoi(life); - if (seconds > 0) alarm((unsigned int) seconds); - } - // see if unwinding has been turned off - // the same setting governs whether or not fnbounds is needed or used. - hpcrun_no_unwind = hpcrun_get_env_bool("HPCRUN_NO_UNWIND"); +static +void hpcrun_prepare_measurement_subsystem(bool is_child) +{ + if (atomic_fetch_add(&ms_init_started, 1) == 0){ + hpcrun_registered_sources_init(); - char* s = getenv(HPCRUN_EVENT_LIST); + hpcrun_do_custom_init(); - if (! is_child) { - hpcrun_sample_sources_from_eventlist(s); - } + // for debugging, limit the life of the execution with an alarm. 
+ char* life = getenv("HPCRUN_LIFETIME"); + if (life != NULL){ + int seconds = atoi(life); + if (seconds > 0) alarm((unsigned int) seconds); + } - hpcrun_set_abort_timeout(); + // see if unwinding has been turned off + // the same setting governs whether or not fnbounds is needed or used. + hpcrun_no_unwind = hpcrun_get_env_bool("HPCRUN_NO_UNWIND"); - hpcrun_process_sample_source_none(); + char* s = getenv(HPCRUN_EVENT_LIST); - TMSG(PROCESS,"hpcrun_files_set_executable called w process name = %s", process_name); + if (! is_child) { + hpcrun_sample_sources_from_eventlist(s); + } - TMSG(PROCESS,"init"); + hpcrun_set_abort_timeout(); + hpcrun_process_sample_source_none(); - hpcrun_sample_prob_mesg(); + TMSG(PROCESS,"hpcrun outer initialization"); - TMSG(PROCESS, "I am a %s process", is_child ? "child" : "parent"); + hpcrun_sample_prob_mesg(); - hpcrun_init_internal(is_child); + TMSG(PROCESS, "I am a %s process parent"); - if (ENABLED(TST)){ - EEMSG("TST debug ctl is active!"); - STDERR_MSG("Std Err message appears"); - } + hpcrun_init_internal(is_child); + if (ENABLED(TST)){ + EEMSG("TST debug ctl is active!"); + STDERR_MSG("Std Err message appears"); + } - hpcrun_safe_exit(); + hpcrun_safe_exit(); - return data; + atomic_store(&ms_init_completed, 1); + + }else{ + while(! 
atomic_load(&ms_init_completed)); + } + } @@ -1166,6 +1196,7 @@ monitor_init_thread_support(void) hpcrun_safe_exit(); } + void* monitor_thread_pre_create(void) { @@ -1182,7 +1213,11 @@ monitor_thread_pre_create(void) if (module_ignore_map_inrange_lookup(thread_pre_create_address)) { return MONITOR_IGNORE_NEW_THREAD; } - + bool is_child = false; + // outer initialization + hpcrun_prepare_measurement_subsystem(is_child); + + hpcrun_safe_enter(); local_thread_data_t* rv = hpcrun_malloc(sizeof(local_thread_data_t)); diff --git a/src/tool/hpcrun/memory/mem.c b/src/tool/hpcrun/memory/mem.c index e996453277..7af6897a3c 100644 --- a/src/tool/hpcrun/memory/mem.c +++ b/src/tool/hpcrun/memory/mem.c @@ -94,6 +94,15 @@ static long total_non_freeable = 0; static int out_of_mem_mesg = 0; + +// --------------------------------------------------- +// hpcrun_malloc() memory thread local data structures +// --------------------------------------------------- +__thread hpcrun_meminfo_t memstore; +__thread int mem_low; + + + //------------------------------------------------------------------ // Internal functions //------------------------------------------------------------------ @@ -222,22 +231,12 @@ hpcrun_memory_reinit(void) // Allocate space and init a thread's memstore. // If failure, shutdown sampling and leave old memstore in place. void -hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child) +hpcrun_make_memstore(hpcrun_meminfo_t *mi) { void *addr; hpcrun_mem_init(); - // If in the child after fork(), then continue to use the parent's - // memstore if it looks ok, else mmap a new one. Note: we can't - // reset the memstore to empty unless we delete everything that was - // created via hpcrun_malloc() (cct, uw_recipe_map, ...). - if (is_child && mi->mi_start != NULL - && mi->mi_start <= mi->mi_low && mi->mi_low <= mi->mi_high - && mi->mi_high <= mi->mi_start + mi->mi_size) { - return; - } - addr = hpcrun_mmap_anon(memsize); if (addr == NULL) { if (! 
out_of_mem_mesg) { @@ -260,10 +259,10 @@ hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child) void hpcrun_reclaim_freeable_mem(void) { - hpcrun_meminfo_t *mi = &TD_GET(memstore); + hpcrun_meminfo_t *mi = &memstore; mi->mi_low = mi->mi_start; - TD_GET(mem_low) = 0; + mem_low = 0; num_reclaims++; TMSG(MALLOC, "%s: %d", __func__, num_reclaims); } @@ -283,7 +282,7 @@ hpcrun_malloc(size_t size) return NULL; } - mi = &TD_GET(memstore); + mi = &memstore; size = round_up(size); // For a large request that doesn't fit within the existing @@ -310,7 +309,7 @@ hpcrun_malloc(size_t size) || mi->mi_high - mi->mi_low < low_memsize || mi->mi_high - mi->mi_low < size) { if (allow_extra_mmap) { - hpcrun_make_memstore(mi, 0); + hpcrun_make_memstore(mi); } else { if (! out_of_mem_mesg) { EMSG("%s: out of memory, shutting down sampling", __func__); @@ -412,3 +411,11 @@ hpcrun_memory_summary(void) "malloc failures: %ld", total_freeable/meg, total_non_freeable/meg, num_failures); } + +int +get_mem_low( + void +) +{ + return mem_low; +} diff --git a/src/tool/hpcrun/memory/newmem.h b/src/tool/hpcrun/memory/newmem.h index 7fb1ed1211..90695c7322 100644 --- a/src/tool/hpcrun/memory/newmem.h +++ b/src/tool/hpcrun/memory/newmem.h @@ -63,6 +63,7 @@ struct hpcrun_meminfo { typedef struct hpcrun_meminfo hpcrun_meminfo_t; -void hpcrun_make_memstore(hpcrun_meminfo_t *mi, int is_child); +void hpcrun_make_memstore(hpcrun_meminfo_t *mi); +int get_mem_low(void); #endif diff --git a/src/tool/hpcrun/messages/debug-flag.c b/src/tool/hpcrun/messages/debug-flag.c index 2e9c60a15f..66397b2dae 100644 --- a/src/tool/hpcrun/messages/debug-flag.c +++ b/src/tool/hpcrun/messages/debug-flag.c @@ -159,7 +159,7 @@ static pmsg_category all_list_entries [] = { // E(CSP_MALLOC), // E(MEM__ALLOC), E(NORM_IP), - E(PARTIAL_UNW) + E(PARTIAL_UNW) }; diff --git a/src/tool/hpcrun/messages/messages.flag-defns b/src/tool/hpcrun/messages/messages.flag-defns index 8379ee5f71..8b2d321418 100644 --- 
a/src/tool/hpcrun/messages/messages.flag-defns +++ b/src/tool/hpcrun/messages/messages.flag-defns @@ -156,6 +156,7 @@ E(CUPTI_TRACE), E(CUDA_CUBIN), E(CUPTI_ACTIVITY), + E(ROCM), E(DATACENTRIC), E(IDLE), E(MAIN_BOUNDS), diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c index d9aa9d7a0a..d6388b58a7 100644 --- a/src/tool/hpcrun/metrics.c +++ b/src/tool/hpcrun/metrics.c @@ -366,6 +366,7 @@ hpcrun_set_new_metric_desc(kind_info_t *kind, const char* name, metric_desc_list_t* n = NULL; // if there are pre-allocated metrics, use them + // (default metrics - not alloc, added metrics - prealloc) if (pre_alloc) { n = pre_alloc; pre_alloc = pre_alloc->next; @@ -373,6 +374,7 @@ hpcrun_set_new_metric_desc(kind_info_t *kind, const char* name, else { n = (metric_desc_list_t*) hpcrun_malloc(sizeof(metric_desc_list_t)); } + // Add n into the list of metric description - kind->metric_data n->next = kind->metric_data; kind->metric_data = n; n->proc = upd_fn; diff --git a/src/tool/hpcrun/module-ignore-map.c b/src/tool/hpcrun/module-ignore-map.c index 56b9db6e7c..98a60630aa 100644 --- a/src/tool/hpcrun/module-ignore-map.c +++ b/src/tool/hpcrun/module-ignore-map.c @@ -108,7 +108,7 @@ // where any GPU can indicate that its functions should be added to // the module ignore map when that type of GPU is being monitored. 
-#define NUM_FNS 8 +#define NUM_FNS 9 @@ -127,16 +127,19 @@ typedef struct module_ignore_entry { // static data //*************************************************************************** -static const char *IGNORE_FNS[NUM_FNS] = { + +static const char *IGNORE_FNS[] = { "cuLaunchKernel", "cudaLaunchKernel", "cuptiActivityEnable", + "rocprofiler_iterate_info", "roctracer_set_properties", // amd roctracer library "amd_dbgapi_initialize", // amd debug library "hipKernelNameRefByPtr", // amd hip runtime - "hsa_queue_create", // amd hsa runtime + "hsa_init", // amd hsa runtime "hpcrun_malloc" // hpcrun library }; + static module_ignore_entry_t modules[NUM_FNS]; static pfq_rwlock_t modules_lock; @@ -250,7 +253,7 @@ module_ignore_map_lookup } int -serach_functions_in_module(Elf *e, GElf_Shdr* secHead, Elf_Scn *section) +search_functions_in_module(Elf *e, GElf_Shdr* secHead, Elf_Scn *section) { Elf_Data *data; char *symName; @@ -287,6 +290,8 @@ module_ignore_map_ignore load_module_t* lm ) { + if (lm == NULL) return false; + // Update path // Only one thread could update the flag, // Guarantee dlopen modules before notification are updated. 
@@ -332,7 +337,7 @@ module_ignore_map_ignore gelf_getshdr(scn, &secHead); // Only search .dynsym section if (secHead.sh_type != SHT_DYNSYM) continue; - int module_ignore_index = serach_functions_in_module(elf, &secHead, scn); + int module_ignore_index = search_functions_in_module(elf, &secHead, scn); if (module_ignore_index != -1) { modules[module_ignore_index].module = module; modules[module_ignore_index].empty = false; diff --git a/src/tool/hpcrun/ompt/omp-tools.h b/src/tool/hpcrun/ompt/omp-tools.h index 43788206d2..ffa406ab86 100644 --- a/src/tool/hpcrun/ompt/omp-tools.h +++ b/src/tool/hpcrun/ompt/omp-tools.h @@ -1,5 +1,5 @@ /* - * include/50/omp-tools.h.var + * include/omp-tools.h.var */ //===----------------------------------------------------------------------===// @@ -20,6 +20,16 @@ #include <stdint.h> #include <stddef.h> +#ifdef DEPRECATION_WARNINGS +# ifdef __cplusplus +# define DEPRECATED_51 [[deprecated("as of 5.1")]] +# else +# define DEPRECATED_51 __attribute__((deprecated("as of 5.1"))) +#endif +#else +#define DEPRECATED_51 +#endif + /***************************************************************************** * iteration macros *****************************************************************************/ @@ -133,7 +143,7 @@ \ macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \ \ - macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \ + macro (ompt_callback_masked, ompt_callback_masked_t, 21) /* task at masked begin or end */ \ \ macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \ \ @@ -153,7 +163,26 @@ \ macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \ \ - macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ + macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ \ + macro (ompt_callback_target_emi, ompt_callback_target_emi_t, 33) /* target */ \ + macro
(ompt_callback_target_data_op_emi,ompt_callback_target_data_op_emi_t,34) /* target data op */ \ + macro (ompt_callback_target_submit_emi, ompt_callback_target_submit_emi_t, 35) /* target submit */ \ + macro (ompt_callback_target_map_emi, ompt_callback_target_map_emi_t, 36) /* target map */ \ + macro (ompt_callback_error, ompt_callback_error_t, 37) /* error */ + +#define FOREACH_OMPT_TARGET_CALLBACK(macro) \ + macro(ompt_callback_device_initialize) \ + macro(ompt_callback_device_finalize) \ + macro(ompt_callback_device_load) \ + macro(ompt_callback_device_unload) \ + macro(ompt_callback_target) \ + macro(ompt_callback_target_map) \ + macro(ompt_callback_target_data_op) \ + macro(ompt_callback_target_submit) \ + macro(ompt_callback_target_data_op_emi) \ + macro(ompt_callback_target_emi) \ + macro(ompt_callback_target_map_emi) \ + macro(ompt_callback_target_submit_emi) /***************************************************************************** * implementation specific types @@ -190,7 +219,8 @@ typedef enum ompt_callbacks_t { ompt_callback_dependences = 18, ompt_callback_task_dependence = 19, ompt_callback_work = 20, - ompt_callback_master = 21, + ompt_callback_master DEPRECATED_51 = 21, + ompt_callback_masked = 21, ompt_callback_target_map = 22, ompt_callback_sync_region = 23, ompt_callback_lock_init = 24, @@ -201,7 +231,12 @@ typedef enum ompt_callbacks_t { ompt_callback_flush = 29, ompt_callback_cancel = 30, ompt_callback_reduction = 31, - ompt_callback_dispatch = 32 + ompt_callback_dispatch = 32, + ompt_callback_target_emi = 33, + ompt_callback_target_data_op_emi = 34, + ompt_callback_target_submit_emi = 35, + ompt_callback_target_map_emi = 36, + ompt_callback_error = 37 } ompt_callbacks_t; typedef enum ompt_record_t { @@ -239,7 +274,8 @@ typedef enum ompt_thread_t { typedef enum ompt_scope_endpoint_t { ompt_scope_begin = 1, - ompt_scope_end = 2 + ompt_scope_end = 2, + ompt_scope_beginend = 3 } ompt_scope_endpoint_t; typedef enum ompt_dispatch_t { @@ -248,22 
+284,29 @@ typedef enum ompt_dispatch_t { } ompt_dispatch_t; typedef enum ompt_sync_region_t { - ompt_sync_region_barrier = 1, - ompt_sync_region_barrier_implicit = 2, + ompt_sync_region_barrier DEPRECATED_51 = 1, + ompt_sync_region_barrier_implicit DEPRECATED_51 = 2, ompt_sync_region_barrier_explicit = 3, ompt_sync_region_barrier_implementation = 4, ompt_sync_region_taskwait = 5, ompt_sync_region_taskgroup = 6, - ompt_sync_region_reduction = 7 + ompt_sync_region_reduction = 7, + ompt_sync_region_barrier_implicit_workshare = 8, + ompt_sync_region_barrier_implicit_parallel = 9, + ompt_sync_region_barrier_teams = 10 } ompt_sync_region_t; typedef enum ompt_target_data_op_t { - ompt_target_data_alloc = 1, - ompt_target_data_transfer_to_device = 2, - ompt_target_data_transfer_from_device = 3, - ompt_target_data_delete = 4, - ompt_target_data_associate = 5, - ompt_target_data_disassociate = 6 + ompt_target_data_alloc = 1, + ompt_target_data_transfer_to_device = 2, + ompt_target_data_transfer_from_device = 3, + ompt_target_data_delete = 4, + ompt_target_data_associate = 5, + ompt_target_data_disassociate = 6, + ompt_target_data_alloc_async = 17, + ompt_target_data_transfer_to_device_async = 18, + ompt_target_data_transfer_from_device_async = 19, + ompt_target_data_delete_async = 20 } ompt_target_data_op_t; typedef enum ompt_work_t { @@ -273,7 +316,8 @@ typedef enum ompt_work_t { ompt_work_single_other = 4, ompt_work_workshare = 5, ompt_work_distribute = 6, - ompt_work_taskloop = 7 + ompt_work_taskloop = 7, + ompt_work_scope = 8 } ompt_work_t; typedef enum ompt_mutex_t { @@ -302,6 +346,7 @@ typedef enum ompt_task_flag_t { ompt_task_implicit = 0x00000002, ompt_task_explicit = 0x00000004, ompt_task_target = 0x00000008, + ompt_task_taskwait = 0x00000010, ompt_task_undeferred = 0x08000000, ompt_task_untied = 0x10000000, ompt_task_final = 0x20000000, @@ -316,14 +361,19 @@ typedef enum ompt_task_status_t { ompt_task_detach = 4, ompt_task_early_fulfill = 5, ompt_task_late_fulfill 
= 6, - ompt_task_switch = 7 + ompt_task_switch = 7, + ompt_taskwait_complete = 8 } ompt_task_status_t; typedef enum ompt_target_t { ompt_target = 1, ompt_target_enter_data = 2, ompt_target_exit_data = 3, - ompt_target_update = 4 + ompt_target_update = 4, + ompt_target_nowait = 9, + ompt_target_enter_data_nowait = 10, + ompt_target_exit_data_nowait = 11, + ompt_target_update_nowait = 12 } ompt_target_t; typedef enum ompt_parallel_flag_t { @@ -348,9 +398,15 @@ typedef enum ompt_dependence_type_t { ompt_dependence_type_inout = 3, ompt_dependence_type_mutexinoutset = 4, ompt_dependence_type_source = 5, - ompt_dependence_type_sink = 6 + ompt_dependence_type_sink = 6, + ompt_dependence_type_inoutset = 7 } ompt_dependence_type_t; +typedef enum ompt_severity_t { + ompt_warning = 1, + ompt_fatal = 2 +} ompt_severity_t; + typedef enum ompt_cancel_flag_t { ompt_cancel_parallel = 0x01, ompt_cancel_sections = 0x02, @@ -371,18 +427,20 @@ typedef enum ompt_frame_flag_t { ompt_frame_cfa = 0x10, ompt_frame_framepointer = 0x20, ompt_frame_stackaddress = 0x30 -} ompt_frame_flag_t; +} ompt_frame_flag_t; typedef enum ompt_state_t { ompt_state_work_serial = 0x000, ompt_state_work_parallel = 0x001, ompt_state_work_reduction = 0x002, - ompt_state_wait_barrier = 0x010, + ompt_state_wait_barrier DEPRECATED_51 = 0x010, ompt_state_wait_barrier_implicit_parallel = 0x011, ompt_state_wait_barrier_implicit_workshare = 0x012, - ompt_state_wait_barrier_implicit = 0x013, + ompt_state_wait_barrier_implicit DEPRECATED_51 = 0x013, ompt_state_wait_barrier_explicit = 0x014, + ompt_state_wait_barrier_implementation = 0x015, + ompt_state_wait_barrier_teams = 0x016, ompt_state_wait_taskwait = 0x020, ompt_state_wait_taskgroup = 0x021, @@ -439,6 +497,8 @@ typedef enum ompd_rc_t { ompd_rc_device_read_error = 8, ompd_rc_device_write_error = 9, ompd_rc_nomem = 10, + ompd_rc_incomplete = 11, + ompd_rc_callback_error = 12 } ompd_rc_t; typedef void (*ompt_interface_fn_t) (void); @@ -720,14 +780,14 @@ typedef void 
(*ompt_callback_dispatch_t) ( ompt_data_t *parallel_data, ompt_data_t *task_data, ompt_dispatch_t kind, - ompt_data_t instance + ompt_data_t instance ); typedef struct ompt_record_dispatch_t { ompt_id_t parallel_id; ompt_id_t task_id; ompt_dispatch_t kind; - ompt_data_t instance; + ompt_data_t instance; } ompt_record_dispatch_t; typedef void (*ompt_callback_task_create_t) ( @@ -799,19 +859,21 @@ typedef struct ompt_record_implicit_task_t { int flags; } ompt_record_implicit_task_t; -typedef void (*ompt_callback_master_t) ( +typedef void (*ompt_callback_masked_t) ( ompt_scope_endpoint_t endpoint, ompt_data_t *parallel_data, ompt_data_t *task_data, const void *codeptr_ra ); -typedef struct ompt_record_master_t { +typedef ompt_callback_masked_t ompt_callback_master_t DEPRECATED_51; + +typedef struct ompt_record_masked_t { ompt_scope_endpoint_t endpoint; ompt_id_t parallel_id; ompt_id_t task_id; const void *codeptr_ra; -} ompt_record_master_t; +} ompt_record_masked_t; typedef void (*ompt_callback_sync_region_t) ( ompt_sync_region_t kind, @@ -918,6 +980,20 @@ typedef void (*ompt_callback_device_unload_t) ( uint64_t module_id ); +typedef void (*ompt_callback_target_data_op_emi_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + ompt_target_data_op_t optype, + void *src_addr, + int src_device_num, + void *dest_addr, + int dest_device_num, + size_t bytes, + const void *codeptr_ra +); + typedef void (*ompt_callback_target_data_op_t) ( ompt_scope_endpoint_t endpoint, ompt_id_t target_id, @@ -943,6 +1019,16 @@ typedef struct ompt_record_target_data_op_t { const void *codeptr_ra; } ompt_record_target_data_op_t; +typedef void (*ompt_callback_target_emi_t) ( + ompt_target_t kind, + ompt_scope_endpoint_t endpoint, + int device_num, + ompt_data_t *task_data, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + const void *codeptr_ra +); + typedef void (*ompt_callback_target_t) ( ompt_target_t kind, 
ompt_scope_endpoint_t endpoint, @@ -961,6 +1047,16 @@ typedef struct ompt_record_target_t { const void *codeptr_ra; } ompt_record_target_t; +typedef void (*ompt_callback_target_map_emi_t) ( + ompt_data_t *target_data, + unsigned int nitems, + void **host_addr, + void **device_addr, + size_t *bytes, + unsigned int *mapping_flags, + const void *codeptr_ra +); + typedef void (*ompt_callback_target_map_t) ( ompt_id_t target_id, unsigned int nitems, @@ -981,6 +1077,13 @@ typedef struct ompt_record_target_map_t { const void *codeptr_ra; } ompt_record_target_map_t; +typedef void (*ompt_callback_target_submit_emi_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + unsigned int requested_num_teams +); + typedef void (*ompt_callback_target_submit_t) ( ompt_scope_endpoint_t endpoint, ompt_id_t target_id, @@ -1008,6 +1111,19 @@ typedef struct ompt_record_control_tool_t { const void *codeptr_ra; } ompt_record_control_tool_t; +typedef void (*ompt_callback_error_t) ( + ompt_severity_t severity, + const char *message, size_t length, + const void *codeptr_ra +); + +typedef struct ompt_record_error_t { + ompt_severity_t severity; + const char *message; + size_t length; + const void *codeptr_ra; +} ompt_record_error_t; + typedef struct ompd_address_t { ompd_seg_t segment; ompd_addr_t address; @@ -1035,6 +1151,198 @@ typedef struct ompd_device_type_sizes_t { uint8_t sizeof_pointer; } ompd_device_type_sizes_t; +void ompd_dll_locations_valid(void); + +typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)(ompd_size_t nbytes, + void **ptr); + +typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)(void *ptr); + +typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)( + ompd_address_space_context_t *address_space_context, ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_context_t **thread_context); + +typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)( + ompd_address_space_context_t 
*address_space_context, + ompd_device_type_sizes_t *sizes); + +typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_thread_context_t *thread_context, const char *symbol_name, + ompd_address_t *symbol_addr, const char *file_name); + +typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_thread_context_t *thread_context, const ompd_address_t *addr, + ompd_size_t nbytes, void *buffer); + +typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_thread_context_t *thread_context, const ompd_address_t *addr, + ompd_size_t nbytes, const void *buffer); + +typedef ompd_rc_t (*ompd_callback_device_host_fn_t)( + ompd_address_space_context_t *address_space_context, const void *input, + ompd_size_t unit_size, ompd_size_t count, void *output); + +typedef ompd_rc_t (*ompd_callback_print_string_fn_t)(const char *string, + int category); + +typedef struct ompd_callbacks_t { + ompd_callback_memory_alloc_fn_t alloc_memory; + ompd_callback_memory_free_fn_t free_memory; + ompd_callback_print_string_fn_t print_string; + ompd_callback_sizeof_fn_t sizeof_type; + ompd_callback_symbol_addr_fn_t symbol_addr_lookup; + ompd_callback_memory_read_fn_t read_memory; + ompd_callback_memory_write_fn_t write_memory; + ompd_callback_memory_read_fn_t read_string; + ompd_callback_device_host_fn_t device_to_host; + ompd_callback_device_host_fn_t host_to_device; + ompd_callback_get_thread_context_for_thread_id_fn_t + get_thread_context_for_thread_id; +} ompd_callbacks_t; + +void ompd_bp_parallel_begin(void); + +void ompd_bp_parallel_end(void); + +void ompd_bp_task_begin(void); + +void ompd_bp_task_end(void); + +void ompd_bp_thread_begin(void); + +void ompd_bp_thread_end(void); + +void ompd_bp_device_begin(void); + +void ompd_bp_device_end(void); + +ompd_rc_t ompd_initialize(ompd_word_t api_version, + const ompd_callbacks_t 
*callbacks); + +ompd_rc_t ompd_get_api_version(ompd_word_t *version); + +ompd_rc_t ompd_get_version_string(const char **string); + +ompd_rc_t ompd_finalize(void); + +ompd_rc_t ompd_process_initialize(ompd_address_space_context_t *context, + ompd_address_space_handle_t **handle); + +ompd_rc_t ompd_device_initialize(ompd_address_space_handle_t *process_handle, + ompd_address_space_context_t *device_context, + ompd_device_t kind, ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle); + +ompd_rc_t ompd_rel_address_space_handle(ompd_address_space_handle_t *handle); + +ompd_rc_t ompd_get_omp_version(ompd_address_space_handle_t *address_space, + ompd_word_t *omp_version); + +ompd_rc_t +ompd_get_omp_version_string(ompd_address_space_handle_t *address_space, + const char **string); + +ompd_rc_t ompd_get_thread_in_parallel(ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_thread_handle_t **thread_handle); + +ompd_rc_t ompd_get_thread_handle(ompd_address_space_handle_t *handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, + const void *thread_id, + ompd_thread_handle_t **thread_handle); + +ompd_rc_t ompd_rel_thread_handle(ompd_thread_handle_t *thread_handle); + +ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, + ompd_thread_handle_t *thread_handle_2, + int *cmp_value); + +ompd_rc_t ompd_get_thread_id(ompd_thread_handle_t *thread_handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, void *thread_id); + +ompd_rc_t +ompd_get_curr_parallel_handle(ompd_thread_handle_t *thread_handle, + ompd_parallel_handle_t **parallel_handle); + +ompd_rc_t ompd_get_enclosing_parallel_handle( + ompd_parallel_handle_t *parallel_handle, + ompd_parallel_handle_t **enclosing_parallel_handle); + +ompd_rc_t +ompd_get_task_parallel_handle(ompd_task_handle_t *task_handle, + ompd_parallel_handle_t **task_parallel_handle); + +ompd_rc_t ompd_rel_parallel_handle(ompd_parallel_handle_t *parallel_handle); + +ompd_rc_t 
+ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value); + +ompd_rc_t ompd_get_curr_task_handle(ompd_thread_handle_t *thread_handle, + ompd_task_handle_t **task_handle); + +ompd_rc_t +ompd_get_generating_task_handle(ompd_task_handle_t *task_handle, + ompd_task_handle_t **generating_task_handle); + +ompd_rc_t +ompd_get_scheduling_task_handle(ompd_task_handle_t *task_handle, + ompd_task_handle_t **scheduling_task_handle); + +ompd_rc_t ompd_get_task_in_parallel(ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_task_handle_t **task_handle); + +ompd_rc_t ompd_rel_task_handle(ompd_task_handle_t *task_handle); + +ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value); + +ompd_rc_t ompd_get_task_function(ompd_task_handle_t *task_handle, + ompd_address_t *entry_point); + +ompd_rc_t ompd_get_task_frame(ompd_task_handle_t *task_handle, + ompd_frame_info_t *exit_frame, + ompd_frame_info_t *enter_frame); + +ompd_rc_t +ompd_enumerate_states(ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, ompd_word_t *next_state, + const char **next_state_name, ompd_word_t *more_enums); + +ompd_rc_t ompd_get_state(ompd_thread_handle_t *thread_handle, + ompd_word_t *state, ompt_wait_id_t *wait_id); + +ompd_rc_t +ompd_get_display_control_vars(ompd_address_space_handle_t *address_space_handle, + const char *const **control_vars); + +ompd_rc_t ompd_rel_display_control_vars(const char *const **control_vars); + +ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, + ompd_icv_id_t current, ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, int *more); + +ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, ompd_word_t *icv_value); + +ompd_rc_t ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t 
icv_id, + const char **icv_string); + +ompd_rc_t ompd_get_tool_data(void *handle, ompd_scope_t scope, + ompd_word_t *value, ompd_address_t *ptr); + typedef struct ompt_record_ompt_t { ompt_callbacks_t type; ompt_device_time_t time; @@ -1051,7 +1359,7 @@ typedef struct ompt_record_ompt_t { ompt_record_task_dependence_t task_dependence; ompt_record_task_schedule_t task_schedule; ompt_record_implicit_task_t implicit_task; - ompt_record_master_t master; + ompt_record_masked_t masked; ompt_record_sync_region_t sync_region; ompt_record_mutex_acquire_t mutex_acquire; ompt_record_mutex_t mutex; diff --git a/src/tool/hpcrun/ompt/ompt-device.c b/src/tool/hpcrun/ompt/ompt-device.c index ff134e5eaa..b27b968f71 100644 --- a/src/tool/hpcrun/ompt/ompt-device.c +++ b/src/tool/hpcrun/ompt/ompt-device.c @@ -2,7 +2,7 @@ // * BeginRiceCopyright ***************************************************** // -// $HeadURL$ +// $HeadURL$ // $Id$ // // -------------------------------------------------------------------------- @@ -45,9 +45,6 @@ // ******************************************************* EndRiceCopyright * -#include "ompt-device.h" - -#if HAVE_CUPTI_H /****************************************************************************** * global include files @@ -71,14 +68,19 @@ #include "ompt-interface.h" #include "ompt-device-map.h" +#include "ompt-device.h" #include "gpu/gpu-op-placeholders.h" +#include "gpu/gpu-application-thread-api.h" #include "gpu/gpu-correlation-channel.h" #include "gpu/gpu-correlation-channel-set.h" +#include "gpu/gpu-correlation-id.h" +#include "gpu/gpu-metrics.h" #include "gpu/gpu-monitoring.h" +#include "gpu/gpu-monitoring-thread-api.h" +#include "gpu/gpu-trace.h" -#include "gpu/nvidia/cupti-api.h" -#include "sample-sources/nvidia.h" +#include "gpu/ompt/ompt-gpu-api.h" @@ -86,6 +88,15 @@ // macros //***************************************************************************** +#define FOREACH_OMPT_DATA_OP(macro) \ + macro(op, ompt_target_data_alloc, 
ompt_tgt_alloc) \ + macro(op, ompt_target_data_delete, ompt_tgt_delete) \ + macro(op, ompt_target_data_transfer_to_device, ompt_tgt_copyin) \ + macro(op, ompt_target_data_transfer_from_device, ompt_tgt_copyout) + +// with OMPT support turned on, callpath pruning should not be necessary +#define PRUNE_CALLPATH 0 + #define OMPT_ACTIVITY_DEBUG 0 #if OMPT_ACTIVITY_DEBUG @@ -100,50 +111,79 @@ typedef return_type (*OMPT_API_FNTYPE(fn)) args #define OMPT_TARGET_API_FUNCTION(return_type, fn, args) \ - OMPT_API_FUNCTION(return_type, fn, args) + OMPT_API_FUNCTION(return_type, fn, args) #define FOREACH_OMPT_TARGET_FN(macro) \ macro(ompt_get_device_time) \ macro(ompt_translate_time) \ - macro(ompt_set_trace_native) \ + macro(ompt_set_trace_ompt) \ macro(ompt_start_trace) \ macro(ompt_pause_trace) \ macro(ompt_stop_trace) \ + macro(ompt_flush_trace) \ macro(ompt_get_record_type) \ - macro(ompt_get_record_native) \ + macro(ompt_get_record_ompt) \ macro(ompt_get_record_abstract) \ - macro(ompt_advance_buffer_cursor) \ - macro(ompt_set_pc_sampling) \ - macro(ompt_set_external_subscriber) + macro(ompt_advance_buffer_cursor) //***************************************************************************** -// types +// type declarations //***************************************************************************** -OMPT_TARGET_API_FUNCTION(void, ompt_set_external_subscriber, -( - int enable -)); +typedef struct ompt_device_entry_t { + int device_id; + ompt_device_t *device; + struct ompt_device_entry_t *next; +} ompt_device_entry_t; -OMPT_TARGET_API_FUNCTION(void, ompt_set_pc_sampling, -( - ompt_device_t *device, - int enable, - int pc_sampling_frequency -)); + +//***************************************************************************** +// forward declarations +//***************************************************************************** + +static void ompt_dump(ompt_record_ompt_t *r) __attribute__((unused)); + 
//***************************************************************************** // static variables //***************************************************************************** -static bool ompt_pc_sampling_enabled = false; +static device_finalizer_fn_entry_t device_finalizer_flush; +static device_finalizer_fn_entry_t device_finalizer_trace; +static device_finalizer_fn_entry_t device_finalizer_shutdown; -static device_finalizer_fn_entry_t device_finalizer; +static int ompt_shutdown_complete = 0; +static ompt_device_entry_t *device_list = 0; + +static __thread bool ompt_need_flush = false; + + + +//***************************************************************************** +// private operations +//***************************************************************************** + +static void +device_list_insert +( + int device_id, + ompt_device_t *device +) +{ + // FIXME: replace with splay-uint64 + ompt_device_entry_t *e = (ompt_device_entry_t *) + malloc(sizeof(ompt_device_entry_t)); + e->device_id = device_id; + e->device = device; + e->next = device_list; + device_list = e; + PRINT("device_list_insert id=%d device=%p\n", device_id, device); +} //------------------------------------------------ // declare function pointers for target functions @@ -180,7 +220,12 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint, // Enter a ompt runtime api PRINT("enter ompt runtime op %lu\n", host_op_id); ompt_runtime_api_flag = true; - cupti_correlation_id_push(host_op_id); + + gpu_application_thread_process_activities(); + +#if 0 + ompt_correlation_id_push(host_op_id); +#endif gpu_op_ccts_t gpu_op_ccts; memset(&gpu_op_ccts, 0, sizeof(gpu_op_ccts_t)); @@ -200,13 +245,16 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint, // Inform the worker about the placeholders uint64_t cpu_submit_time = hpcrun_nanotime(); + PRINT("producing correlation %lu\n", host_op_id); gpu_correlation_channel_produce(host_op_id, &gpu_op_ccts, cpu_submit_time); } else { PRINT("exit ompt 
runtime op %lu\n", host_op_id); // Enter a runtime api ompt_runtime_api_flag = false; +#if 0 // Pop the id and make a notification - cupti_correlation_id_pop(); + ompt_correlation_id_pop(); +#endif // Clear kernel status trace_node = NULL; } @@ -215,11 +263,12 @@ hpcrun_ompt_op_id_notify(ompt_scope_endpoint_t endpoint, } -void +void ompt_bind_names(ompt_function_lookup_t lookup) { #define ompt_bind_name(fn) \ - fn = (fn ## _t ) lookup(#fn); + fn = (fn ## _t ) lookup(#fn); \ + PRINT("look up function %s, got %p\n", #fn, fn); FOREACH_OMPT_TARGET_FN(ompt_bind_name) @@ -229,8 +278,8 @@ ompt_bind_names(ompt_function_lookup_t lookup) #define BUFFER_SIZE (1024 * 1024 * 8) -void -ompt_callback_buffer_request +static void +ompt_buffer_request ( int device_id, ompt_buffer_t **buffer, @@ -243,100 +292,228 @@ ompt_callback_buffer_request } -void -ompt_callback_buffer_complete +static void +ompt_buffer_release ( - int device_id, - ompt_buffer_t *buffer, - size_t bytes, - ompt_buffer_cursor_t begin, - int buffer_owned + ompt_buffer_t *buffer ) { - // handle notifications - gpu_correlation_channel_set_consume(); - - // signal advance to return pointer to first record - ompt_buffer_cursor_t next = begin; - int status = 0; - do { - // TODO(keren): replace cupti_activity_handle with device_activity handle - CUpti_Activity *activity = (CUpti_Activity *)next; - cupti_activity_process(activity); - status = cupti_buffer_cursor_advance(buffer, bytes, (CUpti_Activity **)&next); - } while(status); + free(buffer); } -void -ompt_pc_sampling_enable() +static void +ompt_dump +( + ompt_record_ompt_t *r +) { - ompt_pc_sampling_enabled = true; + if (r) { + printf("r=%p type=%d time=%lu thread_id=%lu target_id=0x%lx\n", + r, r->type, r->time, r->thread_id, r->target_id); + + switch (r->type) { + case ompt_callback_target: + // case ompt_callback_target_emi: + { + ompt_record_target_t target_rec = r->record.target; + printf("\tTarget task: kind=%d endpoint=%d device=%d task_id=%lu target_id=0x%lx 
codeptr=%p\n", + target_rec.kind, target_rec.endpoint, target_rec.device_num, + target_rec.task_id, target_rec.target_id, target_rec.codeptr_ra); + break; + } + case ompt_callback_target_data_op: + // case ompt_callback_target_data_op_emi: + { + ompt_record_target_data_op_t target_data_op_rec = + r->record.target_data_op; + printf("\tTarget data op: host_op_id=%lu optype=%d src_addr=%p " + "src_device=%d dest_addr=%p dest_device=%d bytes=%lu " + "end_time=%lu duration=%luus codeptr=%p\n", + target_data_op_rec.host_op_id, target_data_op_rec.optype, + target_data_op_rec.src_addr, target_data_op_rec.src_device_num, + target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num, + target_data_op_rec.bytes, target_data_op_rec.end_time, + target_data_op_rec.end_time - r->time, + target_data_op_rec.codeptr_ra); + break; + } + case ompt_callback_target_submit: + // case ompt_callback_target_submit_emi: + { + ompt_record_target_kernel_t target_kernel_rec = r->record.target_kernel; + printf("\tTarget kernel: host_op_id=%lu requested_num_teams=%u " + "granted_num_teams=%u end_time=%lu duration=%luus\n", + target_kernel_rec.host_op_id, + target_kernel_rec.requested_num_teams, + target_kernel_rec.granted_num_teams, target_kernel_rec.end_time, + target_kernel_rec.end_time - r->time); + break; + } + default: + assert(0); + break; + } + } } -void -ompt_pc_sampling_disable() +static ompt_device_t * +ompt_get_device +( + int device_id +) { - ompt_pc_sampling_enabled = false; + ompt_device_entry_t *e = device_list; + while (e) { + if (e->device_id == device_id) return e->device; + e = e->next; + } + return 0; } -void -ompt_trace_configure(ompt_device_t *device) +static void +ompt_finalize_flush +( + void *arg, + int how +) { - int flags = 0; + PRINT("ompt_finalize_flush enter\n"); + + ompt_device_entry_t *e = device_list; + while (e) { + PRINT("ompt_finalize_flush flush id=%d device=%p\n", + e->device_id, e->device); + if (ompt_need_flush) ompt_flush_trace(e->device); + e = 
e->next; + } - // specify desired monitoring - flags |= ompt_native_driver; + gpu_application_thread_process_activities(); - flags |= ompt_native_runtime; + PRINT("ompt_finalize_flush exit\n"); +} - flags |= ompt_native_kernel_invocation; - flags |= ompt_native_kernel_execution; +static void +ompt_finalize_shutdown +( + void *arg, + int how +) +{ + PRINT("ompt_finalize_shutdown enter\n"); + + ompt_device_entry_t *e = device_list; + while (e) { + PRINT("ompt_finalize_flush flush id=%d device=%p\n", + e->device_id, e->device); + ompt_stop_trace(e->device); + e = e->next; + } + ompt_shutdown_complete = 1; + gpu_application_thread_process_activities(); + PRINT("ompt_finalize_shutdown exit\n"); +} - flags |= ompt_native_data_motion_explicit; - // indicate desired monitoring - ompt_set_trace_native(device, 1, flags); - - // set pc sampling after other traces - if (ompt_pc_sampling_enabled) { - int freq_bits = gpu_monitoring_instruction_sample_frequency_get(); - ompt_set_pc_sampling(device, true, freq_bits); +static void +ompt_finalize_trace +( + void *arg, + int how +) +{ + PRINT("ompt_finalize_trace enter\n"); + gpu_trace_fini(arg, how); + PRINT("ompt_finalize_trace exit\n"); +} + + + +static void +ompt_buffer_complete +( + int device_id, + ompt_buffer_t *buffer, + size_t bytes, + ompt_buffer_cursor_t begin, + int buffer_owned +) +{ + PRINT("ompt_callback_buffer_complete enter device=%d\n", device_id); + if (ompt_shutdown_complete == 0) { + + gpu_monitoring_thread_activities_ready(); + + ompt_device_t *device = ompt_get_device(device_id); + + // signal advance to return pointer to first record + ompt_buffer_cursor_t current = begin; + int status = 1; + while (status) { + // extract the next record from the buffer + ompt_record_ompt_t *record = ompt_get_record_ompt(buffer, current); + + // a buffer may be empty, so the first record may be NULL + if (record == NULL) break; + + // process the record + ompt_activity_process(record); + + // advance the cursor to the next 
record + // status will be 0 if there is no next record + status = ompt_advance_buffer_cursor(device, buffer, bytes, current, + ¤t); + } } + if (buffer_owned) ompt_buffer_release(buffer); + + PRINT("ompt_callback_buffer_complete exit device=%d\n", device_id); +} + + +void +ompt_trace_configure(ompt_device_t *device) +{ + // indicate desired monitoring + ompt_set_trace_ompt(device, 1, 0); + // turn on monitoring previously indicated - ompt_start_trace(device, ompt_callback_buffer_request, ompt_callback_buffer_complete); + ompt_start_trace(device, ompt_buffer_request, + ompt_buffer_complete); } void -ompt_device_initialize(uint64_t device_num, +ompt_device_initialize(int device_num, const char *type, ompt_device_t *device, ompt_function_lookup_t lookup, const char *documentation) { - PRINT("ompt_device_initialize->%s, %" PRIu64 "\n", type, device_num); + PRINT("ompt_device_initialize->%s, %d\n", type, device_num); ompt_bind_names(lookup); - //ompt_trace_configure(device); + ompt_trace_configure(device); + device_list_insert(device_num, device); ompt_device_map_insert(device_num, device, type); } -void -ompt_device_finalize(uint64_t device_num) +void +ompt_device_finalize(int device_num) { + PRINT("ompt_device_finalize id=%d\n", device_num); } -void -ompt_device_load(uint64_t device_num, +void +ompt_device_load(int device_num, const char *filename, int64_t file_offset, const void *file_addr, @@ -345,49 +522,58 @@ ompt_device_load(uint64_t device_num, const void *device_addr, uint64_t module_id) { - PRINT("ompt_device_load->%s, %" PRIu64 "\n", filename, device_num); + PRINT("ompt_device_load->%s, %d\n", filename, device_num); + +#if 0 // FIXME cupti_load_callback_cuda(module_id, host_addr, bytes); +#endif } -void -ompt_device_unload(uint64_t device_num, +void +ompt_device_unload(int device_num, uint64_t module_id) { //cubin_id_map_delete(module_id); } -static int +#if PRUNE_CALLPATH +static int get_load_module ( cct_node_t *node ) { - cct_addr_t *addr = 
hpcrun_cct_addr(target_node); + cct_addr_t *addr = hpcrun_cct_addr(target_node); ip_normalized_t ip = addr->ip_norm; return ip.lm_id; } +#endif -void -ompt_target_callback +void +ompt_target_callback_emi ( ompt_target_t kind, ompt_scope_endpoint_t endpoint, - uint64_t device_num, + int device_num, ompt_data_t *task_data, - ompt_id_t target_id, + ompt_data_t *target_task_data, + ompt_data_t *target_data, const void *codeptr_ra ) { - PRINT("ompt_target_callback->target_id %" PRIu64 "\n", target_id); - if (endpoint == ompt_scope_end) { target_node = NULL; return; } + ompt_need_flush = true; + + target_data->value = gpu_correlation_id(); + PRINT("ompt_target_callback->target_id 0x%lx\n", target_data->value); + // XXX(Keren): Do not use openmp callbacks to consume and produce records // HPCToolkit always subscribes its own cupti callback // @@ -406,54 +592,58 @@ ompt_target_callback td->overhead++; // NOTE(keren): hpcrun_safe_enter prevent self interruption hpcrun_safe_enter(); - + int skip_this_frame = 1; // omit this procedure frame on the call path - target_node = - hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr, - skip_this_frame, 1, NULL).sample_node; + target_node = + hpcrun_sample_callpath(&uc, zero_metric_id, zero_metric_incr, + skip_this_frame, 1, NULL).sample_node; +#if PRUNE_CALLPATH // the load module for the runtime library that supports offloading - int lm = get_load_module(target_node); + int lm = get_load_module(target_node); - // drop nodes on the call chain until we find one that is not in the load + // drop nodes on the call chain until we find one that is not in the load // module for runtime library that supports offloading - for (;;) { + for (;;) { target_node = hpcrun_cct_parent(target_node); if (get_load_module(target_node) != lm) break; } +#endif hpcrun_safe_exit(); td->overhead--; } -#define FOREACH_OMPT_DATA_OP(macro) \ - macro(ph, ompt_target_data_alloc, ompt_tgt_alloc) \ - macro(ph, ompt_target_data_delete, ompt_tgt_delete) \ - 
macro(ph, ompt_target_data_transfer_to_device, ompt_tgt_copyin) \ - macro(ph, ompt_target_data_transfer_from_device, ompt_tgt_copyout) - void -ompt_data_op_callback +ompt_data_op_callback_emi ( - ompt_scope_endpoint_t endpoint, - ompt_id_t target_id, - ompt_id_t host_op_id, - ompt_target_data_op_t optype, - void *src_addr, - int src_device_num, - void *dest_addr, - int dest_device_num, - size_t bytes, - const void *codeptr_ra + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + ompt_target_data_op_t optype, + void *src_addr, + int src_device_num, + void *dest_addr, + int dest_device_num, + size_t bytes, + const void *codeptr_ra ) -{ - uint64_t ph = hpcrun_placeholder_ompt_tgt_none; - switch (optype) { +{ + if (endpoint == ompt_scope_end) return; + + ompt_need_flush = true; + + uint64_t op_id = *host_op_id = gpu_correlation_id(); + + PRINT("ompt_data_op enter->target_id 0x%lx\n", target_data->value); + enum hpcrun_placeholder op = hpcrun_placeholder_ompt_tgt_none; + switch (optype) { #define ompt_op_macro(op, ompt_op_type, ompt_op_class) \ case ompt_op_type: \ op = hpcrun_placeholder_##ompt_op_class; \ break; - + FOREACH_OMPT_DATA_OP(ompt_op_macro); #undef ompt_op_macro @@ -461,22 +651,31 @@ ompt_data_op_callback break; } - hpcrun_ompt_op_id_notify(endpoint, host_op_id, get_placeholder_norm(ph)); + hpcrun_ompt_op_id_notify(endpoint, op_id, get_placeholder_norm(op)); + PRINT("ompt_data_op exit->target_id 0x%lx\n", target_data->value); } void -ompt_submit_callback +ompt_submit_callback_emi ( ompt_scope_endpoint_t endpoint, - ompt_id_t target_id, - ompt_id_t host_op_id, + ompt_data_t *target_data, + ompt_id_t *host_op_id, unsigned int requested_num_teams ) { - PRINT("ompt_submit_callback enter->target_id %" PRIu64 "\n", target_id); - hpcrun_ompt_op_id_notify(endpoint, host_op_id, get_placeholder_norm(hpcrun_placeholder_ompt_tgt_kernel)); - PRINT("ompt_submit_callback exit->target_id %" PRIu64 "\n", 
target_id); + PRINT("ompt_submit_callback enter->target_id 0x%lx\n", target_data->value); + + if (endpoint == ompt_scope_begin) { + *host_op_id = gpu_correlation_id(); + hpcrun_ompt_op_id_notify(endpoint, *host_op_id, + get_placeholder_norm(hpcrun_placeholder_ompt_tgt_kernel)); + + ompt_need_flush = true; + } + + PRINT("ompt_submit_callback exit->target_id 0x%lx\n", target_data->value); } @@ -488,6 +687,7 @@ ompt_map_callback(ompt_id_t target_id, size_t *bytes, unsigned int *mapping_flags) { + ompt_need_flush = true; } @@ -510,25 +710,42 @@ ompt_trace_node_get return trace_node; } - void -prepare_device() +prepare_device +( + void +) { PRINT("ompt_initialize->prepare_device enter\n"); - device_finalizer.fn = cupti_device_flush; - device_finalizer_register(device_finalizer_type_flush, &device_finalizer); - - ompt_set_callback(ompt_callback_device_initialize, ompt_device_initialize); - ompt_set_callback(ompt_callback_device_finalize, ompt_device_finalize); - ompt_set_callback(ompt_callback_device_load, ompt_device_load); - ompt_set_callback(ompt_callback_device_unload, ompt_device_unload); - ompt_set_callback(ompt_callback_target, ompt_target_callback); - ompt_set_callback(ompt_callback_target_data_op, ompt_data_op_callback); - ompt_set_callback(ompt_callback_target_submit, ompt_submit_callback); - ompt_set_callback(ompt_callback_target_map, ompt_map_callback); + device_finalizer_flush.fn = ompt_finalize_flush; + device_finalizer_register(device_finalizer_type_flush, + &device_finalizer_flush); + + device_finalizer_shutdown.fn = ompt_finalize_shutdown; + device_finalizer_register(device_finalizer_type_shutdown, + &device_finalizer_shutdown); + + device_finalizer_trace.fn = ompt_finalize_trace; + device_finalizer_register(device_finalizer_type_shutdown, + &device_finalizer_trace); + + ompt_set_callback + (ompt_callback_device_initialize, ompt_device_initialize); + ompt_set_callback + (ompt_callback_device_finalize, ompt_device_finalize); + ompt_set_callback + 
(ompt_callback_device_load, ompt_device_load); + ompt_set_callback + (ompt_callback_device_unload, ompt_device_unload); + ompt_set_callback + (ompt_callback_target_emi, ompt_target_callback_emi); + ompt_set_callback + (ompt_callback_target_data_op_emi, ompt_data_op_callback_emi); + ompt_set_callback + (ompt_callback_target_submit_emi, ompt_submit_callback_emi); + ompt_set_callback + (ompt_callback_target_map, ompt_map_callback); PRINT("ompt_initialize->prepare_device exit\n"); } - -#endif diff --git a/src/tool/hpcrun/ompt/ompt-device.h b/src/tool/hpcrun/ompt/ompt-device.h index 17bae2f257..75785c3b08 100644 --- a/src/tool/hpcrun/ompt/ompt-device.h +++ b/src/tool/hpcrun/ompt/ompt-device.h @@ -51,8 +51,6 @@ #include #include -#if HAVE_CUPTI_H - void prepare_device ( @@ -112,12 +110,4 @@ ompt_external_subscriber_disable void ); -#else - -// no op without a CUDA device -#define prepare_device() - -#endif - - #endif // _OMPT_INTERFACE_H_ diff --git a/src/tool/hpcrun/ompt/ompt-interface.c b/src/tool/hpcrun/ompt/ompt-interface.c index 24291289af..c69831ba5d 100644 --- a/src/tool/hpcrun/ompt/ompt-interface.c +++ b/src/tool/hpcrun/ompt/ompt-interface.c @@ -70,6 +70,8 @@ #include #include +#include + #include "ompt-callstack.h" #include "ompt-defer.h" #include "ompt-interface.h" @@ -476,11 +478,9 @@ init_threads void ) { - ompt_set_callback_fn - (ompt_callback_thread_begin, (ompt_callback_t)ompt_thread_begin); + ompt_set_callback(ompt_callback_thread_begin, ompt_thread_begin); - ompt_set_callback_fn - (ompt_callback_thread_end, (ompt_callback_t) ompt_thread_end); + ompt_set_callback(ompt_callback_thread_end, ompt_thread_end); } @@ -490,7 +490,7 @@ init_parallel_regions void ) { - ompt_parallel_region_register_callbacks(ompt_set_callback_fn); + ompt_parallel_region_register_callbacks(ompt_set_callback_internal); ompt_regions_init(); } @@ -501,7 +501,7 @@ init_tasks void ) { - ompt_task_register_callbacks(ompt_set_callback_fn); + 
ompt_task_register_callbacks(ompt_set_callback_internal); } @@ -519,8 +519,8 @@ init_mutex_blame_shift if (!ompt_mutex_blame_requested) return; - retval = ompt_set_callback_fn(ompt_callback_mutex_released, - (ompt_callback_t) ompt_mutex_blame_accept); + retval = ompt_set_callback(ompt_callback_mutex_released, + ompt_mutex_blame_accept); mutex_blame_shift_avail |= ompt_event_may_occur(retval); @@ -556,13 +556,11 @@ init_idle_blame_shift #if 0 ompt_idle_blame_shift_request(); - retval = ompt_set_callback_fn(ompt_callback_idle, - (ompt_callback_t)ompt_idle); + retval = ompt_set_callback(ompt_callback_idle, ompt_idle); idle_blame_shift_avail |= ompt_event_may_occur(retval); #endif - retval = ompt_set_callback_fn(ompt_callback_sync_region_wait, - (ompt_callback_t)ompt_sync); + retval = ompt_set_callback(ompt_callback_sync_region_wait, ompt_sync); idle_blame_shift_avail |= ompt_event_may_occur(retval); @@ -670,10 +668,13 @@ ompt_start_tool const char *runtime_version ) { + // force hpctoolkit initialization + monitor_initialize(); + // post-condition: hpctoolkit is initialized - if (getenv("OMPT_DEBUG_WAIT")) { + if (getenv("OMPT_DEBUG_WAIT")) { while (ompt_debug_wait); - } + } #if OMPT_DEBUG_STARTUP printf("Starting tool...\n"); @@ -1027,18 +1028,19 @@ hpcrun_ompt_get_parent_region_data int hpcrun_ompt_get_thread_num(int level) { - if (ompt_initialized) { - int task_type_flags; - ompt_data_t *task_data = NULL; - ompt_data_t *parallel_data = NULL; - ompt_frame_t *task_frame = NULL; - int thread_num = 0; - - ompt_get_task_info_fn(level, &task_type_flags, &task_data, &task_frame, ¶llel_data, &thread_num); - //printf("Task frame pointer = %p\n", task_frame); - return thread_num; - } - return -1; + if (ompt_initialized) { + int task_type_flags; + ompt_data_t *task_data = NULL; + ompt_data_t *parallel_data = NULL; + ompt_frame_t *task_frame = NULL; + int thread_num = 0; + + ompt_get_task_info_fn(level, &task_type_flags, &task_data, + &task_frame, ¶llel_data, &thread_num); + 
//printf("Task frame pointer = %p\n", task_frame); + return thread_num; + } + return -1; } diff --git a/src/tool/hpcrun/sample-sources/amd-rocprofiler.c b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c new file mode 100644 index 0000000000..32cdeaf27e --- /dev/null +++ b/src/tool/hpcrun/sample-sources/amd-rocprofiler.c @@ -0,0 +1,211 @@ +//****************************************************************************** +// system includes +//****************************************************************************** + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef HPCRUN_STATIC_LINK +#include +#endif + + + +//****************************************************************************** +// libmonitor +//****************************************************************************** + +#include + + + +//****************************************************************************** +// local includes +//****************************************************************************** + +#include "amd.h" + +#include "libdl.h" + +#include "simple_oo.h" +#include "sample_source_obj.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + + + +//****************************************************************************** +// macros +//****************************************************************************** + +#define AMD_ROCPROFILER_PREFIX "rocprof" + +static device_finalizer_fn_entry_t device_finalizer_rocprofiler_shutdown; + +//****************************************************************************** +// interface operations +//****************************************************************************** + +static void +METHOD_FN(init) +{ + self->state = INIT; +} + + +static void 
+METHOD_FN(thread_init) +{ + TMSG(CUDA, "thread_init"); +} + + +static void +METHOD_FN(thread_init_action) +{ + TMSG(CUDA, "thread_init_action"); +} + + +static void +METHOD_FN(start) +{ + TMSG(CUDA, "start"); + TD_GET(ss_state)[self->sel_idx] = START; +} + + +static void +METHOD_FN(thread_fini_action) +{ + TMSG(CUDA, "thread_fini_action"); +} + + +static void +METHOD_FN(stop) +{ + hpcrun_get_thread_data(); + TD_GET(ss_state)[self->sel_idx] = STOP; +} + + +static void +METHOD_FN(shutdown) +{ + self->state = UNINIT; +} + + +static bool +METHOD_FN(supports_event, const char *ev_str) +{ +#ifndef HPCRUN_STATIC_LINK + if (hpcrun_ev_is(ev_str, AMD_ROCPROFILER_PREFIX)) { + rocprofiler_init(); + const char* roc_str = ev_str + sizeof(AMD_ROCPROFILER_PREFIX); + while (*roc_str == ':') roc_str++; + if (*roc_str == 0) return false; + return rocprofiler_match_event(roc_str) != 0; + } + return false; +#else + return false; +#endif + + +} + +static void +METHOD_FN(process_event_list, int lush_metrics) +{ + int nevents = (self->evl).nevents; + TMSG(CUDA,"nevents = %d", nevents); +} + +static void +METHOD_FN(finalize_event_list) +{ + // After going through all command line arguments, + // we call this function to generate a list of counters + // in rocprofiler's format and initialize corresponding + // hpcrun metrics + rocprofiler_finalize_event_list(); + + device_finalizer_rocprofiler_shutdown.fn = rocprofiler_fini; + device_finalizer_register(device_finalizer_type_shutdown, &device_finalizer_rocprofiler_shutdown); + + // Inform roctracer component that we will collect hardware counters, + // which will serialize kernel launches + roctracer_enable_counter_collection(); +} + + +static void +METHOD_FN(gen_event_set,int lush_metrics) +{ + +} + + +static void +METHOD_FN(display_events) +{ + // We need to query rocprofiler to get a list of supported rocprofiler counters + rocprofiler_init(); + + int total_counters = rocprofiler_total_counters(); + 
printf("===========================================================================\n"); + printf("Available AMD GPU hardware counter events\n"); + printf("===========================================================================\n"); + printf("Name\t\tDescription\n"); + printf("---------------------------------------------------------------------------\n"); + for (int i = 0; i < total_counters; ++i) { + printf("%s::%s\t\t%s\n", AMD_ROCPROFILER_PREFIX, rocprofiler_counter_name(i), rocprofiler_counter_description(i)); + } + printf("\n"); +} + + + +//************************************************************************** +// object +//************************************************************************** + +#define ss_name amd_rocprof +#define ss_cls SS_HARDWARE + +#include "ss_obj.h" diff --git a/src/tool/hpcrun/sample-sources/amd.c b/src/tool/hpcrun/sample-sources/amd.c index 806606c727..d15d62bf96 100644 --- a/src/tool/hpcrun/sample-sources/amd.c +++ b/src/tool/hpcrun/sample-sources/amd.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include #include @@ -152,6 +154,14 @@ METHOD_FN(process_event_list, int lush_metrics) gpu_metrics_default_enable(); hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG); TMSG(CUDA,"nevents = %d", nevents); + + +#ifndef HPCRUN_STATIC_LINK + if (hip_bind()) { + EEMSG("hpcrun: unable to bind to HIP AMD library %s\n", dlerror()); + monitor_real_exit(-1); + } +#endif } static void @@ -213,7 +223,6 @@ METHOD_FN(display_events) } - //************************************************************************** // object //************************************************************************** diff --git a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h index 71a1afc816..bec3fd7f5d 100644 --- a/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h +++ b/src/tool/hpcrun/sample-sources/blame-shift/blame-sample-source.h 
@@ -192,7 +192,6 @@ METHOD_FN(display_events) } - /*-------------------------------------------------------------------------- | sample source object --------------------------------------------------------------------------*/ diff --git a/src/tool/hpcrun/sample-sources/cuda.c b/src/tool/hpcrun/sample-sources/cuda.c index e4dae1b650..f79a4ba2e4 100644 --- a/src/tool/hpcrun/sample-sources/cuda.c +++ b/src/tool/hpcrun/sample-sources/cuda.c @@ -423,6 +423,7 @@ METHOD_FN(display_events) printf("\n"); } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/generic.c b/src/tool/hpcrun/sample-sources/generic.c index ddb375ef42..d1a2754c62 100644 --- a/src/tool/hpcrun/sample-sources/generic.c +++ b/src/tool/hpcrun/sample-sources/generic.c @@ -446,6 +446,7 @@ METHOD_FN(display_events) printf("\n"); } + //*************************************************************************** // object //*************************************************************************** diff --git a/src/tool/hpcrun/sample-sources/gpu_blame.c b/src/tool/hpcrun/sample-sources/gpu_blame.c index 0d02ec2d54..5d90169d5d 100644 --- a/src/tool/hpcrun/sample-sources/gpu_blame.c +++ b/src/tool/hpcrun/sample-sources/gpu_blame.c @@ -257,6 +257,7 @@ static void METHOD_FN(display_events) printf("\n"); } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/idle.c b/src/tool/hpcrun/sample-sources/idle.c index 532884fda6..0080956e53 100644 --- a/src/tool/hpcrun/sample-sources/idle.c +++ b/src/tool/hpcrun/sample-sources/idle.c @@ -231,7 +231,6 @@ METHOD_FN(display_events) } - /*************************************************************************** * object 
***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/memleak.c b/src/tool/hpcrun/sample-sources/memleak.c index 57f00c6ad1..c5e59387e8 100644 --- a/src/tool/hpcrun/sample-sources/memleak.c +++ b/src/tool/hpcrun/sample-sources/memleak.c @@ -204,6 +204,7 @@ METHOD_FN(display_events) printf("\n"); } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/mpi.c b/src/tool/hpcrun/sample-sources/mpi.c index 56595575f4..8f24a96822 100644 --- a/src/tool/hpcrun/sample-sources/mpi.c +++ b/src/tool/hpcrun/sample-sources/mpi.c @@ -186,6 +186,7 @@ METHOD_FN(display_events) printf("\n"); } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/none.c b/src/tool/hpcrun/sample-sources/none.c index 9f9648894e..b287b0c744 100644 --- a/src/tool/hpcrun/sample-sources/none.c +++ b/src/tool/hpcrun/sample-sources/none.c @@ -187,6 +187,7 @@ METHOD_FN(display_events) { } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/nvidia.c b/src/tool/hpcrun/sample-sources/nvidia.c index 1a8b0af39b..bc345064d0 100644 --- a/src/tool/hpcrun/sample-sources/nvidia.c +++ b/src/tool/hpcrun/sample-sources/nvidia.c @@ -101,7 +101,7 @@ #define NVIDIA_CUDA "gpu=nvidia" #define NVIDIA_CUDA_PC_SAMPLING "gpu=nvidia,pc" - +#define NVIDIA_CUDA_NV_LINK "nvlink" /****************************************************************************** @@ -320,7 +320,8 @@ static bool METHOD_FN(supports_event, const char *ev_str) { #ifndef HPCRUN_STATIC_LINK - return hpcrun_ev_is(ev_str, NVIDIA_CUDA) || 
hpcrun_ev_is(ev_str, NVIDIA_CUDA_PC_SAMPLING); + return hpcrun_ev_is(ev_str, NVIDIA_CUDA) || hpcrun_ev_is(ev_str, NVIDIA_CUDA_PC_SAMPLING) + || hpcrun_ev_is(ev_str, NVIDIA_CUDA_NV_LINK); #else return false; #endif @@ -354,27 +355,32 @@ METHOD_FN(process_event_list, int lush_metrics) char* event = start_tok(evlist); long int frequency = 0; int frequency_default = -1; + hpcrun_extract_ev_thresh(event, sizeof(nvidia_name), nvidia_name, &frequency, frequency_default); - if (hpcrun_ev_is(nvidia_name, NVIDIA_CUDA)) { - trace_frequency = - (frequency == frequency_default) ? trace_frequency_default : frequency; - gpu_monitoring_trace_sample_frequency_set(trace_frequency); - } else if (hpcrun_ev_is(nvidia_name, NVIDIA_CUDA_PC_SAMPLING)) { - pc_sampling_frequency = (frequency == frequency_default) ? - pc_sampling_frequency_default : frequency; + for (; event != NULL; event = next_tok()) { + if (hpcrun_ev_is(event, NVIDIA_CUDA)) { + trace_frequency = + (frequency == frequency_default) ? trace_frequency_default : frequency; + gpu_monitoring_trace_sample_frequency_set(trace_frequency); + } else if (hpcrun_ev_is(event, NVIDIA_CUDA_PC_SAMPLING)) { + pc_sampling_frequency = (frequency == frequency_default) ? 
+ pc_sampling_frequency_default : frequency; - gpu_monitoring_instruction_sample_frequency_set(pc_sampling_frequency); + gpu_monitoring_instruction_sample_frequency_set(pc_sampling_frequency); - gpu_metrics_GPU_INST_enable(); // instruction counts + gpu_metrics_GPU_INST_enable(); // instruction counts - gpu_metrics_GPU_INST_STALL_enable(); // stall metrics + gpu_metrics_GPU_INST_STALL_enable(); // stall metrics gpu_metrics_GSAMP_enable(); // GPU utilization from sampling // pc sampling cannot be on with concurrent kernels kernel_invocation_activities[0] = CUPTI_ACTIVITY_KIND_KERNEL; + } else if (hpcrun_ev_is(event, NVIDIA_CUDA_NV_LINK)) { + gpu_metrics_GXFER_enable(); + } } gpu_metrics_default_enable(); @@ -484,7 +490,6 @@ METHOD_FN(display_events) } - //****************************************************************************** // object //****************************************************************************** diff --git a/src/tool/hpcrun/sample-sources/openmp-target.c b/src/tool/hpcrun/sample-sources/openmp-target.c new file mode 100644 index 0000000000..7aa46462e6 --- /dev/null +++ b/src/tool/hpcrun/sample-sources/openmp-target.c @@ -0,0 +1,194 @@ +//****************************************************************************** +// system includes +//****************************************************************************** + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef HPCRUN_STATIC_LINK +#include +#endif + + + +//****************************************************************************** +// libmonitor +//****************************************************************************** + +#include + + + +//****************************************************************************** +// local includes +//****************************************************************************** + +#include "amd.h" + +#include "libdl.h" + +#include "simple_oo.h" +#include 
"sample_source_obj.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + + + +//****************************************************************************** +// macros +//****************************************************************************** + +#define OPENMP_TARGET "gpu=openmp" + +static device_finalizer_fn_entry_t device_finalizer_shutdown; +static device_finalizer_fn_entry_t device_trace_finalizer_shutdown; + + +//****************************************************************************** +// interface operations +//****************************************************************************** + +static void +METHOD_FN(init) +{ + self->state = INIT; +} + + +static void +METHOD_FN(thread_init) +{ + TMSG(CUDA, "thread_init"); +} + + +static void +METHOD_FN(thread_init_action) +{ + TMSG(CUDA, "thread_init_action"); +} + + +static void +METHOD_FN(start) +{ + TMSG(CUDA, "start"); + TD_GET(ss_state)[self->sel_idx] = START; +} + + +static void +METHOD_FN(thread_fini_action) +{ + TMSG(CUDA, "thread_fini_action"); +} + + +static void +METHOD_FN(stop) +{ + hpcrun_get_thread_data(); + + TD_GET(ss_state)[self->sel_idx] = STOP; +} + + +static void +METHOD_FN(shutdown) +{ + self->state = UNINIT; +} + + +static bool +METHOD_FN(supports_event, const char *ev_str) +{ +#ifndef HPCRUN_STATIC_LINK + return hpcrun_ev_is(ev_str, OPENMP_TARGET); +#else + return false; +#endif + + +} + +static void +METHOD_FN(process_event_list, int lush_metrics) +{ + int nevents = (self->evl).nevents; + gpu_metrics_default_enable(); + hpcrun_set_trace_metric(HPCRUN_GPU_TRACE_FLAG); + TMSG(CUDA,"nevents = %d", nevents); +} + +static void +METHOD_FN(finalize_event_list) +{ + gpu_metrics_default_enable(); + gpu_trace_init(); +} + + +static void +METHOD_FN(gen_event_set,int lush_metrics) +{ + +} + + +static void 
+METHOD_FN(display_events) +{ + printf("===========================================================================\n"); + printf("Available AMD GPU events\n"); + printf("===========================================================================\n"); + printf("Name\t\tDescription\n"); + printf("---------------------------------------------------------------------------\n"); + printf("%s\t\tOperation-level monitoring of OpenMP offloading.\n" + "\t\tCollect timing information on GPU kernel invocations,\n" + "\t\tmemory copies, etc.\n", + OPENMP_TARGET); + printf("\n"); +} + + + +//************************************************************************** +// object +//************************************************************************** + +#define ss_name openmp_gpu +#define ss_cls SS_HARDWARE + +#include "ss_obj.h" diff --git a/src/tool/hpcrun/sample-sources/papi-c-cupti.c b/src/tool/hpcrun/sample-sources/papi-c-cupti.c index 4e6a372e14..9c9050e619 100644 --- a/src/tool/hpcrun/sample-sources/papi-c-cupti.c +++ b/src/tool/hpcrun/sample-sources/papi-c-cupti.c @@ -1,378 +1,193 @@ -// ******************* System Includes ******************** -#include -#include +// -*-Mode: C++;-*- // technically C99 -#include -#include -#include -// ********************************************************* +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2020, Rice University +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. 
+// +// ******************************************************* EndRiceCopyright * +//*************************************************************************** +// +// File: +// cupti-api.c +// +// Purpose: +// implementation of wrapper around NVIDIA's CUPTI performance tools API +// +//*************************************************************************** -// ******************** PAPI ******************************* -#include -// ********************************************************* +//*************************************************************************** +// system includes +//*************************************************************************** -// ******************** MONITOR ******************************* +#include #include -// ********************************************************* -// ******************** GPU includes *********************** -#include -#include -// ********************************************************* -// ******* HPCToolkit Includes ********************************* -#include -#include +//*************************************************************************** +// local includes +//*************************************************************************** + #include -#include -#include -#include -#include #include -// ********************************************************* - -// ******** local includes *********** #include "papi-c.h" #include "papi-c-extended-info.h" -// *********************************** - -// ****************** Convenience macros ******************* - -#define CUPTI_LAUNCH_CALLBACK_DEPTH 7 - -#define Cupti_call(fn, ...) \ -{ \ - int ret = fn(__VA_ARGS__); \ - if (ret != CUPTI_SUCCESS) { \ - const char* errstr; \ - dcuptiGetResultString(ret, &errstr); \ - hpcrun_abort("error: CUDA/CUPTI API " \ - #fn " failed w error code %d ==> '%s'\n", \ - ret, errstr); \ - } \ -} - -#define Cupti_call_silent(fn, ...) 
\ -{ \ - (void) fn(__VA_ARGS__); \ -} -#define Chk_dlopen(v, lib, flags) \ - void* v = monitor_real_dlopen(lib, flags); \ - if (! v) { \ - fprintf(stderr, "gpu dlopen %s failed\n", lib); \ - return; \ - } \ - -#define Chk_dlsym(h, fn) { \ - dlerror(); \ - d ## fn = dlsym(h, #fn); \ - char* e = dlerror(); \ - if (e) { \ - fprintf(stderr, "dlsym(%s) fails w '%s'\n", #fn, e); \ - return; \ - } \ -} -// *********************************************************** - -typedef struct { - int nevents; - int event_set; - sample_source_t* self; -} papi_cuda_data_t; - -static bool event_set_created = false; -static bool event_set_finalized = false; -static papi_cuda_data_t local = {}; - -static spinlock_t cupti_lock = SPINLOCK_UNLOCKED; -static spinlock_t setup_lock = SPINLOCK_UNLOCKED; - -// ******************** cuda/cupti functions *********************** -// Some cuda/cupti functions must not be wrapped! So, we fetch them via dlopen. -// NOTE: naming convention is to prepend the letter "d" to the actual function -// The indirect functions are below. 
-// -cudaError_t (*dcudaThreadSynchronize)(void); -CUptiResult (*dcuptiGetResultString)(CUptiResult result, const char** str); +//****************************************************************************** +// static data +//****************************************************************************** -CUptiResult (*dcuptiSubscribe)(CUpti_SubscriberHandle* subscriber, - CUpti_CallbackFunc callback, - void* userdata); +static __thread bool event_set_created = false; +static __thread bool event_set_finalized = false; +static __thread int my_event_set = PAPI_NULL; -CUptiResult (*dcuptiEnableCallback)(uint32_t enable, - CUpti_SubscriberHandle subscriber, - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid); -CUptiResult (*dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber); +//****************************************************************************** +// private operations +//****************************************************************************** -// ***************************************************************** -typedef struct cuda_callback_t { - sample_source_t* ss; - int event_set; -} cuda_callback_t; - -// -// populate the cuda/cupti functions via dlopen -// - -static void -dlgpu(void) -{ - // only use dlfunctions in NON static case -#ifndef HPCRUN_STATIC_LINK - Chk_dlopen(cudart, "libcudart.so", RTLD_NOW | RTLD_GLOBAL); - Chk_dlsym(cudart, cudaThreadSynchronize); - - Chk_dlopen(cupti, "libcupti.so", RTLD_NOW | RTLD_GLOBAL); - Chk_dlsym(cupti, cuptiGetResultString); - Chk_dlsym(cupti, cuptiSubscribe); - Chk_dlsym(cupti, cuptiEnableCallback); - Chk_dlsym(cupti, cuptiUnsubscribe); -#endif // ! 
HPCRUN_STATIC_LINK -} - -// -// noop routine -// static void papi_c_no_action(void) { ; } -// -// Predicate to determine if this component is being referenced -// + static bool is_papi_c_cuda(const char* name) { return strstr(name, "cuda") == name; } -static void CUPTIAPI -hpcrun_cuda_kernel_callback(void* userdata, - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) -{ - TMSG(CUDA, "Got Kernel Callback"); - - papi_cuda_data_t* cuda_data = userdata; - int nevents = cuda_data->nevents; - int cudaEventSet = cuda_data->event_set; - sample_source_t* self = cuda_data->self; - - - TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet); - - // This callback is enabled only for kernel launch; anything else is an error. - if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) { - hpcrun_abort("CUDA CUPTI callback seen for unexpected " - "interface operation: callback id %d\n", cbid); - } - - if (cbInfo->callbackSite == CUPTI_API_ENTER) { - TMSG(CUDA, "Cupti API -ENTER- portion"); - // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize - // exclusive access to launcher - spinlock_lock(&cupti_lock); - TMSG(CUPTI, "-ACQ-lock"); - dcudaThreadSynchronize(); - - TMSG(CUPTI,"-- PRE launch callback"); - TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet); - int ret = PAPI_start(cudaEventSet); - if (ret != PAPI_OK){ - EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)", - PAPI_strerror(ret), ret); - } - } - TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion"); - - - if (cbInfo->callbackSite == CUPTI_API_EXIT) { - TMSG(CUDA, "Cupti API -EXIT- portion"); - // MC recommends Use cudaDeviceSynchronize - dcudaThreadSynchronize(); - TMSG(CUPTI, "-- POST launch callback"); - long_long eventValues[nevents+2]; - - TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet); - int ret = PAPI_stop(cudaEventSet, eventValues); - if (ret != PAPI_OK){ - EMSG("CUDA monitoring failed to -stop-. 
PAPI_stop failed with %s (%d)", - PAPI_strerror(ret), ret); - } - TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet); - - ucontext_t uc; - TMSG(CUDA,"getting context in CUDA event handler"); - getcontext(&uc); - TMSG(CUDA,"got context in CUDA event handler"); - bool safe = hpcrun_safe_enter(); - TMSG(CUDA,"blocked async event in CUDA event handler"); - { - int i; - for (i = 0; i < nevents; i++) - { - int metric_id = hpcrun_event2metric(self, i); - - TMSG(CUDA, "sampling call path for metric_id = %d", metric_id); - hpcrun_sample_callpath(&uc, metric_id, eventValues[i]/*metricIncr*/, - CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/, - 0/*isSync*/, NULL); - TMSG(CUDA, "sampled call path for metric_id = %d", metric_id); - } - } - TMSG(CUDA,"unblocking async event in CUDA event handler"); - if (safe) hpcrun_safe_exit(); - TMSG(CUDA,"unblocked async event in CUDA event handler"); - spinlock_unlock(&cupti_lock); - TMSG(CUPTI,"-REL-lock\n"); - } - TMSG(CUDA, "At end (past -EXIT-)"); -} - -static CUpti_SubscriberHandle subscriber; - -// -// sync setup for cuda/cupti -// +// Get or create a cupti event set static void -papi_c_cupti_setup(void) -{ - // FIXME: Remove local definition - // CUpti_SubscriberHandle subscriber; - - static bool one_time = false; - - spinlock_lock(&setup_lock); - TMSG(CUDA, "CUPTI setup acquire lock"); - if (one_time) { - spinlock_unlock(&setup_lock); - TMSG(CUDA, "CUPTI setup release lock (setup already called)"); - return; - } - - TMSG(CUDA,"sync setup called"); - - thread_data_t* td = hpcrun_get_thread_data(); - local.self = hpcrun_fetch_source_by_name("papi"); - - local.nevents = local.self->evl.nevents; - - // get cuda event set - - int cuda_component_idx; - int n_components = PAPI_num_components(); - - for (int i = 0; i < n_components; i++) { - if (is_papi_c_cuda(PAPI_get_component_info(i)->name)) { - cuda_component_idx = i; - break; - } - } - - papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr; - local.event_set = 
get_component_event_set(psi, cuda_component_idx); - - Cupti_call(dcuptiSubscribe, &subscriber, - (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback, - &local); - - Cupti_call(dcuptiEnableCallback, 1, subscriber, - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020); - - one_time = true; - spinlock_unlock(&setup_lock); - TMSG(CUDA, "CUPTI setup release lock"); -} - -// -// Get or create a cupti event set --- but only ONCE per process -// -void -papi_c_cupti_get_event_set(int* ev_s) +papi_c_cupti_get_event_set(int* event_set) { TMSG(CUDA, "Get event set"); - spinlock_lock(&setup_lock); - TMSG(CUDA, "Cupti lock acquired"); if (! event_set_created) { TMSG(CUDA, "No event set created, so create one"); - int ret = PAPI_create_eventset(ev_s); + int ret = PAPI_create_eventset(&my_event_set); if (ret != PAPI_OK) { - hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", + hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", ret, PAPI_strerror(ret)); } - local.event_set = *ev_s; + *event_set = my_event_set; event_set_created = true; - TMSG(CUDA, "Event set %d created", local.event_set); + TMSG(CUDA, "Event set %d created", my_event_set); } - spinlock_unlock(&setup_lock); - TMSG(CUDA, "Cupti lock released"); } -int -papi_c_cupti_add_event(int ev_s, int ev) + +// Add event to my_event_set +void +papi_c_cupti_add_event(int event_set, int evcode) { + assert(event_set == my_event_set); + int rv = PAPI_OK; - TMSG(CUDA, "Adding event to cupti event set"); - spinlock_lock(&setup_lock); - TMSG(CUDA, "Cupti lock acquired"); if (! 
event_set_finalized) { - TMSG(CUDA, "Really add event %x to cupti event set", ev); - rv = PAPI_add_event(local.event_set, ev); - TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", ev_s, local.event_set); + TMSG(CUDA, "Adding event %x to cupti event set", evcode); + rv = PAPI_add_event(my_event_set, evcode); + if (rv != PAPI_OK) { + hpcrun_abort("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)", + PAPI_strerror(rv), rv); + } + TMSG(CUDA, "Added event %d, to cuda event set %d", evcode, my_event_set); } - spinlock_unlock(&setup_lock); - TMSG(CUDA, "Cupti lock released"); - return rv; } +// No adding new events after this point void papi_c_cupti_finalize_event_set(void) { - spinlock_lock(&setup_lock); event_set_finalized = true; - spinlock_unlock(&setup_lock); } -// -// sync teardown for cuda/cupti -// -static void -papi_c_cupti_teardown(void) +void +papi_c_cupti_start() +{ + int ret = PAPI_start(my_event_set); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_start of event set %d failed with %s (%d)", + my_event_set, PAPI_strerror(ret), ret); + } +} + + +void +papi_c_cupti_read(long long *values) +{ + int ret = PAPI_read(my_event_set, values); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_read of event set %d failed with %s (%d)", + my_event_set, PAPI_strerror(ret), ret); + } +} + + +void +papi_c_cupti_stop(long long *values) { - static bool one_time = false; - spinlock_lock(&setup_lock); - if (one_time) return; - - TMSG(CUDA,"sync teardown called (=unsubscribe)"); - - Cupti_call(dcuptiUnsubscribe, subscriber); - one_time = true; - spinlock_unlock(&setup_lock); + int ret = PAPI_stop(my_event_set, values); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_stop of event set %d failed with %s (%d)", + my_event_set, PAPI_strerror(ret), ret); + } } + static sync_info_list_t cuda_component = { .pred = is_papi_c_cuda, .get_event_set = papi_c_cupti_get_event_set, .add_event = papi_c_cupti_add_event, .finalize_event_set = 
papi_c_cupti_finalize_event_set, - .sync_setup = papi_c_cupti_setup, - .sync_teardown = papi_c_cupti_teardown, - .sync_start = papi_c_no_action, - .sync_stop = papi_c_no_action, + .is_gpu_sync = true, + .setup = papi_c_no_action, + .teardown = papi_c_no_action, + .start = papi_c_cupti_start, + .read = papi_c_cupti_read, + .stop = papi_c_cupti_stop, .process_only = true, .next = NULL, }; @@ -381,7 +196,5 @@ static sync_info_list_t cuda_component = { void SS_OBJ_CONSTRUCTOR(papi_c_cupti)(void) { - // fetch actual cuda/cupti functions - dlgpu(); papi_c_sync_register(&cuda_component); -} +} \ No newline at end of file diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c index 9d49d892bd..f113705199 100644 --- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.c +++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.c @@ -14,26 +14,16 @@ papi_c_sync_register(sync_info_list_t* info) registered_sync_components = info; } -void -no_action(void) -{ -} void -std_get_event_set(int* ev_s) +no_action(void) { - int ret = PAPI_create_eventset(ev_s); - TMSG(PAPI,"PAPI_create_eventset = %d, eventSet = %d", ret, *ev_s); - if (ret != PAPI_OK) { - hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", - ret, PAPI_strerror(ret)); - } } -int -std_add_event(int ev_s, int ev) +const char * +component_get_name(int cidx) { - return PAPI_add_event(ev_s, ev); + return PAPI_get_component_info(cidx)->name; } get_event_set_proc_t @@ -45,9 +35,11 @@ component_get_event_set(int cidx) for(sync_info_list_t* item=registered_sync_components; item; item = item->next) { if (item->pred(name)) return item->get_event_set; } - return std_get_event_set; +// hpcrun_abort("Failure: PAPI_create_eventset to not registered component"); + return NULL; } + add_event_proc_t component_add_event_proc(int cidx) { @@ -57,7 +49,8 @@ component_add_event_proc(int cidx) for(sync_info_list_t* item=registered_sync_components; item; item 
= item->next) { if (item->pred(name)) return item->add_event; } - return std_add_event; +// hpcrun_abort("Failure: PAPI_add_event to not registered component"); + return NULL; } finalize_event_set_proc_t @@ -76,13 +69,10 @@ bool component_uses_sync_samples(int cidx) { const char* name = PAPI_get_component_info(cidx)->name; - + TMSG(PAPI, "checking component idx %d (name %s) to see if it is synchronous", cidx, name); for(sync_info_list_t* item=registered_sync_components; item; item = item->next) { - if (item->pred(name)) { - TMSG(PAPI, "Component %s IS a synchronous component", name); - return true; - } + if (item->pred(name)) return item->is_gpu_sync; } return false; } @@ -94,9 +84,9 @@ sync_setup_for_component(int cidx) TMSG(PAPI, "looking for sync setup for component idx=%d(%s)", cidx, name); for(sync_info_list_t* item=registered_sync_components; item; item = item->next) { - if (item->pred(name)) return item->sync_setup; + if (item->pred(name)) return item->setup; } - return no_action; + return NULL; } teardown_proc_t @@ -106,9 +96,9 @@ sync_teardown_for_component(int cidx) TMSG(PAPI, "looking for sync teardown for component idx=%d(%s)", cidx, name); for(sync_info_list_t* item=registered_sync_components; item; item = item->next) { - if (item->pred(name)) return item->sync_teardown; + if (item->pred(name)) return item->teardown; } - return no_action; + return NULL; } start_proc_t @@ -118,11 +108,25 @@ sync_start_for_component(int cidx) TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name); for(sync_info_list_t* item=registered_sync_components; item; item = item->next) { - if (item->pred(name)) return item->sync_start; + if (item->pred(name)) return item->start; } - return no_action; + return NULL; } + +read_proc_t +sync_read_for_component(int cidx) +{ + const char* name = PAPI_get_component_info(cidx)->name; + + TMSG(PAPI, "looking for sync start for component idx=%d(%s)", cidx, name); + for(sync_info_list_t* item=registered_sync_components; 
item; item = item->next) { + if (item->pred(name)) return item->read; + } + return NULL; +} + + stop_proc_t sync_stop_for_component(int cidx) { @@ -130,7 +134,7 @@ sync_stop_for_component(int cidx) TMSG(PAPI, "looking for sync stop for component idx=%d(%s)", cidx, name); for(sync_info_list_t* item=registered_sync_components; item; item = item->next) { - if (item->pred(name)) return item->sync_stop; + if (item->pred(name)) return item->stop; } - return no_action; + return NULL; } diff --git a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h index 1636a3f631..eb83b101dc 100644 --- a/src/tool/hpcrun/sample-sources/papi-c-extended-info.h +++ b/src/tool/hpcrun/sample-sources/papi-c-extended-info.h @@ -1,13 +1,14 @@ #ifndef PAPI_C_EXTENDED_INFO_H #define PAPI_C_EXTENDED_INFO_H -typedef void (*get_event_set_proc_t)(int* ev_s); -typedef int (*add_event_proc_t)(int ev_s, int evcode); +typedef void (*get_event_set_proc_t)(int* event_set); +typedef void (*add_event_proc_t)(int event_set, int evcode); typedef void (*finalize_event_set_proc_t)(void); typedef void (*setup_proc_t)(void); typedef void (*teardown_proc_t)(void); typedef void (*start_proc_t)(void); -typedef void (*stop_proc_t)(void); +typedef void (*read_proc_t)(long long *values); +typedef void (*stop_proc_t)(long long *values); typedef bool (*pred_proc_t)(const char* name); typedef struct sync_info_list_t { @@ -15,14 +16,17 @@ typedef struct sync_info_list_t { const get_event_set_proc_t get_event_set; const add_event_proc_t add_event; const finalize_event_set_proc_t finalize_event_set; - const setup_proc_t sync_setup; - const teardown_proc_t sync_teardown; - const start_proc_t sync_start; - const stop_proc_t sync_stop; + const bool is_gpu_sync; + const setup_proc_t setup; + const teardown_proc_t teardown; + const start_proc_t start; + const read_proc_t read; + const stop_proc_t stop; const bool process_only; struct sync_info_list_t* next; } 
sync_info_list_t; +extern const char* component_get_name(int cidx); extern bool component_uses_sync_samples(int cidx); extern get_event_set_proc_t component_get_event_set(int cidx); extern add_event_proc_t component_add_event_proc(int cidx); @@ -30,6 +34,7 @@ extern finalize_event_set_proc_t component_finalize_event_set(int cidx); extern setup_proc_t sync_setup_for_component(int cidx); extern teardown_proc_t sync_teardown_for_component(int cidx); extern start_proc_t sync_start_for_component(int cidx); +extern read_proc_t sync_read_for_component(int cidx); extern stop_proc_t sync_stop_for_component(int cidx); extern void papi_c_sync_register(sync_info_list_t* info); diff --git a/src/tool/hpcrun/sample-sources/papi-c-rocm.c b/src/tool/hpcrun/sample-sources/papi-c-rocm.c new file mode 100644 index 0000000000..0aca13a1e1 --- /dev/null +++ b/src/tool/hpcrun/sample-sources/papi-c-rocm.c @@ -0,0 +1,201 @@ +// -*-Mode: C++;-*- // technically C99 + +// * BeginRiceCopyright ***************************************************** +// +// -------------------------------------------------------------------------- +// Part of HPCToolkit (hpctoolkit.org) +// +// Information about sources of support for research and development of +// HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'. +// -------------------------------------------------------------------------- +// +// Copyright ((c)) 2002-2020, Rice University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// * Neither the name of Rice University (RICE) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// This software is provided by RICE and contributors "as is" and any +// express or implied warranties, including, but not limited to, the +// implied warranties of merchantability and fitness for a particular +// purpose are disclaimed. In no event shall RICE or contributors be +// liable for any direct, indirect, incidental, special, exemplary, or +// consequential damages (including, but not limited to, procurement of +// substitute goods or services; loss of use, data, or profits; or +// business interruption) however caused and on any theory of liability, +// whether in contract, strict liability, or tort (including negligence +// or otherwise) arising in any way out of the use of this software, even +// if advised of the possibility of such damage. +// +// ******************************************************* EndRiceCopyright * + +//*************************************************************************** +// +// File: +// papi-c-rocm.c +// +// Purpose: +// implementation of wrapper around AMD's ROCM performance tools API +// +//*************************************************************************** + +//*************************************************************************** +// system includes +//*************************************************************************** + +#include +#include + + + +//*************************************************************************** +// local includes +//*************************************************************************** + +#include +#include +#include "papi-c.h" +#include "papi-c-extended-info.h" +#include + + +//****************************************************************************** +// static data 
+//****************************************************************************** + +static __thread bool event_set_created = false; +static __thread bool event_set_finalized = false; +static __thread int my_event_set = PAPI_NULL; + + + +//****************************************************************************** +// private operations +//****************************************************************************** + +static void +papi_c_no_action(void) +{ + ; +} + + +static bool +is_papi_c_rocm(const char* name) +{ + return strstr(name, "rocm") == name; +} + + +// Get or create a rocm event set +static void +papi_c_rocm_get_event_set(int* event_set) +{ + TMSG(ROCM, "Get event set"); + if (! event_set_created) { + TMSG(ROCM, "No event set created, so create one"); + int ret = PAPI_create_eventset(&my_event_set); + if (ret != PAPI_OK) { + hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s", + ret, PAPI_strerror(ret)); + } + *event_set = my_event_set; + event_set_created = true; + TMSG(ROCM, "Event set %d created", my_event_set); + } +} + + +// Add event to my_event_set +void +papi_c_rocm_add_event(int event_set, int evcode) +{ + assert(event_set == my_event_set); + + int rv = PAPI_OK; + if (! 
event_set_finalized) { + TMSG(ROCM, "Adding event %x to rocm event set", evcode); + rv = PAPI_add_event(my_event_set, evcode); + if (rv != PAPI_OK) { + hpcrun_abort("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)", + PAPI_strerror(rv), rv); + } + TMSG(ROCM, "Added event %d, to rocm event set %d", evcode, my_event_set); + } +} + +// No adding new events after this point +void +papi_c_rocm_finalize_event_set(void) +{ + event_set_finalized = true; +} + + +void +papi_c_rocm_start() +{ + int ret = PAPI_start(my_event_set); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_start of event set %d failed with %s (%d)", + my_event_set, PAPI_strerror(ret), ret); + } +} + + +void +papi_c_rocm_read(long long *values) +{ + hip_dev_sync(); // TODO:Dejan check this out + int ret = PAPI_read(my_event_set, values); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_read of event set %d failed with %s (%d)", + my_event_set, PAPI_strerror(ret), ret); + } +} + + +void +papi_c_rocm_stop(long long *values) +{ + int ret = PAPI_stop(my_event_set, values); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_stop of event set %d failed with %s (%d)", + my_event_set, PAPI_strerror(ret), ret); + } +} + + +static sync_info_list_t rocm_component = { + .pred = is_papi_c_rocm, + .get_event_set = papi_c_rocm_get_event_set, + .add_event = papi_c_rocm_add_event, + .finalize_event_set = papi_c_rocm_finalize_event_set, + .is_gpu_sync = true, + .setup = papi_c_no_action, + .teardown = papi_c_no_action, + .start = papi_c_rocm_start, + .read = papi_c_rocm_read, + .stop = papi_c_rocm_stop, + .process_only = true, + .next = NULL, +}; + + +void +SS_OBJ_CONSTRUCTOR(papi_c_rocm)(void) +{ + papi_c_sync_register(&rocm_component); +} \ No newline at end of file diff --git a/src/tool/hpcrun/sample-sources/papi-c.c b/src/tool/hpcrun/sample-sources/papi-c.c index 99cf9c5fbb..dc522f3f7c 100644 --- a/src/tool/hpcrun/sample-sources/papi-c.c +++ b/src/tool/hpcrun/sample-sources/papi-c.c @@ -87,6 +87,7 @@ #include 
#include #include +#include #include #include #include @@ -100,22 +101,31 @@ #include #include +#include "papi-c.h" +#include "tool_state.h" /****************************************************************************** * macros *****************************************************************************/ +#define DEBUG 0 + +#include +#include + #define OVERFLOW_MODE 0 #define WEIGHT_METRIC 0 #define DEFAULT_THRESHOLD 2000000L -#include "papi-c.h" /****************************************************************************** * forward declarations *****************************************************************************/ static void papi_event_handler(int event_set, void *pc, long long ovec, void *context); +static void papi_monitor_enter(papi_component_info_t *ci, cct_node_t *cct_node); +static void papi_monitor_exit(papi_component_info_t *ci); + static int event_is_derived(int ev_code); static void event_fatal_error(int ev_code, int papi_ret); @@ -123,6 +133,10 @@ static void event_fatal_error(int ev_code, int papi_ret); * local variables *****************************************************************************/ +// Support for derived events (proxy sampling). +static int derived[MAX_EVENTS]; +static int some_overflow; + // Special case to make PAPI_library_init() a soft failure. // Make sure that we call no other PAPI functions. @@ -145,6 +159,7 @@ static uint64_t hpcrun_cycles_cmd_period = 0; * private operations *****************************************************************************/ + static int get_event_index(sample_source_t *self, int event_code) { @@ -157,18 +172,24 @@ get_event_index(sample_source_t *self, int event_code) assert(0); } -// -// fetch a given component's event set. 
Create one if need be -// -int -get_component_event_set(papi_source_info_t* psi, int cidx) + +static int +evcode_to_component_id(papi_source_info_t* psi, int evcode) { - if (cidx < 0 || cidx >= psi->num_components) { + int cidx = PAPI_get_event_component(evcode); + if (cidx < 0 || cidx >= psi->num_components) { hpcrun_abort("PAPI component index out of range [0,%d]: %d", psi->num_components, cidx); - } + } + return cidx; +} - papi_component_info_t* ci = &(psi->component_info[cidx]); +// +// fetch a given component's event set. Create one if need be +// +int +get_component_event_set(papi_component_info_t* ci) +{ if (!ci->inUse) { ci->get_event_set(&(ci->eventSet)); ci->inUse = true; @@ -176,17 +197,110 @@ get_component_event_set(papi_source_info_t* psi, int cidx) return ci->eventSet; } + // // add an event to a component's event set // -int -component_add_event(papi_source_info_t* psi, int cidx, int evcode) +void +component_add_event(papi_source_info_t* psi, int evcode) { - int event_set = get_component_event_set(psi, cidx); + int cidx = evcode_to_component_id(psi, evcode); papi_component_info_t* ci = &(psi->component_info[cidx]); - return ci->add_event(event_set, evcode); + int event_set = get_component_event_set(ci); + + ci->add_event(event_set, evcode); + ci->some_derived |= event_is_derived(evcode); + + TMSG(PAPI, "Added event code %x to component %d", evcode, cidx); + { + char buffer[PAPI_MAX_STR_LEN]; + PAPI_event_code_to_name(evcode, buffer); + TMSG(PAPI, + "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d", + /* eventSet, */ evcode, buffer, cidx); + } +} + + +static void +papi_register_events(papi_source_info_t *psi, evlist_t evl) +{ + int i; + int nevents = evl.nevents; + + // add events to new event_sets + for (i = 0; i < nevents; i++) { + int evcode = evl.events[i].event; + component_add_event(psi, evcode); + + } + + // finalize component event sets + for (i = 0; i < psi->num_components; i++) { + papi_component_info_t *ci = 
&(psi->component_info[i]); + ci->finalize_event_set(); + } +} + + +static void +papi_register_sync_callback(papi_component_info_t *ci) +{ + gpu_monitor_node_t node; + node.ci = ci; + node.enter_fn = papi_monitor_enter; + node.exit_fn = papi_monitor_exit; + gpu_monitor_register(node); } + +static void +papi_register_overflow_callback(int eventSet, int evcode, long thresh) +{ + TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) register", + eventSet, evcode, thresh); + + int ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE, papi_event_handler); + if (ret != PAPI_OK) { + EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)", + PAPI_strerror(ret), ret); + event_fatal_error(evcode, ret); + } +} + + +static void +papi_register_callbacks(papi_source_info_t *psi, evlist_t evl) +{ + int i; + // set up overflow handling for asynchronous event sets for active components + // set up synchronous handling for synchronous event sets for active compoents + for (i = 0; i < evl.nevents; i++) { + + int evcode = evl.events[i].event; + long thresh = evl.events[i].thresh; + int cidx = evcode_to_component_id(psi, evcode); + papi_component_info_t *ci = &(psi->component_info[cidx]); + int eventSet = get_component_event_set(ci); + + // **** No overflow for synchronous events **** + if (ci->is_gpu_sync) { + TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx); + TMSG(PAPI, "Set up papi_monitor_apply instead"); + TMSG(PAPI, "synchronous sample component index = %d", cidx); + + papi_register_sync_callback(ci); + } + else{ + if (! 
derived[i]) { // ***** Only set overflow if NOT derived event ***** + papi_register_overflow_callback(eventSet, evcode, thresh); + } + } + } + +} + + static bool thread_count_scaling_for_component(int cidx) { @@ -196,14 +310,6 @@ thread_count_scaling_for_component(int cidx) } -/****************************************************************************** - * sample source registration - *****************************************************************************/ - -// Support for derived events (proxy sampling). -static int derived[MAX_EVENTS]; -static int some_overflow; - /****************************************************************************** * method functions *****************************************************************************/ @@ -225,15 +331,18 @@ strip_papi_prefix(const char *str) return str; } + static void METHOD_FN(init) { + tool_enter(); // PAPI_set_debug(0x3ff); // **NOTE: some papi components may start threads, so // hpcrun must ignore these threads to ensure that PAPI_library_init // succeeds // + monitor_disable_new_threads(); if (disable_papi_cuda) { TMSG(PAPI_C, "Will disable PAPI cuda component (if component is active)"); @@ -241,10 +350,10 @@ METHOD_FN(init) if (cidx) { int res = PAPI_disable_component(cidx); if (res == PAPI_OK) { - TMSG(PAPI, "PAPI cuda component disabled"); + TMSG(PAPI, "PAPI cuda component disabled"); } else { - EMSG("*** PAPI cuda component could not be disabled!!!"); + EMSG("*** PAPI cuda component could not be disabled!!!"); } } } @@ -278,13 +387,15 @@ METHOD_FN(init) } self->state = INIT; + tool_exit(); } static void METHOD_FN(thread_init) { + tool_enter(); TMSG(PAPI, "thread init"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } int retval = PAPI_thread_init(pthread_self); if (retval != PAPI_OK) { @@ -292,13 +403,17 @@ METHOD_FN(thread_init) monitor_real_abort(); } TMSG(PAPI, "thread init OK"); + +finish: + tool_exit(); } static void METHOD_FN(thread_init_action) { + tool_enter(); 
TMSG(PAPI, "register thread"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } int retval = PAPI_register_thread(); if (retval != PAPI_OK) { @@ -306,16 +421,20 @@ METHOD_FN(thread_init_action) monitor_real_abort(); } TMSG(PAPI, "register thread ok"); + +finish: + tool_exit(); } static void METHOD_FN(start) { + tool_enter(); int cidx; TMSG(PAPI, "start"); - if (papi_unavail) { - return; + if (papi_unavail) { + goto finish; } thread_data_t* td = hpcrun_get_thread_data(); @@ -327,7 +446,7 @@ METHOD_FN(start) if (my_state == START) { TMSG(PAPI,"*NOTE* PAPI start called when already in state START"); - return; + goto finish; } // for each active component, start its event set @@ -336,54 +455,64 @@ METHOD_FN(start) papi_component_info_t* ci = &(psi->component_info[cidx]); if (ci->inUse) { if (component_uses_sync_samples(cidx)) { - TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx); - ci->sync_start(); + TMSG(PAPI, "component %d is synchronous, use synchronous start", cidx); + ci->start(); } - else { //use async start - TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx); - int ret = PAPI_start(ci->eventSet); - if (ret == PAPI_EISRUN) { - // this case should not happen, but maybe it's not fatal - EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx); - } - else if (ret != PAPI_OK) { - EMSG("PAPI_start failed with %s (%d) for event set %d component %d ", - PAPI_strerror(ret), ret, ci->eventSet, cidx); - hpcrun_ssfail_start("PAPI"); - } - - if (ci->some_derived) { - ret = PAPI_read(ci->eventSet, ci->prev_values); - if (ret != PAPI_OK) { - EMSG("PAPI_read of event set %d for component %d failed with %s (%d)", - ci->eventSet, cidx, PAPI_strerror(ret), ret); - } - } + else { + TMSG(PAPI,"starting PAPI event set %d for component %d", ci->eventSet, cidx); + int ret = PAPI_start(ci->eventSet); + if (ret == PAPI_EISRUN) { + // this case should not happen, but maybe it's not fatal + 
EMSG("PAPI returned EISRUN for event set %d component %d", ci->eventSet, cidx); + } + else if (ret != PAPI_OK) { + EMSG("PAPI_start failed with %s (%d) for event set %d component %d ", + PAPI_strerror(ret), ret, ci->eventSet, cidx); + hpcrun_ssfail_start("PAPI"); + } + + if (ci->some_derived) { + ret = PAPI_read(ci->eventSet, ci->prev_values); + if (ret != PAPI_OK) { + EMSG("PAPI_read of event set %d for component %d failed with %s (%d)", + ci->eventSet, cidx, PAPI_strerror(ret), ret); + } + } + } } } td->ss_state[self->sel_idx] = START; + +finish: + tool_exit(); } static void METHOD_FN(thread_fini_action) { + tool_enter(); TMSG(PAPI, "unregister thread"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } int retval = PAPI_unregister_thread(); char msg[] = "!!NOT PAPI_OK!! (code = -9999999)\n"; snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval); TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? "PAPI_OK" : msg); +finish: + tool_exit(); } + static void METHOD_FN(stop) { + tool_enter(); + int cidx; TMSG(PAPI, "stop"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } thread_data_t *td = hpcrun_get_thread_data(); int nevents = self->evl.nevents; @@ -391,12 +520,12 @@ METHOD_FN(stop) if (my_state == STOP) { TMSG(PAPI,"*NOTE* PAPI stop called when already in state STOP"); - return; + goto finish; } if (my_state != START) { TMSG(PAPI,"*WARNING* PAPI stop called when not in state START"); - return; + goto finish; } papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr; @@ -404,36 +533,46 @@ METHOD_FN(stop) papi_component_info_t *ci = &(psi->component_info[cidx]); if (ci->inUse) { if (component_uses_sync_samples(cidx)) { - TMSG(PAPI, "component %d is synchronous, stop is trivial", cidx); + TMSG(PAPI, "component %d is synchronous, stop is trivial", cidx); } else { - TMSG(PAPI,"stop w event set = %d", ci->eventSet); - long_long values[nevents+2]; - // long_long *values = (long_long *) 
alloca(sizeof(long_long) * (nevents+2)); - int ret = PAPI_stop(ci->eventSet, values); - if (ret != PAPI_OK){ - EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s", - ci->eventSet, ret, PAPI_strerror(ret)); - } + TMSG(PAPI,"stop w event set = %d", ci->eventSet); + long_long values[nevents+2]; + // long_long *values = (long_long *) alloca(sizeof(long_long) * (nevents+2)); + + int ret = PAPI_stop(ci->eventSet, values); + if (ret != PAPI_OK) { + EMSG("Failed to stop PAPI for eventset %d. Return code = %d ==> %s", + ci->eventSet, ret, PAPI_strerror(ret)); + } + } } } TD_GET(ss_state)[self->sel_idx] = STOP; +finish: + tool_exit(); } + static void METHOD_FN(shutdown) { + tool_enter(); TMSG(PAPI, "shutdown"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } - METHOD_CALL(self, stop); // make sure stop has been called + do{ + METHOD_CALL(self, stop); // make sure stop has been called + }while(0); // FIXME: add component shutdown code here PAPI_shutdown(); self->state = UNINIT; +finish: + tool_exit(); } // Return true if PAPI recognizes the name, whether supported or not. 
@@ -441,15 +580,17 @@ METHOD_FN(shutdown) static bool METHOD_FN(supports_event, const char *ev_str) { + tool_enter(); + bool ret; ev_str = strip_papi_prefix(ev_str); - + TMSG(PAPI, "supports event"); - if (papi_unavail) { return false; } + if (papi_unavail) { ret = false; goto finish;} if (self->state == UNINIT){ METHOD_CALL(self, init); } - + char evtmp[1024]; int ec; long th; @@ -460,15 +601,20 @@ METHOD_FN(supports_event, const char *ev_str) if (is_event_to_exclude(evtmp)) { return false; } + + ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK); - return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK; +finish: + tool_exit(); + return ret; } - + static void METHOD_FN(process_event_list, int lush_metrics) { + tool_enter(); TMSG(PAPI, "process event list"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } char *event; int i, ret; @@ -490,7 +636,7 @@ METHOD_FN(process_event_list, int lush_metrics) int period_type = hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh, DEFAULT_THRESHOLD); if (!period_type) { AMSG("WARNING: %s using default threshold %ld, " - "better to use an explicit threshold.", name, DEFAULT_THRESHOLD); + "better to use an explicit threshold.", name, DEFAULT_THRESHOLD); } #else int period_type = hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh, DEFAULT_THRESHOLD); @@ -498,8 +644,8 @@ METHOD_FN(process_event_list, int lush_metrics) ret = PAPI_event_name_to_code(name, &evcode); if (ret != PAPI_OK) { EMSG("unexpected failure in PAPI process_event_list(): " - "PAPI_event_name_to_code() returned %s (%d)", - PAPI_strerror(ret), ret); + "PAPI_event_name_to_code() returned %s (%d)", + PAPI_strerror(ret), ret); hpcrun_ssfail_unsupported("PAPI", name); } if (PAPI_query_event(evcode) != PAPI_OK) { @@ -549,7 +695,7 @@ METHOD_FN(process_event_list, int lush_metrics) // supports hardware overflow. use threshold = 0 to force proxy // sampling (for testing). 
if (event_is_derived(self->evl.events[i].event) - || self->evl.events[i].thresh == 0) { + || self->evl.events[i].thresh == 0) { TMSG(PAPI, "using proxy sampling for event %s", buffer); strcat(buffer, " (proxy)"); self->evl.events[i].thresh = 1; @@ -571,10 +717,11 @@ METHOD_FN(process_event_list, int lush_metrics) if (component_uses_sync_samples(cidx)) TMSG(PAPI, "Event %s from synchronous component", buffer); + int metric_id = /* weight */ hpcrun_set_new_metric_info_and_period(papi_kind, strdup(buffer), - MetricFlags_ValFmt_Int, - threshold, prop); + MetricFlags_ValFmt_Int, + threshold, prop); METHOD_CALL(self, store_metric_id, i, metric_id); if (isCycles) { hpcrun_cycles_metric_id = metric_id; @@ -585,9 +732,9 @@ METHOD_FN(process_event_list, int lush_metrics) if (num_lush_metrics > 0 && strcmp(buffer, "PAPI_TOT_CYC") == 0) { // there should be one lush metric; its source is the last event int mid_idleness = - hpcrun_set_new_metric_info_and_period(papi_kind, "idleness", - MetricFlags_ValFmt_Real, - self->evl.events[i].thresh, prop); + hpcrun_set_new_metric_info_and_period(papi_kind, "idleness", + MetricFlags_ValFmt_Real, + self->evl.events[i].thresh, prop); assert(num_lush_metrics == 1 && (i == (nevents - 1))); lush_agents->metric_time = metric_id; lush_agents->metric_idleness = mid_idleness; @@ -599,6 +746,9 @@ METHOD_FN(process_event_list, int lush_metrics) if (! 
some_overflow) { hpcrun_ssfail_all_derived("PAPI"); } + +finish: + tool_exit(); } static void @@ -609,16 +759,16 @@ METHOD_FN(finalize_event_list) static void METHOD_FN(gen_event_set, int lush_metrics) { + tool_enter(); thread_data_t *td = hpcrun_get_thread_data(); int i; - int ret; TMSG(PAPI, "generating all event sets for all components"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } int num_components = PAPI_num_components(); - int ss_info_size = sizeof(papi_source_info_t) + - num_components * sizeof(papi_component_info_t); + int ss_info_size = sizeof(papi_source_info_t) + + num_components * sizeof(papi_component_info_t); TMSG(PAPI, "Num components = %d", num_components); papi_source_info_t* psi = hpcrun_malloc(ss_info_size); @@ -630,7 +780,8 @@ METHOD_FN(gen_event_set, int lush_metrics) psi->num_components = num_components; for (i = 0; i < num_components; i++) { papi_component_info_t *ci = &(psi->component_info[i]); - ci->inUse = false; + ci->name = component_get_name(i); + ci->inUse = false; ci->eventSet = PAPI_NULL; ci->state = INIT; ci->some_derived = 0; @@ -638,98 +789,48 @@ METHOD_FN(gen_event_set, int lush_metrics) ci->add_event = component_add_event_proc(i); ci->finalize_event_set = component_finalize_event_set(i); ci->scale_by_thread_count = thread_count_scaling_for_component(i); - ci->is_sync = component_uses_sync_samples(i); - ci->sync_setup = sync_setup_for_component(i); - ci->sync_teardown = sync_teardown_for_component(i); - ci->sync_start = sync_start_for_component(i); - ci->sync_stop = sync_stop_for_component(i); + ci->is_gpu_sync = component_uses_sync_samples(i); + ci->setup = sync_setup_for_component(i); + ci->teardown = sync_teardown_for_component(i); + ci->start = sync_start_for_component(i); + ci->read = sync_read_for_component(i); + ci->stop = sync_stop_for_component(i); memset(ci->prev_values, 0, sizeof(ci->prev_values)); } // record the component state in thread state td->ss_info[self->sel_idx].ptr = psi; - int 
nevents = (self->evl).nevents; - for (i = 0; i < nevents; i++) { - int evcode = self->evl.events[i].event; - int cidx = PAPI_get_event_component(evcode); - - ret = component_add_event(psi, cidx, evcode); - psi->component_info[cidx].some_derived |= event_is_derived(evcode); - TMSG(PAPI, "Added event code %x to component %d", evcode, cidx); - { - char buffer[PAPI_MAX_STR_LEN]; - PAPI_event_code_to_name(evcode, buffer); - TMSG(PAPI, - "PAPI_add_event(eventSet=%%d, event_code=%x (event name %s)) component=%d", - /* eventSet, */ evcode, buffer, cidx); - } - if (ret != PAPI_OK) { - EMSG("failure in PAPI gen_event_set(): PAPI_add_event() returned: %s (%d)", - PAPI_strerror(ret), ret); - event_fatal_error(evcode, ret); - } - } + papi_register_events(psi, self->evl); - // finalize component event sets - for (i = 0; i < num_components; i++) { - papi_component_info_t *ci = &(psi->component_info[i]); - ci->finalize_event_set(); - } + papi_register_callbacks(psi, self->evl); - // set up overflow handling for asynchronous event sets for active components - // set up synchronous handling for synchronous event sets for active compoents - for (i = 0; i < nevents; i++) { - int evcode = self->evl.events[i].event; - long thresh = self->evl.events[i].thresh; - int cidx = PAPI_get_event_component(evcode); - int eventSet = get_component_event_set(psi, cidx); - - // **** No overflow for synchronous events **** - // **** Use component-specific setup for synchronous events **** - if (component_uses_sync_samples(cidx)) { - TMSG(PAPI, "event code %d (component %d) is synchronous, so do NOT set overflow", evcode, cidx); - TMSG(PAPI, "Set up sync handler instead"); - TMSG(PAPI, "synchronous sample component index = %d", cidx); - sync_setup_for_component(cidx)(); - continue; - } - // ***** Only set overflow if NOT derived event ***** - if (! 
derived[i]) { - ret = PAPI_overflow(eventSet, evcode, thresh, OVERFLOW_MODE, - papi_event_handler); - TMSG(PAPI, "PAPI_overflow(eventSet=%d, evcode=%x, thresh=%d) = %d", - eventSet, evcode, thresh, ret); - if (ret != PAPI_OK) { - EMSG("failure in PAPI gen_event_set(): PAPI_overflow() returned: %s (%d)", - PAPI_strerror(ret), ret); - event_fatal_error(evcode, ret); - } - } - } +finish: + tool_exit(); } static void METHOD_FN(display_events) { + tool_enter(); PAPI_event_info_t info; int ev, ret, num_total, num_prof; int num_components, cidx; if (papi_unavail) { - printf("PAPI is not available. Probably, the kernel doesn't support PAPI,\n" - "or else maybe HPCToolkit is out of sync with PAPI.\n\n"); - return; + PRINT("PAPI is not available. Probably, the kernel doesn't support PAPI,\n" + "or else maybe HPCToolkit is out of sync with PAPI.\n\n"); + goto finish; } cidx = 0; // CPU component { const PAPI_component_info_t *component = PAPI_get_component_info(cidx); - printf("===========================================================================\n"); - printf("Available PAPI preset events in component %s\n", component->name); - printf("\n"); - printf("Name\t Profilable\tDescription\n"); - printf("===========================================================================\n"); + PRINT("===========================================================================\n"); + PRINT("Available PAPI preset events in component %s\n", component->name); + PRINT("\n"); + PRINT("Name\t Profilable\tDescription\n"); + PRINT("===========================================================================\n"); num_total = 0; num_prof = 0; @@ -739,53 +840,55 @@ METHOD_FN(display_events) char *prof; memset(&info, 0, sizeof(info)); if (PAPI_get_event_info(ev, &info) == PAPI_OK && info.count != 0) { - if (event_is_derived(ev)) { - prof = "No"; - } else { - prof = "Yes"; - num_prof++; - } - num_total++; - printf("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr); + if (event_is_derived(ev)) { + 
prof = "No"; + } else { + prof = "Yes"; + num_prof++; + } + num_total++; + PRINT("%-10s\t%s\t%s\n", info.symbol, prof, info.long_descr); } ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx); } - printf("---------------------------------------------------------------------------\n"); - printf("Total PAPI events: %d, able to profile: %d\n", num_total, num_prof); - printf("\n\n"); + PRINT("---------------------------------------------------------------------------\n"); + PRINT("Total PAPI events: %d, able to profile: %d\n", num_total, num_prof); + PRINT("\n\n"); } - num_components = PAPI_num_components(); + num_components = PAPI_num_components(); for(cidx = 0; cidx < num_components; cidx++) { const PAPI_component_info_t* component = PAPI_get_component_info(cidx); int cmp_event_count = 0; if (component->disabled) continue; - printf("===========================================================================\n"); - printf("Native events in component %s\n", component->name); - printf("\n"); - printf("Name Description\n"); - printf("===========================================================================\n"); - + PRINT("===========================================================================\n"); + PRINT("Native events in component %s\n", component->name); + PRINT("\n"); + PRINT("Name Description\n"); + PRINT("===========================================================================\n"); + ev = 0 | PAPI_NATIVE_MASK; ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_FIRST, cidx); while (ret == PAPI_OK) { memset(&info, 0, sizeof(info)); if (PAPI_get_event_info(ev, &info) == PAPI_OK) { - cmp_event_count++; + cmp_event_count++; display_event_info(stdout, info.symbol, info.long_descr); - printf("---------------------------------------------------------------------------\n"); + PRINT("---------------------------------------------------------------------------\n"); } ret = PAPI_enum_cmp_event(&ev, PAPI_ENUM_EVENTS, cidx); } - printf("Total native events for component %s: 
%d\n", component->name, cmp_event_count); - printf("\n\n"); + PRINT("Total native events for component %s: %d\n", component->name, cmp_event_count); + PRINT("\n\n"); num_total += cmp_event_count; } - printf( "Total events reported: %d\n", num_total); - printf("\n\n"); + PRINT( "Total events reported: %d\n", num_total); + PRINT("\n\n"); +finish: + tool_exit(); } @@ -800,16 +903,18 @@ METHOD_FN(display_events) #include "ss_obj.h" // ************************************************************************** -// * public operations +// * public operations // ************************************************************************** void hpcrun_disable_papi_cuda(void) { + tool_enter(); disable_papi_cuda = true; + tool_exit(); } /****************************************************************************** - * private operations + * private operations *****************************************************************************/ // Returns: 1 if the event code is a derived event. @@ -817,26 +922,35 @@ hpcrun_disable_papi_cuda(void) static int event_is_derived(int ev_code) { + tool_enter(); + int ret; PAPI_event_info_t info; // "Is derived" is kind of a bad thing, so if any unexpected failure // occurs, we'll return the "bad" answer. 
if (PAPI_get_event_info(ev_code, &info) != PAPI_OK || info.derived == NULL) { - return 1; + ret = 1; + goto finish; } if (info.count == 1 || strlen(info.derived) == 0 || strcmp(info.derived, "NOT_DERIVED") == 0 || strcmp(info.derived, "DERIVED_CMPD") == 0) { - return 0; + ret = 0; + goto finish; } - return 1; + ret = 1; + +finish: + tool_exit(); + return ret; } static void event_fatal_error(int ev_code, int papi_ret) { + tool_enter(); char name[1024]; PAPI_event_code_to_name(ev_code, name); @@ -850,36 +964,39 @@ event_fatal_error(int ev_code, int papi_ret) hpcrun_ssfail_conflict("PAPI", name); } hpcrun_ssfail_unsupported("PAPI", name); + + tool_exit(); } static void papi_event_handler(int event_set, void *pc, long long ovec, void *context) { + tool_enter(); sample_source_t *self = &obj_name(); long long values[MAX_EVENTS]; int my_events[MAX_EVENTS]; - int my_event_count = MAX_EVENTS; + int my_events_number = MAX_EVENTS; int nevents = self->evl.nevents; int i, ret; - int my_event_codes[MAX_EVENTS]; - int my_event_codes_count = MAX_EVENTS; + int my_events_code[MAX_EVENTS]; + int my_events_code_count = MAX_EVENTS; // if sampling disabled explicitly for this thread, skip all processing - if (hpcrun_suppress_sample() || sample_filters_apply()) return; + if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish; if (!ovec) { TMSG(PAPI_SAMPLE, "papi overflow event: event set %d ovec = %ld", - event_set, ovec); - return; + event_set, ovec); + goto finish; } // If the interrupt came from inside our code, then drop the sample // and return and avoid any MSG. if (! 
hpcrun_safe_enter_async(pc)) { hpcrun_stats_num_samples_blocked_async_inc(); - return; + goto finish; } int cidx = PAPI_get_eventset_component(event_set); @@ -894,42 +1011,42 @@ papi_event_handler(int event_set, void *pc, long long ovec, } } - ret = PAPI_get_overflow_event_index(event_set, ovec, my_events, - &my_event_count); + ret = PAPI_get_overflow_event_index(event_set, ovec, my_events, + &my_events_number); if (ret != PAPI_OK) { TMSG(PAPI_SAMPLE, "papi_event_handler: event set %d ovec %ld " - "get_overflow_event_index return code = %d ==> %s", - event_set, ovec, ret, PAPI_strerror(ret)); + "get_overflow_event_index return code = %d ==> %s", + event_set, ovec, ret, PAPI_strerror(ret)); #ifdef DEBUG_PAPI_OVERFLOW - ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count); + ret = PAPI_list_events(event_set, my_events_code, &my_events_code_count); if (ret != PAPI_OK) { TMSG(PAPI_SAMPLE, "PAPI_list_events failed inside papi_event_handler." - "Return code = %d ==> %s", ret, PAPI_strerror(ret)); + "Return code = %d ==> %s", ret, PAPI_strerror(ret)); } else { - for (i = 0; i < my_event_codes_count; i++) { - TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n", - event_set, i, my_event_codes[i]); + for (i = 0; i < my_events_code_count; i++) { + TMSG(PAPI_SAMPLE, "event set %d event code %d = %x\n", + event_set, i, my_events_code[i]); } } TMSG(PAPI_SAMPLE, "get_overflow_event_index failure in papi_event_handler"); #endif } - ret = PAPI_list_events(event_set, my_event_codes, &my_event_codes_count); + ret = PAPI_list_events(event_set, my_events_code, &my_events_code_count); if (ret != PAPI_OK) { hpcrun_abort("PAPI_list_events failed inside papi_event_handler." - "Return code = %d ==> %s", ret, PAPI_strerror(ret)); + "Return code = %d ==> %s", ret, PAPI_strerror(ret)); } - for (i = 0; i < my_event_count; i++) { + for (i = 0; i < my_events_number; i++) { // FIXME: SUBTLE ERROR: metric_id may not be same from hpcrun_new_metric()! 
// This means lush's 'time' metric should be *last* TMSG(PAPI_SAMPLE,"handling papi overflow event: " - "event set %d event index = %d event code = 0x%x", - event_set, my_events[i], my_event_codes[my_events[i]]); + "event set %d event index = %d event code = 0x%x", + event_set, my_events[i], my_events_code[my_events[i]]); - int event_index = get_event_index(self, my_event_codes[my_events[i]]); + int event_index = get_event_index(self, my_events_code[my_events[i]]); int metric_id = hpcrun_event2metric(self, event_index); @@ -967,9 +1084,9 @@ papi_event_handler(int event_set, void *pc, long long ovec, if (ci->some_derived) { for (i = 0; i < nevents; i++) { if (derived[i]) { - hpcrun_sample_callpath(context, hpcrun_event2metric(self, i), - (hpcrun_metricVal_t) {.i=values[i] - ci->prev_values[i]}, - 0, 0, NULL); + hpcrun_sample_callpath(context, hpcrun_event2metric(self, i), + (hpcrun_metricVal_t) {.i=values[i] - ci->prev_values[i]}, + 0, 0, NULL); } } @@ -979,5 +1096,99 @@ papi_event_handler(int event_set, void *pc, long long ovec, } } +finish: + tool_exit(); hpcrun_safe_exit(); } + + +static void +attribute_metric_to_cct +( + int metric_id, + cct_node_t *cct_node, + long long value +) +{ + metric_data_list_t* metrics = hpcrun_reify_metric_set(cct_node, metric_id); + + hpcrun_metric_std_inc(metric_id, + metrics, + (cct_metric_data_t) {.i = value}); +} + + +static void +attribute_counters(papi_component_info_t *ci, long long *collected_values, cct_node_t *cct_node) +{ + sample_source_t *self = &obj_name(); + int events_codes[MAX_EVENTS]; + int my_events_number = MAX_EVENTS; + int ret; + + // Attribute collected metric to cct nodes + ret = PAPI_list_events(ci->eventSet, events_codes, &my_events_number); + if (ret != PAPI_OK) { + hpcrun_abort("PAPI_list_events failed inside papi_event_handler." 
+ "Return code = %d ==> %s", ret, PAPI_strerror(ret)); + } + + for (int eid = 0; eid < my_events_number; ++eid) { + int event_index = get_event_index(self, events_codes[eid]); + int metric_id = hpcrun_event2metric(self, event_index); + long long int final_counts = collected_values[eid] - ci->prev_values[eid]; + + + blame_shift_apply(metric_id, cct_node, final_counts/*metricIncr*/); + attribute_metric_to_cct(metric_id, cct_node, final_counts); + + PRINT("PAPI_EXIT:: %d Event = %x, event_index = %d, metric_id = %d || value = %lld - %lld == %lld\n", + eid, events_codes[eid], event_index, metric_id, + collected_values[eid], ci->prev_values[eid], + final_counts); + } +} + + +static void +papi_monitor_enter(papi_component_info_t *ci, cct_node_t *cct_node) +{ + tool_enter(); +// PRINT("|------->PAPI_MONITOR_ENTER | cct = %p\n", cct_node); + + // if sampling disabled explicitly for this thread, skip all processing + if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish; + + ci->cct_node = cct_node; + + // Save counts on the end so we could substract that from next call (we don't want to measure ourselves) + + if (ci->inUse) { + ci->read(ci->prev_values); + + PRINT("PAPI_ENTER:: Component %s Event = %d, value = %lld | %p\n", ci->name, ci->eventSet, ci->prev_values[0], cct_node); + } + +finish: + tool_exit(); +} + + +static void +papi_monitor_exit(papi_component_info_t *ci) +{ + tool_enter(); + long long collected_values[MAX_EVENTS]; + + // if sampling disabled explicitly for this thread, skip all processing + if (hpcrun_suppress_sample() || sample_filters_apply()) goto finish; + + if (ci->inUse){ + ci->read(collected_values); + attribute_counters(ci, collected_values, ci->cct_node); + } + + +finish: + tool_exit(); +} diff --git a/src/tool/hpcrun/sample-sources/papi-c.h b/src/tool/hpcrun/sample-sources/papi-c.h index 426a778117..2c125ef129 100644 --- a/src/tool/hpcrun/sample-sources/papi-c.h +++ b/src/tool/hpcrun/sample-sources/papi-c.h @@ -53,6 +53,8 @@ 
#include "papi-c-extended-info.h" +#include "sample_source_obj.h" +#include "cct.h" /****************************************************************************** @@ -60,21 +62,24 @@ *****************************************************************************/ typedef struct { + const char *name; bool inUse; int eventSet; source_state_t state; int some_derived; bool scale_by_thread_count; long long prev_values[MAX_EVENTS]; - bool is_sync; + cct_node_t *cct_node; + bool is_gpu_sync; bool setup_process_only; get_event_set_proc_t get_event_set; add_event_proc_t add_event; finalize_event_set_proc_t finalize_event_set; - start_proc_t sync_start; - stop_proc_t sync_stop; - setup_proc_t sync_setup; - teardown_proc_t sync_teardown; + start_proc_t start; + read_proc_t read; + stop_proc_t stop; + setup_proc_t setup; + teardown_proc_t teardown; } papi_component_info_t; @@ -84,11 +89,10 @@ typedef struct { } papi_source_info_t; - /****************************************************************************** * external declarations *****************************************************************************/ -extern int get_component_event_set(papi_source_info_t *psi, int cidx); +extern int get_component_event_set(papi_component_info_t* ci); #endif // PAPI_C_H diff --git a/src/tool/hpcrun/sample-sources/papi.c b/src/tool/hpcrun/sample-sources/papi.c index 8eed9ddc65..e4514533c6 100644 --- a/src/tool/hpcrun/sample-sources/papi.c +++ b/src/tool/hpcrun/sample-sources/papi.c @@ -96,6 +96,8 @@ #include #include +#include "tool_state.h" + /****************************************************************************** * macros @@ -165,6 +167,7 @@ strip_papi_prefix(const char *str) static void METHOD_FN(init) { + tool_enter(); PAPI_set_debug(0x3ff); // **NOTE: some papi components may start threads, so @@ -201,13 +204,15 @@ METHOD_FN(init) } self->state = INIT; + tool_exit(); } static void METHOD_FN(thread_init) { + tool_enter(); TMSG(PAPI, "thread init"); - if (papi_unavail) { 
return; } + if (papi_unavail) { goto finish; } int retval = PAPI_thread_init(pthread_self); if (retval != PAPI_OK) { @@ -215,13 +220,16 @@ METHOD_FN(thread_init) monitor_real_abort(); } TMSG(PAPI, "thread init OK"); +finish: + tool_exit(); } static void METHOD_FN(thread_init_action) { + tool_enter(); TMSG(PAPI, "register thread"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } int retval = PAPI_register_thread(); if (retval != PAPI_OK) { @@ -229,13 +237,16 @@ METHOD_FN(thread_init_action) monitor_real_abort(); } TMSG(PAPI, "register thread ok"); +finish: + tool_exit(); } static void METHOD_FN(start) { + tool_enter(); TMSG(PAPI, "start"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } thread_data_t *td = hpcrun_get_thread_data(); papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr; @@ -247,7 +258,7 @@ METHOD_FN(start) // state PAPI is in. if (my_state == START) { - return; + goto finish; } TMSG(PAPI,"starting PAPI w event set %d",eventSet); @@ -269,25 +280,33 @@ METHOD_FN(start) } TD_GET(ss_state)[self->sel_idx] = START; + +finish: + tool_exit(); } static void METHOD_FN(thread_fini_action) { - TMSG(PAPI, "unregister thread"); - if (papi_unavail) { return; } + tool_enter(); + TMSG(PAPI, "unregister thread"); + if (papi_unavail) { goto finish; } int retval = PAPI_unregister_thread(); char msg[] = "!!NOT PAPI_OK!! (code = -9999999)\n"; snprintf(msg, sizeof(msg)-1, "!!NOT PAPI_OK!! (code = %d)", retval); TMSG(PAPI, "unregister thread returns %s", retval == PAPI_OK? 
"PAPI_OK" : msg); +finish: + tool_exit(); } static void METHOD_FN(stop) { - TMSG(PAPI, "stop"); - if (papi_unavail) { return; } + tool_enter(); + + TMSG(PAPI, "stop"); + if (papi_unavail) { goto finish; } thread_data_t *td = hpcrun_get_thread_data(); papi_source_info_t *psi = td->ss_info[self->sel_idx].ptr; @@ -297,12 +316,12 @@ METHOD_FN(stop) if (my_state == STOP) { TMSG(PAPI,"--stop called on an already stopped event set %d",eventSet); - return; + goto finish; } if (my_state != START) { TMSG(PAPI,"*WARNING* Stop called on event set that has not been started"); - return; + goto finish; } TMSG(PAPI,"stop w event set = %d",eventSet); @@ -314,18 +333,23 @@ METHOD_FN(stop) } TD_GET(ss_state)[self->sel_idx] = STOP; +finish: + tool_exit(); } static void METHOD_FN(shutdown) { - TMSG(PAPI, "shutdown"); - if (papi_unavail) { return; } + tool_enter(); + TMSG(PAPI, "shutdown"); + if (papi_unavail) { goto finish; } METHOD_CALL(self, stop); // make sure stop has been called PAPI_shutdown(); self->state = UNINIT; +finish: + tool_exit(); } // Return true if PAPI recognizes the name, whether supported or not. 
@@ -333,10 +357,12 @@ METHOD_FN(shutdown) static bool METHOD_FN(supports_event, const char *ev_str) { + tool_enter(); + bool ret; ev_str = strip_papi_prefix(ev_str); TMSG(PAPI, "supports event"); - if (papi_unavail) { return false; } + if (papi_unavail) { ret = false; goto finish; } if (self->state == UNINIT){ METHOD_CALL(self, init); @@ -347,14 +373,19 @@ METHOD_FN(supports_event, const char *ev_str) long th; hpcrun_extract_ev_thresh(ev_str, sizeof(evtmp), evtmp, &th, DEFAULT_THRESHOLD); - return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK; + ret = (PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK); + +finish: + tool_exit(); + return ret; } static void METHOD_FN(process_event_list, int lush_metrics) { + tool_enter(); TMSG(PAPI, "process event list"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } char *event; int i, ret; @@ -456,17 +487,20 @@ METHOD_FN(process_event_list, int lush_metrics) if (! some_overflow) { hpcrun_ssfail_all_derived("PAPI"); } +finish: + tool_exit(); } static void METHOD_FN(gen_event_set,int lush_metrics) { - int i; + tool_enter(); + int i; int ret; int eventSet; TMSG(PAPI, "gen event set"); - if (papi_unavail) { return; } + if (papi_unavail) { goto finish; } int ss_info_size = sizeof(papi_source_info_t); papi_source_info_t *psi = hpcrun_malloc(ss_info_size); @@ -520,11 +554,14 @@ METHOD_FN(gen_event_set,int lush_metrics) } } psi->eventSet= eventSet; +finish: + tool_exit(); } static void METHOD_FN(display_events) { + tool_enter(); PAPI_event_info_t info; char name[200], *prof; int ev, ret, num_total, num_prof; @@ -538,7 +575,7 @@ METHOD_FN(display_events) if (papi_unavail) { printf("PAPI is not available. 
Probably, the kernel doesn't support PAPI,\n" "or else maybe HPCToolkit is out of sync with PAPI.\n\n"); - return; + goto finish; } num_total = 0; @@ -592,8 +629,11 @@ METHOD_FN(display_events) } printf("Total native events: %d\n", num_total); printf("\n"); +finish: + tool_exit(); } + /*************************************************************************** * object ***************************************************************************/ @@ -626,26 +666,35 @@ hpcrun_disable_papi_cuda(void) static int event_is_derived(int ev_code) { - PAPI_event_info_t info; + tool_enter(); + int ret; + PAPI_event_info_t info; // "Is derived" is kind of a bad thing, so if any unexpected failure // occurs, we'll return the "bad" answer. if (PAPI_get_event_info(ev_code, &info) != PAPI_OK || info.derived == NULL) { - return 1; + ret = 1; + goto finish; } if (info.count == 1 || strlen(info.derived) == 0 || strcmp(info.derived, "NOT_DERIVED") == 0 || strcmp(info.derived, "DERIVED_CMPD") == 0) { - return 0; + ret = 0; + goto finish; } - return 1; + ret = 1; + +finish: + tool_exit(); + return ret; } static void event_fatal_error(int ev_code, int papi_ret) { + tool_enter(); char name[1024]; PAPI_event_code_to_name(ev_code, name); @@ -659,12 +708,15 @@ event_fatal_error(int ev_code, int papi_ret) hpcrun_ssfail_conflict("PAPI", name); } hpcrun_ssfail_unsupported("PAPI", name); + + tool_exit(); } static void papi_event_handler(int event_set, void *pc, long long ovec, void *context) { + tool_enter(); sample_source_t *self = &_papi_obj; long long values[MAX_EVENTS]; int my_events[MAX_EVENTS]; @@ -673,14 +725,14 @@ papi_event_handler(int event_set, void *pc, long long ovec, int i, ret; // if sampling disabled explicitly for this thread, skip all processing - if (hpcrun_suppress_sample()) return; + if (hpcrun_suppress_sample()) goto finish; // If the interrupt came from inside our code, then drop the sample // and return and avoid any MSG. if (! 
hpcrun_safe_enter_async(pc)) { hpcrun_stats_num_samples_blocked_async_inc(); - return; + goto finish; } TMSG(PAPI_SAMPLE,"papi event happened, ovec = %ld",ovec); @@ -734,5 +786,7 @@ papi_event_handler(int event_set, void *pc, long long ovec, } } - hpcrun_safe_exit(); +finish: + tool_exit(); + hpcrun_safe_exit(); } diff --git a/src/tool/hpcrun/sample-sources/retcnt.c b/src/tool/hpcrun/sample-sources/retcnt.c index dfe405920a..0a44f44b5b 100644 --- a/src/tool/hpcrun/sample-sources/retcnt.c +++ b/src/tool/hpcrun/sample-sources/retcnt.c @@ -208,6 +208,7 @@ METHOD_FN(display_events) printf("\n"); } + #define ss_name retcnt #define ss_cls SS_SOFTWARE #define ss_sort_order 100 diff --git a/src/tool/hpcrun/sample-sources/ss-list.h b/src/tool/hpcrun/sample-sources/ss-list.h index f9674cb2f1..abd1f112b0 100644 --- a/src/tool/hpcrun/sample-sources/ss-list.h +++ b/src/tool/hpcrun/sample-sources/ss-list.h @@ -88,6 +88,10 @@ SAMPLE_SOURCE_DECL_MACRO(retcnt) SAMPLE_SOURCE_DECL_MACRO(papi_c_cupti) #endif +#ifdef HPCRUN_SS_PAPI_C_ROCM +SAMPLE_SOURCE_DECL_MACRO(papi_c_rocm) +#endif + #ifdef HPCRUN_SS_NVIDIA SAMPLE_SOURCE_DECL_MACRO(nvidia_gpu) #endif @@ -98,6 +102,14 @@ SAMPLE_SOURCE_DECL_MACRO(amd_gpu) #endif #endif +SAMPLE_SOURCE_DECL_MACRO(openmp_gpu) + +#ifdef HPCRUN_SS_AMD +#ifndef HPCRUN_STATIC_LINK +SAMPLE_SOURCE_DECL_MACRO(amd_rocprof) +#endif +#endif + #ifdef HPCRUN_SS_LEVEL0 SAMPLE_SOURCE_DECL_MACRO(level0) #endif diff --git a/src/tool/hpcrun/sample-sources/sync.c b/src/tool/hpcrun/sample-sources/sync.c index b9608f915f..f0c68a2b12 100644 --- a/src/tool/hpcrun/sample-sources/sync.c +++ b/src/tool/hpcrun/sample-sources/sync.c @@ -199,6 +199,7 @@ METHOD_FN(display_events) printf("\n"); } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/tst.c b/src/tool/hpcrun/sample-sources/tst.c index 6b58d21446..54baf04327 100644 
--- a/src/tool/hpcrun/sample-sources/tst.c +++ b/src/tool/hpcrun/sample-sources/tst.c @@ -366,6 +366,7 @@ METHOD_FN(display_events) #endif } + /*************************************************************************** * object ***************************************************************************/ diff --git a/src/tool/hpcrun/sample-sources/upc.c b/src/tool/hpcrun/sample-sources/upc.c index 7bd4cf4b4b..ef12ac7419 100644 --- a/src/tool/hpcrun/sample-sources/upc.c +++ b/src/tool/hpcrun/sample-sources/upc.c @@ -447,6 +447,7 @@ METHOD_FN(display_events) printf("\n"); } + #define ss_name upc #define ss_cls SS_HARDWARE diff --git a/src/tool/hpcrun/sample_event.c b/src/tool/hpcrun/sample_event.c index f5c7b46f30..56fd3732a5 100644 --- a/src/tool/hpcrun/sample_event.c +++ b/src/tool/hpcrun/sample_event.c @@ -244,7 +244,7 @@ hpcrun_sample_callpath(void* context, int metricId, } } } - else { + else { // Partial unwind case cct_bundle_t* cct = &(td->core_profile_trace_data.epoch->csdata); node = record_partial_unwind(cct, td->btbuf_beg, td->btbuf_cur - 1, metricId, metricIncr, skipInner, NULL); @@ -305,7 +305,7 @@ hpcrun_sample_callpath(void* context, int metricId, } hpcrun_clear_handling_sample(td); - if (TD_GET(mem_low) || ENABLED(FLUSH_EVERY_SAMPLE)) { + if (get_mem_low() || ENABLED(FLUSH_EVERY_SAMPLE)) { hpcrun_flush_epochs(&(TD_GET(core_profile_trace_data))); hpcrun_reclaim_freeable_mem(); } @@ -384,7 +384,7 @@ hpcrun_gen_thread_ctxt(void* context) } #endif hpcrun_clear_handling_sample(td); - if (TD_GET(mem_low) || ENABLED(FLUSH_EVERY_SAMPLE)) { + if (get_mem_low() || ENABLED(FLUSH_EVERY_SAMPLE)) { hpcrun_flush_epochs(&(TD_GET(core_profile_trace_data))); hpcrun_reclaim_freeable_mem(); } diff --git a/src/tool/hpcrun/sample_sources_registered.c b/src/tool/hpcrun/sample_sources_registered.c index aa623c5dca..658b40c104 100644 --- a/src/tool/hpcrun/sample_sources_registered.c +++ b/src/tool/hpcrun/sample_sources_registered.c @@ -75,7 +75,6 @@ static sample_source_t* 
registered_sample_sources = NULL; - //------------------------------------------------------------------------------ // interface operations //------------------------------------------------------------------------------ @@ -92,8 +91,6 @@ hpcrun_sample_sources_register(void) } - - //------------------------------------------------------------------------------ // interface operations //------------------------------------------------------------------------------ @@ -146,7 +143,8 @@ hpcrun_registered_sources_init(void) METHOD_CALL(ss, init); TMSG(SS_COMMON, "sample source \"%s\": init", ss->name); } - + + // set user-defined control_knobs for the sample sources control_knob_init(); } diff --git a/src/tool/hpcrun/scripts/hpcrun.in b/src/tool/hpcrun/scripts/hpcrun.in index 491e2b20ab..9b97b4aff3 100644 --- a/src/tool/hpcrun/scripts/hpcrun.in +++ b/src/tool/hpcrun/scripts/hpcrun.in @@ -375,7 +375,10 @@ do CPU_GPU_IDLE* ) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_gpu.so" ;; MPI* ) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_mpi.so" ;; gpu=amd) roctracer_libdir="${roctracer_lib_path}" - export HIP_ENABLE_DEFERRED_LOADING=0;; + export HSA_TOOLS_LIB=librocprofiler64.so.1 + export ROCP_TOOL_LIB=libhpcrun.so + export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml + export ROCP_HSA_INTERCEPT=1;; gpu=opencl) preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_opencl.so" ;; gpu=opencl,inst) gtpin_libdir="${gtpin_lib_path}" @@ -395,6 +398,9 @@ do ;; -L | -l | --list-events ) + export HSA_TOOLS_LIB=librocprofiler64.so.1 + export ROCP_TOOL_LIB=libhpcrun.so + export ROCP_METRICS=@ROCM_PROFILER_LD_DIR@/metrics.xml export HPCRUN_EVENT_LIST=LIST export HPCRUN_LIST_EVENT=1 ;; diff --git a/src/tool/hpcrun/thread_data.c b/src/tool/hpcrun/thread_data.c index bdea4fa984..c1adbe5fbc 100644 --- a/src/tool/hpcrun/thread_data.c +++ b/src/tool/hpcrun/thread_data.c @@ -129,7 +129,6 @@ static pthread_key_t _hpcrun_key; 
static int use_getspecific = 0; static __thread bool mem_pool_initialized = false; - void hpcrun_init_pthread_key ( @@ -291,8 +290,6 @@ hpcrun_thread_init_mem_pool_once } } - - //*************************************************************************** // //*************************************************************************** @@ -385,7 +382,6 @@ hpcrun_thread_data_init size_t n_sources ) { - hpcrun_meminfo_t memstore; thread_data_t* td = hpcrun_get_thread_data(); // ---------------------------------------- @@ -396,12 +392,8 @@ hpcrun_thread_data_init // memstore so we can reuse it in the child after fork. This must // come first. td->inside_hpcrun = 1; - memstore = td->memstore; memset(td, 0xfe, sizeof(thread_data_t)); td->inside_hpcrun = 1; - td->memstore = memstore; - hpcrun_make_memstore(&td->memstore, is_child); - td->mem_low = 0; // ---------------------------------------- // normalized thread id (monitor-generated) diff --git a/src/tool/hpcrun/thread_data.h b/src/tool/hpcrun/thread_data.h index 2874b39400..baa37f6c95 100644 --- a/src/tool/hpcrun/thread_data.h +++ b/src/tool/hpcrun/thread_data.h @@ -172,12 +172,6 @@ typedef struct thread_data_t { int omp_thread; uint64_t last_bar_time_us; - // ---------------------------------------- - // hpcrun_malloc() memory data structures - // ---------------------------------------- - hpcrun_meminfo_t memstore; - int mem_low; - // ---------------------------------------- // sample sources // ---------------------------------------- diff --git a/src/tool/hpcrun/tool_state.c b/src/tool/hpcrun/tool_state.c new file mode 100644 index 0000000000..1a978c6e90 --- /dev/null +++ b/src/tool/hpcrun/tool_state.c @@ -0,0 +1,23 @@ +// +// Created by dejan on 1.7.20.. 
+// + +#include "tool_state.h" + +static __thread int tool_active = false; + + + +void tool_enter(){ + tool_active++; +} + + +void tool_exit(){ + tool_active--; +} + + +bool is_tool_active(){ + return tool_active; +} \ No newline at end of file diff --git a/src/tool/hpcrun/tool_state.h b/src/tool/hpcrun/tool_state.h new file mode 100644 index 0000000000..95bc91f67f --- /dev/null +++ b/src/tool/hpcrun/tool_state.h @@ -0,0 +1,15 @@ +// +// Created by dejan on 1.7.20.. +// + +#ifndef HPCTOOLKIT_TOOL_STATE_H +#define HPCTOOLKIT_TOOL_STATE_H + +#include + + +void tool_enter(); +void tool_exit(); +bool is_tool_active(); + +#endif //HPCTOOLKIT_TOOL_STATE_H diff --git a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in index 7b714659b0..9852d55315 100644 --- a/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in +++ b/src/tool/hpcrun/utilities/bgq-cnk/Makefile.in @@ -312,6 +312,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -349,6 +350,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcserver/Makefile.in b/src/tool/hpcserver/Makefile.in index ea164abf7f..7caa9659b1 100644 --- a/src/tool/hpcserver/Makefile.in +++ b/src/tool/hpcserver/Makefile.in @@ -366,6 +366,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -403,6 +404,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ 
PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcserver/mpi/Makefile.in b/src/tool/hpcserver/mpi/Makefile.in index d4fc024c72..a58be57c8d 100644 --- a/src/tool/hpcserver/mpi/Makefile.in +++ b/src/tool/hpcserver/mpi/Makefile.in @@ -374,6 +374,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -411,6 +412,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpcstruct/Makefile.in b/src/tool/hpcstruct/Makefile.in index 49ad90c9ad..22162293bb 100644 --- a/src/tool/hpcstruct/Makefile.in +++ b/src/tool/hpcstruct/Makefile.in @@ -402,6 +402,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -439,6 +440,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/hpctracedump/Makefile.in b/src/tool/hpctracedump/Makefile.in index 46cf02cc36..f511af86f5 100644 --- a/src/tool/hpctracedump/Makefile.in +++ b/src/tool/hpctracedump/Makefile.in @@ -352,6 +352,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ 
OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -389,6 +390,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/misc/Makefile.in b/src/tool/misc/Makefile.in index acb75a41da..c28239b629 100644 --- a/src/tool/misc/Makefile.in +++ b/src/tool/misc/Makefile.in @@ -307,6 +307,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -344,6 +345,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/src/tool/xprof/Makefile.in b/src/tool/xprof/Makefile.in index 6e11068ad1..410d824678 100644 --- a/src/tool/xprof/Makefile.in +++ b/src/tool/xprof/Makefile.in @@ -373,6 +373,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -410,6 +411,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ diff --git a/tests/Makefile.in b/tests/Makefile.in index dc7d338c90..f42b0a49ec 100644 --- a/tests/Makefile.in +++ b/tests/Makefile.in @@ -539,6 
+539,7 @@ OPT_CUPTI_IFLAGS = @OPT_CUPTI_IFLAGS@ OPT_CUPTI_LDFLAGS = @OPT_CUPTI_LDFLAGS@ OPT_GTPIN = @OPT_GTPIN@ OPT_GTPIN_IFLAGS = @OPT_GTPIN_IFLAGS@ +OPT_GTPIN_LDFLAGS = @OPT_GTPIN_LDFLAGS@ OPT_GTPIN_LIBDIR = @OPT_GTPIN_LIBDIR@ OPT_IGC = @OPT_IGC@ OPT_IGC_IFLAGS = @OPT_IGC_IFLAGS@ @@ -576,6 +577,7 @@ PERFMON_LDFLAGS_STAT = @PERFMON_LDFLAGS_STAT@ PERFMON_LIB = @PERFMON_LIB@ PERF_EVENT_PARANOID = @PERF_EVENT_PARANOID@ RANLIB = @RANLIB@ +ROCM_PROFILER_LD_DIR = @ROCM_PROFILER_LD_DIR@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@