diff --git a/.gitignore b/.gitignore
index 9a8cb0a..beb6022 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,3 +45,6 @@ libtool
 # Debug files
 *.dSYM/
 *.su
+
+# Editor files
+*.swp
diff --git a/Makefile.am b/Makefile.am
index a66dfda..61e9a2d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,13 +20,18 @@ EXTRA_DIST = autogen.sh
 include_HEADERS = include/gdsync.h
 libgdsyncincludedir = $(includedir)/gdsync
-libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h
+libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h

 src_libgdsync_la_CFLAGS = $(AM_CFLAGS)
 src_libgdsync_la_SOURCES = src/gdsync.cpp src/memmgr.cpp src/mem.cpp src/objs.cpp src/apis.cpp src/mlx5.cpp include/gdsync.h
 src_libgdsync_la_LDFLAGS = -version-info @VERSION_INFO@
-noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h
+noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h
+
+if COMPILE_EXP_VERBS
+src_libgdsync_la_SOURCES += src/transports/mlx5-exp/mlx5-exp.cpp
+noinst_HEADERS += src/transports/mlx5-exp/mlx5-exp.hpp
+endif

 # if enabled at configure time
@@ -36,7 +41,7 @@ bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loop
 noinst_PROGRAMS = tests/rstest tests/wqtest

 tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp
-tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
+tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la $(MPILDFLAGS) $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)

 tests_rstest_SOURCES = tests/rstest.cpp
 tests_rstest_LDADD =
@@ -45,10 +50,10 @@ tests_wqtest_SOURCES = tests/task_queue_test.cpp
 tests_wqtest_LDADD = $(PTHREAD_LIBS)

 tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu
-tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
+tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(MPILDFLAGS) $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)

 tests_gds_sanity_SOURCES = tests/gds_sanity.cpp tests/gpu.cpp tests/gpu_kernels.cu
-tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
+tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(MPILDFLAGS) $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)

 tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu
 tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
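The new COMPILE_EXP_VERBS block above moves the exp-verbs-specific code into src/transports/mlx5-exp/, behind the transport object that the rest of this patch calls through `gds_main_transport`. transport.hpp itself is not part of this diff; the sketch below reconstructs the interface from the call sites in src/apis.cpp and src/gdsync.cpp, so the member names come from the patch but the function-pointer layout and exact signatures are assumptions.

```cpp
// Hypothetical reconstruction of src/transport.hpp (not in this diff).
#include <gdsync.h>

struct gds_peer;                  // internal, defined in src/objs.hpp
struct gds_peer_attr;

typedef struct gds_transport {
        void (*init_send_info)(gds_send_request_t *info);
        void (*init_wait_request)(gds_wait_request_t *request, uint32_t offset);
        int (*create_qp)(struct ibv_pd *pd, struct ibv_context *context,
                         gds_qp_init_attr_t *qp_attr, struct gds_peer *peer,
                         struct gds_peer_attr *peer_attr, int flags,
                         gds_qp_t **gqp);
        int (*destroy_qp)(gds_qp_t *gqp);
        int (*prepare_send)(gds_qp_t *gqp, gds_send_wr *p_ewr,
                            gds_send_wr **bad_ewr, gds_send_request_t *request);
        int (*rollback_qp)(gds_qp_t *gqp, gds_send_request_t *send_info);
        int (*prepare_wait_cq)(gds_cq_t *cq, gds_wait_request_t *request, int flags);
        int (*append_wait_cq)(gds_wait_request_t *request, uint32_t *dw, uint32_t val);
        int (*abort_wait_cq)(gds_cq_t *cq, gds_wait_request_t *request);
        int (*get_num_send_request_entries)(gds_send_request_t *request);
        int (*get_num_wait_request_entries)(gds_wait_request_t *request);
        // ... plus the post_*_ops/descriptor and dump helpers used below
} gds_transport_t;

extern gds_transport_t *gds_main_transport;
int gds_transport_init();         // lazy one-time transport selection
```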
diff --git a/configure.ac b/configure.ac
index a79aed6..6f4449c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -93,25 +93,54 @@ else
     AC_SUBST(LIBGDSTOOLS)
 fi

-AC_ARG_WITH([mpi],
-    AC_HELP_STRING([--with-mpi], [ Set path to mpi installation ]))
-if test x$with_mpi = x || test x$with_mpi = xno; then
+AC_ARG_WITH([spectrum-mpi],
+    AC_HELP_STRING([--with-spectrum-mpi], [ Set path to Spectrum MPI installation ]))
+if test x$with_spectrum_mpi = x || test x$with_spectrum_mpi = xno; then
     # assuming system location
     mpi_home=/usr
-    MPICC=$with_home/bin/mpicc
-    MPICXX=$with_home/bin/mpic++
+    MPICC=/bin/mpicc
+    MPICXX=/bin/mpic++
+    MPILDFLAGS="-lmpi_ibm"
 else
-    if test -d $with_mpi; then
-        mpi_home=$with_mpi
+    if test -d $with_spectrum_mpi; then
+        mpi_home=$with_spectrum_mpi
         MPICC=${mpi_home}/bin/mpicc
         MPICXX=${mpi_home}/bin/mpic++
         CPPFLAGS="$CPPFLAGS -I${mpi_home}/include"
         LDFLAGS="$LDFLAGS -L${mpi_home}/lib -L${mpi_home}/lib64"
+        MPILDFLAGS="-lmpi_ibm"
     else
         echo "MPI dir does not exist"
     fi
 fi

+AC_ARG_WITH([mpi],
+    AC_HELP_STRING([--with-mpi], [ Set path to MPI installation ]))
+if test x$with_spectrum_mpi = x || test x$with_spectrum_mpi = xno; then
+    if test x$with_mpi = x || test x$with_mpi = xno; then
+        # assuming system location
+        mpi_home=/usr
+        MPICC=/bin/mpicc
+        MPICXX=/bin/mpic++
+        MPILDFLAGS="-lmpi"
+    else
+        if test -d $with_mpi; then
+            mpi_home=$with_mpi
+            MPICC=${mpi_home}/bin/mpicc
+            MPICXX=${mpi_home}/bin/mpic++
+            CPPFLAGS="$CPPFLAGS -I${mpi_home}/include"
+            LDFLAGS="$LDFLAGS -L${mpi_home}/lib -L${mpi_home}/lib64"
+            MPILDFLAGS="-lmpi"
+        else
+            echo "MPI dir does not exist"
+        fi
+    fi
+fi
+
+if test x$with_spectrum_mpi != x && test x$with_spectrum_mpi != xno && test x$with_mpi != x && test x$with_mpi != xno; then
+    AC_MSG_ERROR([--with-mpi and --with-spectrum-mpi are mutually exclusive.])
+fi
+
 dnl Specify CUDA Location
 AC_ARG_WITH(cuda-toolkit,
     AC_HELP_STRING([--with-cuda-toolkit=CUDATKDIR], [ Specify CUDA toolkit installation directory (default: /usr/local/cuda)]),
@@ -161,11 +190,21 @@ dnl Checks for Verbs support
 AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
     AC_MSG_ERROR([ibv_get_device_list() not found. libgdsync requires libibverbs.]))

-AC_CHECK_LIB(ibverbs, ibv_exp_create_qp,
-    AC_MSG_ERROR([ibv_exp_create_qp not found. libgdsync requires verbs extension support.]))
+dnl ibv_exp_create_qp is an inline function. So, we check for exp_cmd instead.
+AC_CHECK_LIB(ibverbs, ibv_exp_cmd_create_qp, [have_exp_verbs=1])
+AC_CHECK_HEADER([infiniband/peer_ops.h], [have_peer_ops=1], [],
+[[
+#include <infiniband/verbs.h>
+]])
+
+if test "x$have_exp_verbs" != "x" && test "x$have_peer_ops" != "x"; then
+    AC_DEFINE([HAVE_EXP_VERBS], [1], [Define if exp-verbs exists.])
+    enable_exp_verbs=1
+else
+    AC_MSG_WARN([This version of libgdsync cannot be used without exp-verbs.])
+fi
+AM_CONDITIONAL([COMPILE_EXP_VERBS], [test "x$enable_exp_verbs" != "x"])

-AC_CHECK_HEADER(infiniband/peer_ops.h, [],
-    AC_MSG_ERROR([<infiniband/peer_ops.h> not found. libgdsync requires verbs peer-direct support.]))
 AC_HEADER_STDC

 dnl Checks for typedefs, structures, and compiler characteristics.
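Configure now degrades to a warning instead of a hard error when exp-verbs is missing, and records the result both as a preprocessor define (HAVE_EXP_VERBS) and an automake conditional (COMPILE_EXP_VERBS). The natural consumer of the define is the lazy `gds_transport_init()` that the later hunks call before every transport access; its body is not in this diff, so the sketch below, including the `gds_transport_mlx5_exp_init` name, is an assumption about how the define might be consumed.

```cpp
// Assumed shape of gds_transport_init(); the real body is not in this diff.
#include <errno.h>
#include "transport.hpp"

// Hypothetical registration hook exported by src/transports/mlx5-exp.
int gds_transport_mlx5_exp_init(gds_transport_t **transport);

int gds_transport_init()
{
        if (gds_main_transport)   // a transport is already registered
                return 0;
#ifdef HAVE_EXP_VERBS
        return gds_transport_mlx5_exp_init(&gds_main_transport);
#else
        // configure only warned; fail at run time if no transport was built
        return ENOSYS;
#endif
}
```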
@@ -186,6 +225,7 @@ AC_MSG_NOTICE([Setting MPI_PATH = ${mpi_home} ])
 AC_SUBST( MPI_PATH, [${mpi_home} ])
 AC_SUBST( MPICC, [${MPICC} ])
 AC_SUBST( MPICXX, [${MPICXX} ])
+AC_SUBST( MPILDFLAGS, [${MPILDFLAGS} ])

 CPPFLAGS="$CPPFLAGS -I$CUDA_DRV_PATH/include -I$CUDA_PATH/include"
 LDFLAGS="$LDFLAGS -L$CUDA_DRV_PATH/lib64 -L$CUDA_DRV_PATH/lib -L$CUDA_PATH/lib64 -L$CUDA_PATH/lib"
diff --git a/include/gdsync.h b/include/gdsync.h
index 7d6a45b..f2ed858 100644
--- a/include/gdsync.h
+++ b/include/gdsync.h
@@ -33,8 +33,6 @@
  */

 #include <cuda.h>
-#include <infiniband/verbs_exp.h>
-#include <infiniband/peer_ops.h>

 #include <gdsync/core.h>
 #include <gdsync/tools.h>
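With exp-verbs gone from the public header, `gds_qp_init_attr_t` and `gds_send_wr` fall back to plain ibv types and, in the core.h hunks that follow, the embedded `send_cq`/`recv_cq` members become pointers. Existing applications mostly just recompile, but CQ accesses change spelling; a minimal before/after sketch (pd/context/attr setup elided, as in the tests):

```cpp
// Caller-side impact of the core.h changes below (sketch, not from the diff).
#include <errno.h>
#include <gdsync.h>

int post_wait_example(struct ibv_pd *pd, struct ibv_context *ctx,
                      gds_qp_init_attr_t *attr, int gpu_id)
{
        gds_qp_t *qp = gds_create_qp(pd, ctx, attr, gpu_id, GDS_CREATE_QP_DEFAULT);
        if (!qp)
                return EINVAL;

        gds_wait_request_t wreq;
        // before this patch: gds_prepare_wait_cq(&qp->send_cq, &wreq, 0);
        int ret = gds_prepare_wait_cq(qp->send_cq, &wreq, 0);

        gds_destroy_qp(qp);
        return ret;
}
```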
diff --git a/include/gdsync/core.h b/include/gdsync/core.h
index 7ff0cbb..e500c93 100644
--- a/include/gdsync/core.h
+++ b/include/gdsync/core.h
@@ -40,35 +40,34 @@
         ((((v) & 0x0000ffffU) >> 0 ) >= (unsigned)GDS_API_MINOR_VERSION) )

 typedef enum gds_param {
-    GDS_PARAM_VERSION,
-    GDS_NUM_PARAMS
+        GDS_PARAM_VERSION,
+        GDS_NUM_PARAMS
 } gds_param_t;

 int gds_query_param(gds_param_t param, int *value);

 enum gds_create_qp_flags {
-    GDS_CREATE_QP_DEFAULT = 0,
-    GDS_CREATE_QP_WQ_ON_GPU = 1<<0,
-    GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1,
-    GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2,
-    GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5,
+        GDS_CREATE_QP_DEFAULT = 0,
+        GDS_CREATE_QP_WQ_ON_GPU = 1<<0,
+        GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1,
+        GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2,
+        GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5,
 };

-typedef struct ibv_exp_qp_init_attr gds_qp_init_attr_t;
-typedef struct ibv_exp_send_wr gds_send_wr;
+typedef struct ibv_qp_init_attr gds_qp_init_attr_t;
+typedef struct ibv_send_wr gds_send_wr;

-struct gds_cq {
+typedef struct gds_cq {
         struct ibv_cq *cq;
         uint32_t curr_offset;
-};
+} gds_cq_t;

-struct gds_qp {
+typedef struct gds_qp {
         struct ibv_qp *qp;
-        struct gds_cq send_cq;
-        struct gds_cq recv_cq;
-        struct ibv_exp_res_domain * res_domain;
+        struct gds_cq *send_cq;
+        struct gds_cq *recv_cq;
         struct ibv_context *dev_context;
-};
+} gds_qp_t;

 /* \brief: Create a peer-enabled QP attached to the specified GPU id.
  *
@@ -153,9 +152,11 @@ enum {
  */

 typedef struct gds_send_request {
-        struct ibv_exp_peer_commit commit;
-        struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS];
+        uint8_t reserved0[32];
+        uint8_t reserved1[56 * GDS_SEND_INFO_MAX_OPS];
+        uint8_t pad0[32];
 } gds_send_request_t;
+static_assert(sizeof(gds_send_request_t) % 64 == 0, "The size of gds_send_request_t must be a multiple of 64 bytes.");

 int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request);
 int gds_stream_post_send(CUstream stream, gds_send_request_t *request);
@@ -167,9 +168,11 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req
  */

 typedef struct gds_wait_request {
-        struct ibv_exp_peer_peek peek;
-        struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS];
+        uint8_t reserved0[40];
+        uint8_t reserved1[56 * GDS_WAIT_INFO_MAX_OPS];
+        uint8_t pad0[24];
 } gds_wait_request_t;
+static_assert(sizeof(gds_wait_request_t) % 64 == 0, "The size of gds_wait_request_t must be a multiple of 64 bytes.");

 /**
  * Initializes a wait request out of the next heading CQE, which is kept in
diff --git a/src/apis.cpp b/src/apis.cpp
index cd532d7..cf9008c 100644
--- a/src/apis.cpp
+++ b/src/apis.cpp
@@ -40,7 +40,6 @@
 //using namespace std;

 //#include
-//#include <infiniband/verbs_exp.h>
 //#include

 #include "gdsync.h"
@@ -51,17 +50,7 @@
 #include "utils.hpp"
 #include "archutils.h"
 #include "mlnxutils.h"
-
-
-//-----------------------------------------------------------------------------
-
-static void gds_init_ops(struct peer_op_wr *op, int count)
-{
-        int i = count;
-        while (--i)
-                op[i-1].next = &op[i];
-        op[count-1].next = NULL;
-}
+#include "transport.hpp"

 //-----------------------------------------------------------------------------

@@ -70,9 +59,7 @@ static void gds_init_send_info(gds_send_request_t *info)
 {
         gds_dbg("send_request=%p\n", info);
         memset(info, 0, sizeof(*info));
-        info->commit.storage = info->wr;
-        info->commit.entries = sizeof(info->wr)/sizeof(info->wr[0]);
-        gds_init_ops(info->commit.storage, info->commit.entries);
+        gds_main_transport->init_send_info(info);
 }

 //-----------------------------------------------------------------------------

@@ -81,46 +68,18 @@ static void gds_init_wait_request(gds_wait_request_t *request, uint32_t offset)
 {
         gds_dbg("wait_request=%p offset=%08x\n", request, offset);
         memset(request, 0, sizeof(*request));
-        request->peek.storage = request->wr;
-        request->peek.entries = sizeof(request->wr)/sizeof(request->wr[0]);
-        request->peek.whence = IBV_EXP_PEER_PEEK_ABSOLUTE;
-        request->peek.offset = offset;
-        gds_init_ops(request->peek.storage, request->peek.entries);
+
+        gds_main_transport->init_wait_request(request, offset);
 }

 //-----------------------------------------------------------------------------

-static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t * send_info, enum ibv_exp_rollback_flags flag)
+static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t *send_info)
 {
-        struct ibv_exp_rollback_ctx rollback;
-        int ret=0;
-
         assert(qp);
-        assert(qp->qp);
         assert(send_info);
-        if(
-                flag != IBV_EXP_ROLLBACK_ABORT_UNCOMMITED &&
-                flag != IBV_EXP_ROLLBACK_ABORT_LATE
-          )
-        {
-                gds_err("erroneous ibv_exp_rollback_flags flag input value\n");
-                ret=EINVAL;
-                goto out;
-        }
-
-        /* from ibv_exp_peer_commit call */
-        rollback.rollback_id = send_info->commit.rollback_id;
-        /* from ibv_exp_rollback_flag */
-        rollback.flags = flag;
-        /* Reserved for future expensions, must be 0 */
-        rollback.comp_mask = 0;
-        gds_warn("Need to rollback WQE %lx\n", rollback.rollback_id);
-        ret = ibv_exp_rollback_qp(qp->qp,
&rollback); - if(ret) - gds_err("error %d in ibv_exp_rollback_qp\n", ret); -out: - return ret; + return gds_main_transport->rollback_qp(qp, send_info); } //----------------------------------------------------------------------------- @@ -138,7 +97,7 @@ int gds_post_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr) ret = gds_post_pokes_on_cpu(1, &send_info, NULL, 0); if (ret) { gds_err("error %d in gds_post_pokes_on_cpu\n", ret); - ret_roll = gds_rollback_qp(qp, &send_info, IBV_EXP_ROLLBACK_ABORT_LATE); + ret_roll = gds_rollback_qp(qp, &send_info); if (ret_roll) { gds_err("error %d in gds_rollback_qp\n", ret_roll); } @@ -171,33 +130,20 @@ int gds_post_recv(struct gds_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr //----------------------------------------------------------------------------- -int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, +int gds_prepare_send(struct gds_qp *gqp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request) { int ret = 0; + gds_init_send_info(request); - assert(qp); - assert(qp->qp); - ret = ibv_exp_post_send(qp->qp, p_ewr, bad_ewr); - if (ret) { + assert(gqp); + assert(gqp->qp); + + ret = gds_main_transport->prepare_send(gqp, p_ewr, bad_ewr, request); + if (ret) + gds_err("Error %d in prepare_send.\n", ret); - if (ret == ENOMEM) { - // out of space error can happen too often to report - gds_dbg("ENOMEM error %d in ibv_exp_post_send\n", ret); - } else { - gds_err("error %d in ibv_exp_post_send\n", ret); - } - goto out; - } - - ret = ibv_exp_peer_commit_qp(qp->qp, &request->commit); - if (ret) { - gds_err("error %d in ibv_exp_peer_commit_qp\n", ret); - //gds_wait_kernel(); - goto out; - } -out: return ret; } @@ -281,7 +227,6 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags) { - int retcode = 0; if (flags != 0) { gds_err("invalid flags != 0\n"); return EINVAL; @@ -289,50 +234,22 @@ int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flag gds_init_wait_request(request, cq->curr_offset++); - retcode = ibv_exp_peer_peek_cq(cq->cq, &request->peek); - if (retcode == -ENOSPC) { - // TODO: handle too few entries - gds_err("not enough ops in peer_peek_cq\n"); - goto out; - } else if (retcode) { - gds_err("error %d in peer_peek_cq\n", retcode); - goto out; - } - //gds_dump_wait_request(request, 1); - out: - return retcode; + return gds_main_transport->prepare_wait_cq(cq, request, flags); } //----------------------------------------------------------------------------- int gds_append_wait_cq(gds_wait_request_t *request, uint32_t *dw, uint32_t val) { - int ret = 0; - unsigned MAX_NUM_ENTRIES = sizeof(request->wr)/sizeof(request->wr[0]); - unsigned n = request->peek.entries; - struct peer_op_wr *wr = request->peek.storage; - - if (n + 1 > MAX_NUM_ENTRIES) { - gds_err("no space left to stuff a poke\n"); - ret = ENOMEM; - goto out; + int ret = gds_transport_init(); + if (ret) { + gds_err("error in gds_transport_init\n"); + goto out; } - // at least 1 op - assert(n); - assert(wr); - - for (; n; --n) wr = wr->next; - assert(wr); + ret = gds_main_transport->append_wait_cq(request, dw, val); - wr->type = IBV_EXP_PEER_OP_STORE_DWORD; - wr->wr.dword_va.data = val; - wr->wr.dword_va.target_id = 0; // direct mapping, offset IS the address - wr->wr.dword_va.offset = (ptrdiff_t)(dw-(uint32_t*)0); - - ++request->peek.entries; - - out: +out: return ret; } @@ -356,10 +273,8 @@ static int 
gds_abort_wait_cq(struct gds_cq *cq, gds_wait_request_t *request) { assert(cq); assert(request); - struct ibv_exp_peer_abort_peek abort_ctx; - abort_ctx.peek_id = request->peek.peek_id; - abort_ctx.comp_mask = 0; - return ibv_exp_peer_abort_peek_cq(cq->cq, &abort_ctx); + + return gds_main_transport->abort_wait_cq(cq, request); } //----------------------------------------------------------------------------- @@ -550,14 +465,21 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem int ret = 0; n_mem_ops = 0; size_t i; - for(i = 0; i < n_descs; ++i) { + + ret = gds_transport_init(); + if (ret) { + gds_err("error in gds_transport_init\n"); + goto out; + } + + for (i = 0; i < n_descs; ++i) { gds_descriptor_t *desc = descs + i; - switch(desc->tag) { + switch (desc->tag) { case GDS_TAG_SEND: - n_mem_ops += desc->send->commit.entries + 2; // extra space, ugly + n_mem_ops += gds_main_transport->get_num_send_request_entries(desc->send) + 2; // extra space, ugly break; case GDS_TAG_WAIT: - n_mem_ops += desc->wait->peek.entries + 2; // ditto + n_mem_ops += gds_main_transport->get_num_wait_request_entries(desc->wait) + 2; // ditto break; case GDS_TAG_WAIT_VALUE32: case GDS_TAG_WRITE_VALUE32: @@ -569,6 +491,8 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem ret = EINVAL; } } + +out: return ret; } @@ -585,6 +509,11 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ gds_peer *peer = NULL; gds_op_list_t params; + ret = gds_transport_init(); + if (ret) { + gds_err("error in gds_transport_init\n"); + goto out; + } ret = calc_n_mem_ops(n_descs, descs, n_mem_ops); if (ret) { @@ -612,12 +541,11 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ return EINVAL; } - for(i = 0; i < n_descs; ++i) { + for (i = 0; i < n_descs; ++i) { gds_descriptor_t *desc = descs + i; - switch(desc->tag) { + switch (desc->tag) { case GDS_TAG_SEND: { - gds_send_request_t *sreq = desc->send; - retcode = gds_post_ops(peer, sreq->commit.entries, sreq->commit.storage, params); + retcode = gds_main_transport->post_send_ops(peer, desc->send, params); if (retcode) { gds_err("error %d in gds_post_ops\n", retcode); ret = retcode; @@ -626,15 +554,14 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ break; } case GDS_TAG_WAIT: { - gds_wait_request_t *wreq = desc->wait; int flags = 0; if (move_flush && i != last_wait) { gds_dbg("discarding FLUSH!\n"); flags = GDS_POST_OPS_DISCARD_WAIT_FLUSH; } - retcode = gds_post_ops(peer, wreq->peek.entries, wreq->peek.storage, params, flags); + retcode = gds_main_transport->stream_post_wait_descriptor(peer, desc->wait, params, flags); if (retcode) { - gds_err("error %d in gds_post_ops\n", retcode); + gds_err("error %d in stream_post_wait_descriptor\n", retcode); ret = retcode; goto out; } @@ -689,13 +616,19 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) size_t i; int ret = 0; int retcode = 0; + + ret = gds_transport_init(); + if (ret) { + gds_err("error in gds_transport_init\n"); + goto out; + } + for(i = 0; i < n_descs; ++i) { gds_descriptor_t *desc = descs + i; switch(desc->tag) { case GDS_TAG_SEND: { gds_dbg("desc[%zu] SEND\n", i); - gds_send_request_t *sreq = desc->send; - retcode = gds_post_ops_on_cpu(sreq->commit.entries, sreq->commit.storage, flags); + retcode = gds_main_transport->post_send_ops_on_cpu(desc->send, flags); if (retcode) { gds_err("error %d in gds_post_ops_on_cpu\n", retcode); ret = retcode; @@ 
-705,10 +638,9 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) } case GDS_TAG_WAIT: { gds_dbg("desc[%zu] WAIT\n", i); - gds_wait_request_t *wreq = desc->wait; - retcode = gds_post_ops_on_cpu(wreq->peek.entries, wreq->peek.storage, flags); + retcode = gds_main_transport->post_wait_descriptor(desc->wait, flags); if (retcode) { - gds_err("error %d in gds_post_ops_on_cpu\n", retcode); + gds_err("error %d in post_wait_descriptor\n", retcode); ret = retcode; goto out; } diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 90d5508..84a9b05 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -43,6 +43,11 @@ #include "archutils.h" #include "mlnxutils.h" #include "task_queue.hpp" +#include "transport.hpp" + +//----------------------------------------------------------------------------- + +gds_transport_t *gds_main_transport = NULL; //----------------------------------------------------------------------------- @@ -92,10 +97,6 @@ int gds_flusher_enabled() #define CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING 0x1 #endif -// TODO: use correct value -// TODO: make it dependent upon the particular GPU -const size_t GDS_GPU_MAX_INLINE_SIZE = 256; - //----------------------------------------------------------------------------- // Note: these are default overrides, i.e. allow to disable/enable the features @@ -172,7 +173,7 @@ static bool gds_enable_inlcpy() } // simulate 64-bits writes with inlcpy -static bool gds_simulate_write64() +bool gds_simulate_write64() { static int gds_simulate_write64 = -1; if (-1 == gds_simulate_write64) { @@ -348,7 +349,7 @@ int gds_fill_membar(gds_peer *peer, gds_op_list_t &ops, int flags) //----------------------------------------------------------------------------- -static int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags) +int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags) { int retcode = 0; #if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_MEMORY @@ -409,7 +410,7 @@ int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, void *ptr, const void *d //----------------------------------------------------------------------------- -static void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) +void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) { #if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_MEMORY assert(param->operation == CU_STREAM_MEM_OP_WRITE_MEMORY); @@ -419,7 +420,7 @@ static void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) //----------------------------------------------------------------------------- -static int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint32_t value, int flags) +int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint32_t value, int flags) { int retcode = 0; CUdeviceptr dev_ptr = addr; @@ -466,7 +467,7 @@ int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, uint32_t *ptr, uint32_t va //----------------------------------------------------------------------------- -static int gds_fill_poke64(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags) +int gds_fill_poke64(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags) { int retcode = 0; #if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_VALUE_64 @@ -580,7 +581,7 @@ unsigned poll_checker::m_global_index = 0; //----------------------------------------------------------------------------- -static int gds_fill_poll(gds_peer *peer, 
gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags) +int gds_fill_poll(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags) { int retcode = 0; const char *cond_str = NULL; @@ -717,270 +718,6 @@ int gds_stream_batch_ops(gds_peer *peer, CUstream stream, gds_op_list_t &ops, in //----------------------------------------------------------------------------- -/* - A) plain+membar: - WR32 - MEMBAR - WR32 - WR32 - - B) plain: - WR32 - WR32+PREBARRIER - WR32 - - C) sim64+membar: - WR32 - MEMBAR - INLCPY 8B - - D) sim64: - INLCPY 4B + POSTBARRIER - INLCPY 8B - - E) inlcpy+membar: - WR32 - MEMBAR - INLCPY XB - - F) inlcpy: - INLCPY 4B + POSTBARRIER - INLCPY 128B -*/ - -int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t &ops, int post_flags) -{ - int retcode = 0; - size_t n = 0; - bool prev_was_fence = false; - bool use_inlcpy_for_dword = false; - //size_t n_ops = ops.size(); - CUstreamBatchMemOpParams param; - - gds_dbg("n_ops=%zu\n", n_ops); - - if (!peer->has_memops) { - gds_err("CUDA MemOps are required\n"); - return EINVAL; - } - - // divert the request to the same engine handling 64bits - // to avoid out-of-order execution - // caveat: can't use membar if inlcpy is used for 4B writes (to simulate 8B writes) - if (peer->has_inlcpy) { - if (!peer->has_membar) - use_inlcpy_for_dword = true; // F - } - if (gds_simulate_write64()) { - if (!peer->has_membar) { - gds_warn_once("enabling use_inlcpy_for_dword\n"); - use_inlcpy_for_dword = true; // D - } - } - - for (; op && n < n_ops; op = op->next, ++n) { - //int flags = 0; - gds_dbg("op[%zu] type:%08x\n", n, op->type); - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); - - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { - gds_dbg("nothing to do for read fences\n"); - //retcode = EINVAL; - break; - } - else { - if (!peer->has_membar) { - if (use_inlcpy_for_dword) { - assert(ops.size() > 0); - gds_dbg("patching previous param\n"); - gds_enable_barrier_for_inlcpy(&ops.back()); - } - else { - gds_dbg("recording fence event\n"); - prev_was_fence = true; - } - //retcode = 0; - } - else { - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from fence\n"); - retcode = EINVAL; - break; - } - int flags = 0; - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { - gds_dbg("using light membar\n"); - flags = GDS_MEMBAR_DEFAULT | GDS_MEMBAR_MLX5; - } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { - gds_dbg("using heavy membar\n"); - flags = GDS_MEMBAR_SYS | GDS_MEMBAR_MLX5; - } - else { - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - retcode = gds_fill_membar(peer, ops, flags); - } - } - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - int flags = 0; - gds_dbg("OP_STORE_DWORD dev_ptr=%llx data=%" PRIx32 "\n", dev_ptr, data); - if (use_inlcpy_for_dword) { // F || D - // membar may be out of order WRT inlcpy - if (peer->has_membar) { - gds_err("invalid feature combination, 
inlcpy + membar\n"); - retcode = EINVAL; - break; - } - // tail flush is set when following fence is met - // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; - retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); - } - else { // A || B || C || E - // can't guarantee ordering of write32+inlcpy unless - // a membar is there - // TODO: fix driver when !weak - if (peer->has_inlcpy && !peer->has_membar) { - gds_err("invalid feature combination, inlcpy needs membar\n"); - retcode = EINVAL; - break; - } - if (prev_was_fence) { - gds_dbg("using PRE_BARRIER as fence\n"); - flags |= GDS_WRITE_PRE_BARRIER; - prev_was_fence = false; - } - retcode = gds_fill_poke(peer, ops, dev_ptr, data, flags); - } - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - uint64_t data = op->wr.qword_va.data; - int flags = 0; - gds_dbg("OP_STORE_QWORD dev_ptr=%llx data=%" PRIx64 "\n", dev_ptr, data); - // C || D - if (gds_simulate_write64()) { - // simulate 64-bit poke by inline copy - if (!peer->has_membar) { - gds_err("invalid feature combination, inlcpy needs membar\n"); - retcode = EINVAL; - break; - } - - // tail flush is never useful here - //flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; - retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); - } - else if (peer->has_write64) { - retcode = gds_fill_poke64(peer, ops, dev_ptr, data, flags); - } - else { - uint32_t datalo = gds_qword_lo(op->wr.qword_va.data); - uint32_t datahi = gds_qword_hi(op->wr.qword_va.data); - - if (prev_was_fence) { - gds_dbg("enabling PRE_BARRIER\n"); - flags |= GDS_WRITE_PRE_BARRIER; - prev_was_fence = false; - } - retcode = gds_fill_poke(peer, ops, dev_ptr, datalo, flags); - - // get rid of the barrier, if there - flags &= ~GDS_WRITE_PRE_BARRIER; - - // advance to next DWORD - dev_ptr += sizeof(uint32_t); - retcode = gds_fill_poke(peer, ops, dev_ptr, datahi, flags); - } - - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - size_t len = op->wr.copy_op.len; - void *src = op->wr.copy_op.src; - int flags = 0; - gds_dbg("OP_COPY_BLOCK dev_ptr=%llx src=%p len=%zu\n", dev_ptr, src, len); - // catching any other size here - if (!peer->has_inlcpy) { - gds_err("inline copy is not supported\n"); - retcode = EINVAL; - break; - } - // IB Verbs bug - assert(len <= GDS_GPU_MAX_INLINE_SIZE); - //if (desc->need_flush) { - // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; - //} - retcode = gds_fill_inlcpy(peer, ops, dev_ptr, src, len, flags); - break; - } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - int poll_cond; - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - // TODO: properly handle a following fence instead of blidly flushing - int flags = 0; - if (!(post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH)) - flags |= GDS_WAIT_POST_FLUSH_REMOTE; - - gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%" PRIx32 " type=%" PRIx32 "\n", dev_ptr, data, (uint32_t)op->type); - - switch(op->type) { - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: - poll_cond = GDS_WAIT_COND_NOR; - break; - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - poll_cond = GDS_WAIT_COND_GEQ; - break; - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - poll_cond = GDS_WAIT_COND_AND; - break; - default: - assert(!"cannot happen"); - retcode = EINVAL; - goto out; - } - 
retcode = gds_fill_poll(peer, ops, dev_ptr, data, poll_cond, flags); - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - if (retcode) { - gds_err("error in fill func at entry n=%zu\n", n); - goto out; - } - } - - assert(n_ops == n); - -out: - return retcode; -} - -//----------------------------------------------------------------------------- - int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_t *dw, uint32_t val) { int retcode = 0; @@ -997,7 +734,7 @@ int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_ for (int j=0; jpost_send_ops(peer, &info[j], ops); if (retcode) { goto out; } @@ -1024,124 +761,6 @@ int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_ //----------------------------------------------------------------------------- -int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) -{ - int retcode = 0; - size_t n = 0; - gds_dbg("n_ops=%zu op=%p post_flags=0x%x\n", n_ops, op, post_flags); - for (; op && n < n_ops; op = op->next, ++n) { - //int flags = 0; - gds_dbg("op[%zu]=%p\n", n, op); - //gds_dbg("op[%zu]=%p type:%08x\n", n, op, op->type); - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); - - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { - gds_warnc(1, "nothing to do for read fences\n"); - //retcode = EINVAL; - break; - } - else { - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); - retcode = EINVAL; - break; - } - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { - gds_dbg("using light membar\n"); - wmb(); - } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { - gds_dbg("using heavy membar\n"); - wmb(); - } - else { - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - } - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); - uint32_t data = op->wr.dword_va.data; - // A || B || C || E - gds_dbg("STORE_DWORD ptr=%p data=%08" PRIx32 "\n", ptr, data); - gds_atomic_set(ptr, data); - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); - uint64_t data = op->wr.qword_va.data; - gds_dbg("STORE_QWORD ptr=%p data=%016" PRIx64 "\n", ptr, data); - gds_atomic_set(ptr, data); - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); - uint64_t *src = (uint64_t*)op->wr.copy_op.src; - size_t n_bytes = op->wr.copy_op.len; - gds_dbg("COPY_BLOCK ptr=%p src=%p len=%zu\n", ptr, src, n_bytes); - gds_bf_copy(ptr, src, n_bytes); - break; - } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - int poll_cond; - uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); - uint32_t value = op->wr.dword_va.data; - 
bool flush = true; - if (post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH) - flush = false; - gds_dbg("WAIT_32 dev_ptr=%p data=%" PRIx32 " type=%" PRIx32 "\n", ptr, value, (uint32_t)op->type); - bool done = false; - do { - uint32_t data = gds_atomic_get(ptr); - switch(op->type) { - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: - done = (0 != ~(data | value)); - break; - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - done = ((int32_t)data - (int32_t)value >= 0); - break; - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - done = (0 != (data & value)); - break; - default: - gds_err("invalid op type %02x\n", op->type); - retcode = EINVAL; - goto out; - } - if (done) - break; - // TODO: more aggressive CPU relaxing needed here to avoid starving I/O fabric - arch_cpu_relax(); - } while(true); - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - if (retcode) { - gds_err("error %d at entry n=%zu\n", retcode, n); - goto out; - } - } - -out: - return retcode; -} - -//----------------------------------------------------------------------------- - int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uint32_t val) { int retcode = 0; @@ -1151,7 +770,7 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin for (int j=0; jpost_send_ops_on_cpu(&info[j], 0); if (retcode) { goto out; } @@ -1168,72 +787,13 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin //----------------------------------------------------------------------------- -static void gds_dump_ops(struct peer_op_wr *op, size_t count) -{ - size_t n = 0; - for (; op; op = op->next, ++n) { - gds_dbg("op[%zu] type:%d\n", n, op->type); - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - gds_dbg("STORE_QWORD data:%x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - op->wr.dword_va.data, op->wr.dword_va.target_id, - op->wr.dword_va.offset, dev_ptr); - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - gds_dbg("STORE_QWORD data:%" PRIx64 " target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - op->wr.qword_va.data, op->wr.qword_va.target_id, - op->wr.qword_va.offset, dev_ptr); - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - gds_dbg("COPY_BLOCK src:%p len:%zu target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - op->wr.copy_op.src, op->wr.copy_op.len, - op->wr.copy_op.target_id, op->wr.copy_op.offset, - dev_ptr); - break; - } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - gds_dbg("%s data:%08x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - (op->type==IBV_EXP_PEER_OP_POLL_AND_DWORD) ? 
"POLL_AND_DW" : "POLL_NOR_SDW", - op->wr.dword_va.data, - op->wr.dword_va.target_id, - op->wr.dword_va.offset, - dev_ptr); - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - break; - } - } - - assert(count == n); -} - -//----------------------------------------------------------------------------- - void gds_dump_wait_request(gds_wait_request_t *request, size_t count) { - for (size_t j=0; jentries, peek->whence, peek->offset, - peek->peek_id, peek->comp_mask); - gds_dump_ops(peek->storage, peek->entries); + for (size_t j = 0; j < count; ++j) { + if (count == 0) + return; + + gds_main_transport->dump_wait_request(&request[j], j); } } @@ -1279,7 +839,7 @@ int gds_stream_post_wait_cq_multi(CUstream stream, int count, gds_wait_request_t // If NULL returned then buffer will be allocated in system memory // by ibverbs driver. -static struct ibv_exp_peer_buf *gds_buf_alloc(ibv_exp_peer_buf_alloc_attr *attr) +static gds_peer_buf_t *gds_buf_alloc(gds_peer_buf_alloc_attr_t *attr) { assert(attr); gds_peer *peer = peer_from_id(attr->peer_id); @@ -1291,7 +851,7 @@ static struct ibv_exp_peer_buf *gds_buf_alloc(ibv_exp_peer_buf_alloc_attr *attr) return peer->buf_alloc(peer->alloc_type, attr->length, attr->dir, attr->alignment, peer->alloc_flags); } -static int gds_buf_release(struct ibv_exp_peer_buf *pb) +static int gds_buf_release(gds_peer_buf_t *pb) { gds_dbg("freeing pb=%p\n", pb); gds_buf *buf = static_cast(pb); @@ -1300,14 +860,14 @@ static int gds_buf_release(struct ibv_exp_peer_buf *pb) return 0; } -static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, struct ibv_exp_peer_buf *pb) +static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, gds_peer_buf_t *pb) { gds_peer *peer = peer_from_id(peer_id); gds_range *range = NULL; gds_dbg("start=%p length=%zu peer_id=%" PRIx64 " peer_buf=%p\n", start, length, peer_id, pb); - if (IBV_EXP_PEER_IOMEMORY == pb) { + if (GDS_PEER_IOMEMORY == pb) { // register as IOMEM range = peer->register_range(start, length, GDS_MEMORY_IO); } @@ -1509,28 +1069,28 @@ static void gds_init_peer(gds_peer *peer, CUdevice dev, int gpu_id) peer->attr.register_va = gds_register_va; peer->attr.unregister_va = gds_unregister_va; - peer->attr.caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP | - IBV_EXP_PEER_OP_STORE_QWORD_CAP | - IBV_EXP_PEER_OP_FENCE_CAP | - IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP ); + peer->attr.caps = ( GDS_PEER_OP_STORE_DWORD_CAP | + GDS_PEER_OP_STORE_QWORD_CAP | + GDS_PEER_OP_FENCE_CAP | + GDS_PEER_OP_POLL_AND_DWORD_CAP ); if (peer->has_wait_nor) { gds_dbg("enabling NOR feature\n"); - peer->attr.caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP; + peer->attr.caps |= GDS_PEER_OP_POLL_NOR_DWORD_CAP; } else - peer->attr.caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP; + peer->attr.caps |= GDS_PEER_OP_POLL_GEQ_DWORD_CAP; if (peer->has_inlcpy) { gds_dbg("enabling COPY BLOCK feature\n"); - peer->attr.caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP; + peer->attr.caps |= GDS_PEER_OP_COPY_BLOCK_CAP; } else if (peer->has_write64 || gds_simulate_write64()) { gds_dbg("enabling STORE QWORD feature\n"); - peer->attr.caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP; + peer->attr.caps |= GDS_PEER_OP_STORE_QWORD_CAP; } gds_dbg("caps=%016lx\n", peer->attr.caps); peer->attr.peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE; - peer->attr.comp_mask = IBV_EXP_PEER_DIRECT_VERSION; + peer->attr.comp_mask = GDS_PEER_DIRECT_VERSION; peer->attr.version = 1; peer->tq = new task_queue; @@ -1678,136 +1238,11 @@ gds_peer *peer_from_stream(CUstream stream) 
//----------------------------------------------------------------------------- -static ibv_exp_res_domain *gds_create_res_domain(struct ibv_context *context) -{ - if (!context) { - gds_err("invalid context"); - return NULL; - } - - ibv_exp_res_domain_init_attr res_domain_attr; - memset(&res_domain_attr, 0, sizeof(res_domain_attr)); - - res_domain_attr.comp_mask |= IBV_EXP_RES_DOMAIN_THREAD_MODEL; - res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; - - ibv_exp_res_domain *res_domain = ibv_exp_create_res_domain(context, &res_domain_attr); - if (!res_domain) { - gds_warn("Can't create resource domain\n"); - } - - return res_domain; -} - -//----------------------------------------------------------------------------- - -static struct gds_cq * -gds_create_cq_internal(struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, - struct ibv_exp_res_domain * res_domain) -{ - struct gds_cq *gcq = NULL; - ibv_exp_cq_init_attr attr; - gds_peer *peer = NULL; - gds_peer_attr *peer_attr = NULL; - int ret=0; - - if(!context) - { - gds_dbg("Invalid input context\n"); - return NULL; - } - - gcq = (struct gds_cq*)calloc(1, sizeof(struct gds_cq)); - if (!gcq) { - gds_err("cannot allocate memory\n"); - return NULL; - } - - //Here we need to recover peer and peer_attr pointers to set alloc_type and alloc_flags - //before ibv_exp_create_cq - ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); - if (ret) { - gds_err("error %d while registering GPU peer\n", ret); - return NULL; - } - assert(peer); - assert(peer_attr); - - peer->alloc_type = gds_peer::CQ; - peer->alloc_flags = flags; - - attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT; - attr.flags = 0; // see ibv_exp_cq_create_flags - attr.peer_direct_attrs = peer_attr; - if (res_domain) { - gds_dbg("using peer->res_domain %p for CQ\n", res_domain); - attr.res_domain = res_domain; - attr.comp_mask |= IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN; - } - - int old_errno = errno; - gcq->cq = ibv_exp_create_cq(context, cqe, cq_context, channel, comp_vector, &attr); - if (!gcq->cq) { - gds_err("error %d in ibv_exp_create_cq, old errno %d\n", errno, old_errno); - return NULL; - } - - return gcq; -} - -//Note: general create cq function, not really used for now! 
-struct gds_cq * -gds_create_cq(struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags) -{ - int ret = 0; - struct gds_cq *gcq = NULL; - //TODO: leak of res_domain - struct ibv_exp_res_domain * res_domain; - gds_dbg("cqe=%d gpu_id=%d cq_flags=%08x\n", cqe, gpu_id, flags); - - gds_peer *peer = NULL; - gds_peer_attr *peer_attr = NULL; - ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); - if (ret) { - gds_err("error %d while registering GPU peer\n", ret); - return NULL; - } - assert(peer); - assert(peer_attr); - - peer->alloc_type = gds_peer::CQ; - peer->alloc_flags = flags; - - res_domain = gds_create_res_domain(context); - if (res_domain) - gds_dbg("using res_domain %p\n", res_domain); - else - gds_warn("NOT using res_domain\n"); - - - gcq = gds_create_cq_internal(context, cqe, cq_context, channel, comp_vector, gpu_id, flags, res_domain); - - if (!gcq) { - gds_err("error in gds_create_cq_internal\n"); - return NULL; - } - - return gcq; -} - -//----------------------------------------------------------------------------- - struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) { int ret = 0; - struct gds_qp *gqp = NULL; - struct ibv_qp *qp = NULL; - struct gds_cq *rx_gcq = NULL, *tx_gcq = NULL; + gds_qp_t *gqp = NULL; gds_peer *peer = NULL; gds_peer_attr *peer_attr = NULL; int old_errno = errno; @@ -1815,6 +1250,7 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_dbg("pd=%p context=%p gpu_id=%d flags=%08x current errno=%d\n", pd, context, gpu_id, flags, errno); assert(pd); assert(context); + assert(context->device); assert(qp_attr); if (flags & ~(GDS_CREATE_QP_WQ_ON_GPU|GDS_CREATE_QP_TX_CQ_ON_GPU|GDS_CREATE_QP_RX_CQ_ON_GPU|GDS_CREATE_QP_WQ_DBREC_ON_GPU)) { @@ -1822,138 +1258,47 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, return NULL; } - gqp = (struct gds_qp*)calloc(1, sizeof(struct gds_qp)); - if (!gqp) { - gds_err("cannot allocate memory\n"); - return NULL; + ret = gds_transport_init(); + if (ret) { + gds_err("error in gds_transport_init\n"); + goto err; } - gqp->dev_context=context; - // peer registration gds_dbg("before gds_register_peer_ex\n"); ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); if (ret) { - gds_err("error %d in gds_register_peer_ex\n", ret); - goto err; - } - - gqp->res_domain = gds_create_res_domain(context); - if (gqp->res_domain) - gds_dbg("using gqp->res_domain %p\n", gqp->res_domain); - else - gds_warn("NOT using gqp->res_domain\n"); - - tx_gcq = gds_create_cq_internal(context, qp_attr->cap.max_send_wr, NULL, NULL, 0, gpu_id, - (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, - gqp->res_domain); - if (!tx_gcq) { - ret = errno; - gds_err("error %d while creating TX CQ, old_errno=%d\n", ret, old_errno); - goto err; - } - - rx_gcq = gds_create_cq_internal(context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, gpu_id, - (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? 
GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, - gqp->res_domain); - if (!rx_gcq) { - ret = errno; - gds_err("error %d while creating RX CQ\n", ret); + gds_err("error %d in gds_register_peer_ex\n", ret); goto err; } - // peer registration - qp_attr->send_cq = tx_gcq->cq; - qp_attr->recv_cq = rx_gcq->cq; - qp_attr->pd = pd; - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_PD; - - peer->alloc_type = gds_peer::WQ; - peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_DBREC_DEFAULT; - if (flags & GDS_CREATE_QP_WQ_ON_GPU) { - gds_err("error, QP WQ on GPU is not supported yet\n"); - goto err; - } - if (flags & GDS_CREATE_QP_WQ_DBREC_ON_GPU) { - gds_warn("QP WQ DBREC on GPU\n"); - peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; - } - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_PEER_DIRECT; - qp_attr->peer_direct_attrs = peer_attr; - - qp = ibv_exp_create_qp(context, qp_attr); - if (!qp) { - ret = EINVAL; - gds_err("error in ibv_exp_create_qp\n"); + ret = gds_main_transport->create_qp(pd, context, qp_attr, peer, peer_attr, flags, &gqp); + if (ret) { + gds_err("Error in create_qp.\n"); goto err; } - gqp->qp = qp; - gqp->send_cq.cq = qp->send_cq; - gqp->send_cq.curr_offset = 0; - gqp->recv_cq.cq = qp->recv_cq; - gqp->recv_cq.curr_offset = 0; - gds_dbg("created gds_qp=%p\n", gqp); return gqp; err: - gds_dbg("destroying QP\n"); - gds_destroy_qp(gqp); - return NULL; } + //----------------------------------------------------------------------------- int gds_destroy_qp(struct gds_qp *gqp) { - int retcode = 0; - int ret; + int ret = 0; - if(!gqp) return retcode; + if (!gqp) + return ret; - if(gqp->qp) - { - ret = ibv_destroy_qp(gqp->qp); - if (ret) { - gds_err("error %d in destroy_qp\n", ret); - retcode = ret; - } - } + ret = gds_main_transport->destroy_qp(gqp); - if(gqp->send_cq.cq) - { - ret = ibv_destroy_cq(gqp->send_cq.cq); - if (ret) { - gds_err("error %d in destroy_cq send_cq\n", ret); - retcode = ret; - } - } - - if(gqp->recv_cq.cq) - { - ret = ibv_destroy_cq(gqp->recv_cq.cq); - if (ret) { - gds_err("error %d in destroy_cq recv_cq\n", ret); - retcode = ret; - } - } - - if(gqp->res_domain) { - struct ibv_exp_destroy_res_domain_attr attr; //IBV_EXP_DESTROY_RES_DOMAIN_RESERVED - attr.comp_mask=0; - ret = ibv_exp_destroy_res_domain(gqp->dev_context, gqp->res_domain, &attr); - if (ret) { - gds_err("ibv_exp_destroy_res_domain error %d: %s\n", ret, strerror(ret)); - retcode = ret; - } - } - - free(gqp); - - return retcode; + return ret; } //----------------------------------------------------------------------------- diff --git a/src/gdsync_debug_hostregister_bug.cpp b/src/gdsync_debug_hostregister_bug.cpp index 1e36d08..2537a74 100644 --- a/src/gdsync_debug_hostregister_bug.cpp +++ b/src/gdsync_debug_hostregister_bug.cpp @@ -41,6 +41,7 @@ #include "objs.hpp" #include "archutils.h" #include "mlnxutils.h" +#include "mlx5-exp.hpp" //----------------------------------------------------------------------------- @@ -704,11 +705,11 @@ static int gds_post_ops(size_t n_ops, struct peer_op_wr *op, CUstreamBatchMemOpP switch(op->type) { case IBV_PEER_OP_FENCE: { gds_dbg("OP_FENCE: fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + uint32_t fence_op = (op->wr.fence.fence_flags & 
(GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + if (fence_op == GDS_PEER_FENCE_OP_READ) { gds_dbg("nothing to do for read fences\n"); //retcode = EINVAL; break; @@ -727,17 +728,17 @@ static int gds_post_ops(size_t n_ops, struct peer_op_wr *op, CUstreamBatchMemOpP //retcode = 0; } else { - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { gds_err("unexpected from fence\n"); retcode = EINVAL; break; } int flags = 0; - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); flags = GDS_MEMBAR_DEFAULT; } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); flags = GDS_MEMBAR_SYS; } @@ -973,26 +974,26 @@ static int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op) switch(op->type) { case IBV_PEER_OP_FENCE: { gds_dbg("fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + if (fence_op == GDS_PEER_FENCE_OP_READ) { gds_warnc(1, "nothing to do for read fences\n"); //retcode = EINVAL; break; } else { - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); retcode = EINVAL; break; } - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); wmb(); } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); wmb(); } @@ -1233,12 +1234,13 @@ static void gds_dump_ops(struct peer_op_wr *op, size_t count) void gds_dump_wait_request(gds_wait_request_t *request, size_t count) { - for (size_t j=0; jentries, peek->whence, peek->offset, - peek->peek_id, peek->comp_mask); - gds_dump_ops(peek->storage, peek->entries); + for (size_t j = 0; j < count; ++j) { + gds_mlx5_exp_wait_request_t *gmexp_request; + if (count == 0) + return; + + gmexp_request = to_gds_mexp_wait_request(&request[j]); + gds_mlx5_exp_dump_wait_request(gmexp_request, j); } } @@ -1315,14 +1317,14 @@ static int gds_buf_release(struct ibv_peer_buf *pb) return 0; } -static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, struct ibv_exp_peer_buf *pb) +static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, gds_peer_buf_t *pb) { gds_peer *peer = peer_from_id(peer_id); gds_range *range = NULL; gds_dbg("start=%p length=%zu peer_id=%"PRIx64" peer_buf=%p\n", start, length, peer_id, pb); - if (IBV_EXP_PEER_IOMEMORY == pb) { 
+ if (GDS_PEER_IOMEMORY == pb) { // register as IOMEM range = peer->register_range(start, length, GDS_MEMORY_IO); } @@ -1375,25 +1377,25 @@ static void gds_init_peer_attr(gds_peer_attr *attr, gds_peer *peer) attr->register_va = gds_register_va; attr->unregister_va = gds_unregister_va; - attr->caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP | - IBV_EXP_PEER_OP_STORE_QWORD_CAP | - IBV_EXP_PEER_OP_FENCE_CAP | - IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP ); + attr->caps = ( GDS_PEER_OP_STORE_DWORD_CAP | + GDS_PEER_OP_STORE_QWORD_CAP | + GDS_PEER_OP_FENCE_CAP | + GDS_PEER_OP_POLL_AND_DWORD_CAP ); if (gpu_does_support_nor(peer)) - attr->caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP; + attr->caps |= GDS_PEER_OP_POLL_NOR_DWORD_CAP; else - attr->caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP; + attr->caps |= GDS_PEER_OP_POLL_GEQ_DWORD_CAP; if (gds_enable_inlcpy()) { - attr->caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP; + attr->caps |= GDS_PEER_OP_COPY_BLOCK_CAP; } else if (gds_enable_write64() || gds_simulate_write64()) { - attr->caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP; + attr->caps |= GDS_PEER_OP_STORE_QWORD_CAP; } gds_dbg("caps=%016lx\n", attr->caps); attr->peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE; - attr->comp_mask = IBV_EXP_PEER_DIRECT_VERSION; + attr->comp_mask = GDS_PEER_DIRECT_VERSION; attr->version = 1; gds_dbg("peer_attr: peer_id=%"PRIx64"\n", attr->peer_id); @@ -1536,13 +1538,13 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds // the CQE without updating the tracking variables if (flags & GDS_CREATE_QP_GPU_INVALIDATE_RX_CQ) { gds_warn("IGNORE_RQ_OVERFLOW\n"); - qp_attr->exp_create_flags |= IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW; - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + qp_attr->exp_create_flags |= GDS_QP_CREATE_IGNORE_RQ_OVERFLOW; + qp_attr->comp_mask |= GDS_QP_INIT_ATTR_CREATE_FLAGS; } if (flags & GDS_CREATE_QP_GPU_INVALIDATE_TX_CQ) { gds_warn("IGNORE_SQ_OVERFLOW\n"); - qp_attr->exp_create_flags |= IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW; - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + qp_attr->exp_create_flags |= GDS_QP_CREATE_IGNORE_SQ_OVERFLOW; + qp_attr->comp_mask |= GDS_QP_INIT_ATTR_CREATE_FLAGS; } gds_dbg("before gds_register_peer_ex\n"); diff --git a/src/mem.cpp b/src/mem.cpp index 59a6af6..7cf3602 100644 --- a/src/mem.cpp +++ b/src/mem.cpp @@ -40,7 +40,6 @@ using namespace std; #include -#include #include #include "gdsync.h" diff --git a/src/memmgr.cpp b/src/memmgr.cpp index ab3e490..6d6a21c 100644 --- a/src/memmgr.cpp +++ b/src/memmgr.cpp @@ -40,7 +40,6 @@ using namespace std; #include -#include #include #include "gdsync.h" diff --git a/src/mlx5.cpp b/src/mlx5.cpp index a2c7b39..94872cc 100644 --- a/src/mlx5.cpp +++ b/src/mlx5.cpp @@ -40,6 +40,7 @@ //#include "mem.hpp" #include "objs.hpp" #include "utils.hpp" +#include "transport.hpp" #if 0 union { uint64_t qw; uint32_t dw[2]; } db_val; @@ -49,122 +50,21 @@ mlx5_i->db_value = db_val.qw; #endif -//----------------------------------------------------------------------------- - -int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *request) -{ - int retcode = 0; - size_t n_ops = request->commit.entries; - peer_op_wr *op = request->commit.storage; - size_t n = 0; - - memset(mlx5_i, 0, sizeof(*mlx5_i)); - - for (; op && n < n_ops; op = op->next, ++n) { - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & 
(IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { - gds_dbg("nothing to do for read fences\n"); - break; - } - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from fence\n"); - retcode = EINVAL; - break; - } - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { - gds_dbg("using light membar\n"); - mlx5_i->membar = 1; - } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { - gds_dbg("using heavy membar\n"); - mlx5_i->membar_full = 1; - } - else { - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); - if (n != 0) { - gds_err("store DWORD is not 1st op\n"); - retcode = EINVAL; - break; - } - mlx5_i->dbrec_ptr = (uint32_t*)dev_ptr; - mlx5_i->dbrec_value = data; - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - uint64_t data = op->wr.qword_va.data; - gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); - if (n != 2) { - gds_err("store QWORD is not 3rd op\n"); - retcode = EINVAL; - break; - } - mlx5_i->db_ptr = (uint64_t*)dev_ptr; - mlx5_i->db_value = data; - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - size_t len = op->wr.copy_op.len; - void *src = op->wr.copy_op.src; - gds_dbg("send inline detected\n"); - if (len < 8 || len > 64) { - gds_err("unexpected len %zu\n", len); - retcode = EINVAL; - break; - } - mlx5_i->db_ptr = (uint64_t*)dev_ptr; - mlx5_i->db_value = *(uint64_t*)src; - break; - } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - gds_err("unexpected polling op in send request\n"); - retcode = EINVAL; - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - - if (retcode) { - gds_err("error in fill func at entry n=%zu\n", n); - break; - } - } - return retcode; -} //----------------------------------------------------------------------------- int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_mlx5_send_info_t *mlx5_infos) { - int retcode = 0; + int retcode = gds_transport_init(); + if (retcode) { + gds_err("error in gds_transport_init\n"); + goto out; + } for (int j=0; jget_send_descs(mlx5_i, request); if (retcode) { gds_err("error %d while retrieving descriptors for %dth request\n", retcode, j); break; @@ -173,6 +73,7 @@ int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_ml mlx5_i->dbrec_ptr, mlx5_i->dbrec_value, mlx5_i->db_ptr, mlx5_i->db_value); } +out: return retcode; } @@ -180,107 +81,7 @@ int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_ml int gds_mlx5_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *request) { - int retcode = 0; - size_t n_ops = request->peek.entries; - peer_op_wr *op = request->peek.storage; - 
size_t n = 0; - - memset(mlx5_i, 0, sizeof(*mlx5_i)); - - for (; op && n < n_ops; op = op->next, ++n) { - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { - gds_dbg("nothing to do for read fences\n"); - break; - } - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from fence\n"); - retcode = EINVAL; - break; - } - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); - if (n != 1) { - gds_err("store DWORD is not 2nd op\n"); - retcode = EINVAL; - break; - } - mlx5_i->flag_ptr = (uint32_t*)dev_ptr; - mlx5_i->flag_value = data; - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - uint64_t data = op->wr.qword_va.data; - gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); - gds_err("unsupported QWORD op\n"); - retcode = EINVAL; - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - size_t len = op->wr.copy_op.len; - void *src = op->wr.copy_op.src; - gds_err("unsupported COPY_BLOCK\n"); - retcode = EINVAL; - break; - } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - - gds_dbg("OP_POLL_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); - - mlx5_i->cqe_ptr = (uint32_t *)dev_ptr; - mlx5_i->cqe_value = data; - - switch(op->type) { - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: - // GPU SMs can always do NOR - mlx5_i->cond = GDS_WAIT_COND_NOR; - break; - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - mlx5_i->cond = GDS_WAIT_COND_GEQ; - break; - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - mlx5_i->cond = GDS_WAIT_COND_AND; - break; - default: - gds_err("unexpected op type\n"); - retcode = EINVAL; - goto err; - } - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - err: - if (retcode) { - gds_err("error in fill func at entry n=%zu\n", n); - break; - } - } - return retcode; + return gds_main_transport->get_wait_descs(mlx5_i, request); } //----------------------------------------------------------------------------- diff --git a/src/objs.cpp b/src/objs.cpp index 1dac250..475c9d3 100644 --- a/src/objs.cpp +++ b/src/objs.cpp @@ -39,7 +39,6 @@ using namespace std; #include -#include #include #include "gdsync.h" @@ -71,7 +70,7 @@ gds_buf *gds_peer::buf_alloc_cq(size_t length, uint32_t dir, uint32_t alignment, { gds_buf *buf = NULL; switch (dir) { - case (IBV_EXP_PEER_DIRECTION_FROM_HCA|IBV_EXP_PEER_DIRECTION_TO_PEER|IBV_EXP_PEER_DIRECTION_TO_CPU): + case 
(GDS_PEER_DIRECTION_FROM_HCA|GDS_PEER_DIRECTION_TO_PEER|GDS_PEER_DIRECTION_TO_CPU): // CQ buf if (GDS_ALLOC_CQ_ON_GPU == (flags & GDS_ALLOC_CQ_MASK)) { gds_dbg("allocating CQ on GPU mem\n"); @@ -80,14 +79,14 @@ gds_buf *gds_peer::buf_alloc_cq(size_t length, uint32_t dir, uint32_t alignment, gds_dbg("allocating CQ on Host mem\n"); } break; - case (IBV_EXP_PEER_DIRECTION_FROM_PEER|IBV_EXP_PEER_DIRECTION_TO_CPU): + case (GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_CPU): // CQ peer buf, helper buffer // on SYSMEM for the near future // GPU does a store to the 'busy' field as part of the peek_cq task // CPU polls on that field gds_dbg("allocating CQ peer buf on Host mem\n"); break; - case (IBV_EXP_PEER_DIRECTION_FROM_PEER|IBV_EXP_PEER_DIRECTION_TO_HCA): + case (GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_HCA): gds_dbg("allocating CQ dbrec on Host mem\n"); break; default: @@ -101,7 +100,7 @@ gds_buf *gds_peer::buf_alloc_wq(size_t length, uint32_t dir, uint32_t alignment, { gds_buf *buf = NULL; switch (dir) { - case IBV_EXP_PEER_DIRECTION_FROM_PEER|IBV_EXP_PEER_DIRECTION_TO_HCA: + case GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_HCA: // dbrec if (GDS_ALLOC_DBREC_ON_GPU == (flags & GDS_ALLOC_DBREC_MASK)) { gds_dbg("allocating DBREC on GPU mem\n"); diff --git a/src/objs.hpp b/src/objs.hpp index 796b6bd..71f4ea4 100644 --- a/src/objs.hpp +++ b/src/objs.hpp @@ -29,11 +29,172 @@ static const size_t max_gpus = 16; -typedef struct ibv_exp_peer_direct_attr gds_peer_attr; +/** + * Compatible with enum ibv_exp_peer_op + */ +enum gds_peer_op { + GDS_PEER_OP_RESERVED1 = 1, + + GDS_PEER_OP_FENCE = 0, + + GDS_PEER_OP_STORE_DWORD = 4, + GDS_PEER_OP_STORE_QWORD = 2, + GDS_PEER_OP_COPY_BLOCK = 3, + + GDS_PEER_OP_POLL_AND_DWORD = 12, + GDS_PEER_OP_POLL_NOR_DWORD = 13, + GDS_PEER_OP_POLL_GEQ_DWORD = 14, +}; + +/** + * Compatible with enum ibv_exp_peer_op_caps + */ +enum gds_peer_op_caps { + GDS_PEER_OP_FENCE_CAP = (1 << GDS_PEER_OP_FENCE), + GDS_PEER_OP_STORE_DWORD_CAP = (1 << GDS_PEER_OP_STORE_DWORD), + GDS_PEER_OP_STORE_QWORD_CAP = (1 << GDS_PEER_OP_STORE_QWORD), + GDS_PEER_OP_COPY_BLOCK_CAP = (1 << GDS_PEER_OP_COPY_BLOCK), + GDS_PEER_OP_POLL_AND_DWORD_CAP + = (1 << GDS_PEER_OP_POLL_AND_DWORD), + GDS_PEER_OP_POLL_NOR_DWORD_CAP + = (1 << GDS_PEER_OP_POLL_NOR_DWORD), + GDS_PEER_OP_POLL_GEQ_DWORD_CAP + = (1 << GDS_PEER_OP_POLL_GEQ_DWORD), +}; + + +/** + * Compatible with enum ibv_exp_peer_fence + */ +enum gds_peer_fence { + GDS_PEER_FENCE_OP_READ = (1 << 0), + GDS_PEER_FENCE_OP_WRITE = (1 << 1), + GDS_PEER_FENCE_FROM_CPU = (1 << 2), + GDS_PEER_FENCE_FROM_HCA = (1 << 3), + GDS_PEER_FENCE_MEM_SYS = (1 << 4), + GDS_PEER_FENCE_MEM_PEER = (1 << 5), +}; + +/** + * Indicate HW entities supposed to access memory buffer: + * GDS_PEER_DIRECTION_FROM_X means X writes to the buffer + * GDS_PEER_DIRECTION_TO_Y means Y read from the buffer + * + * Compatible with enum ibv_exp_peer_direction + */ +enum gds_peer_direction { + GDS_PEER_DIRECTION_FROM_CPU = (1 << 0), + GDS_PEER_DIRECTION_FROM_HCA = (1 << 1), + GDS_PEER_DIRECTION_FROM_PEER = (1 << 2), + GDS_PEER_DIRECTION_TO_CPU = (1 << 3), + GDS_PEER_DIRECTION_TO_HCA = (1 << 4), + GDS_PEER_DIRECTION_TO_PEER = (1 << 5), +}; + +/** + * Compatible with enum ibv_exp_peer_direct_attr_mask + */ +enum gds_peer_direct_attr_mask { + GDS_PEER_DIRECT_VERSION = (1 << 0) /* Must be set */ +}; + +/** + * Compatible with IBV_EXP_PEER_IOMEMORY + */ +#define GDS_PEER_IOMEMORY ((struct gds_buf *)-1UL) + +/** + * Compatible with struct ibv_exp_peer_buf_alloc_attr + */ +typedef 
struct gds_peer_buf_alloc_attr { + size_t length; + /* Bitmask from enum gds_peer_direction */ + uint32_t dir; + /* The ID of the peer device which will be + * * accessing the allocated buffer + * */ + uint64_t peer_id; + /* Data alignment */ + uint32_t alignment; + /* Reserved for future extensions, must be 0 */ + uint32_t comp_mask; +} gds_peer_buf_alloc_attr_t; + + +/** + * Compatible with struct ibv_exp_peer_buf + */ +typedef struct gds_peer_buf { + void *addr; + size_t length; + /* Reserved for future extensions, must be 0 */ + uint32_t comp_mask; +} gds_peer_buf_t; + +/** + * Compatible with struct ibv_exp_peer_direct_attr + */ +typedef struct { + /* Unique ID per peer device. + * Used to identify specific HW devices where relevant. + */ + uint64_t peer_id; + /* buf_alloc callback should return gds_peer_buf_t with buffer + * of at least attr->length. + * @attr: description of desired buffer + * + * Buffer should be mapped in the application address space + * for read/write (depends on attr->dir value). + * attr->dir value is supposed to indicate the expected directions + * of access to the buffer, to allow optimization by the peer driver. + * If NULL returned then buffer will be allocated in system memory + * by ibverbs driver. + */ + gds_peer_buf_t *(*buf_alloc)(gds_peer_buf_alloc_attr_t *attr); + /* If buffer was allocated by buf_alloc then buf_release will be + * called to release it. + * @pb: struct returned by buf_alloc + * + * buf_release is responsible to release everything allocated by + * buf_alloc. + * Return 0 on succes. + */ + int (*buf_release)(gds_peer_buf_t *pb); + /* register_va callback should register virtual address from the + * application as an area the peer is allowed to access. + * @start: pointer to beginning of region in virtual space + * @length: length of region + * @peer_id: the ID of the peer device which will be accessing + * the region. + * @pb: if registering a buffer that was returned from buf_alloc(), + * pb is the struct that was returned. If registering io memory area, + * pb is GDS_PEER_IOMEMORY. Otherwise - NULL + * + * Return id of registered address on success, 0 on failure. + */ + uint64_t (*register_va)(void *start, size_t length, uint64_t peer_id, + gds_peer_buf_t *pb); + /* If virtual address was registered with register_va then + * unregister_va will be called to unregister it. + * @target_id: id returned by register_va + * @peer_id: the ID of the peer device passed to register_va + * + * Return 0 on success. + */ + int (*unregister_va)(uint64_t target_id, uint64_t peer_id); + /* Bitmask from gds_peer_op_caps */ + uint64_t caps; + /* Maximal length of DMA operation the peer can do in copy-block */ + size_t peer_dma_op_map_len; + /* From gds_peer_direct_attr_mask */ + uint32_t comp_mask; + /* Feature version, must be 1 */ + uint32_t version; +} gds_peer_attr; struct gds_peer; -struct gds_buf: ibv_exp_peer_buf { +struct gds_buf: gds_peer_buf_t { gds_peer *peer; CUdeviceptr peer_addr; void *handle; diff --git a/src/transport.hpp b/src/transport.hpp new file mode 100644 index 0000000..079f9ce --- /dev/null +++ b/src/transport.hpp @@ -0,0 +1,96 @@ +/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include +#include +#include +#include +#include + +typedef struct gds_transport { + int (*create_qp)(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp); + int (*destroy_qp)(gds_qp_t *gqp); + int (*rollback_qp)(gds_qp_t *gqp, gds_send_request_t *request); + + void (*init_send_info)(gds_send_request_t *request); + int (*post_send_ops)(gds_peer *peer, gds_send_request_t *request, gds_op_list_t &ops); + int (*post_send_ops_on_cpu)(gds_send_request_t *request, int flags); + int (*prepare_send)(gds_qp_t *gqp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request); + int (*get_send_descs)(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *_request); + uint32_t (*get_num_send_request_entries)(gds_send_request_t *request); + + void (*init_wait_request)(gds_wait_request_t *request, uint32_t offset); + void (*dump_wait_request)(gds_wait_request_t *request, size_t idx); + int (*stream_post_wait_descriptor)(gds_peer *peer, gds_wait_request_t *request, gds_op_list_t ¶ms, int flags); + int (*post_wait_descriptor)(gds_wait_request_t *request, int flags); + int (*get_wait_descs)(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *request); + uint32_t (*get_num_wait_request_entries)(gds_wait_request_t *request); + + int (*prepare_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, int flags); + int (*append_wait_cq)(gds_wait_request_t *request, uint32_t *dw, uint32_t val); + int (*abort_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request); +} gds_transport_t; + +extern gds_transport_t *gds_main_transport; + +#if HAVE_EXP_VERBS +int gds_transport_mlx5_exp_init(gds_transport_t **transport); +#else +#warning "This library requires exp-verbs." 
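+// Usage sketch (hypothetical caller, not part of this patch): once
+// gds_transport_init() below succeeds, backend-specific work is dispatched
+// through the function-pointer table, e.g.
+//
+//     int rc = gds_transport_init();
+//     if (!rc)
+//             rc = gds_main_transport->create_qp(pd, context, qp_attr,
+//                                                peer, peer_attr, flags, &gqp);
+//
+// A future non-exp backend would only need its own *_init() that fills the
+// same table.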
+#endif + +static inline int gds_transport_init() +{ + int status = 0; + if (!gds_main_transport) { + gds_transport_t *t = NULL; + #if HAVE_EXP_VERBS + status = gds_transport_mlx5_exp_init(&t); + if (status) { + gds_err("error in gds_transport_mlx5_exp_init\n"); + goto out; + } + assert(t); + #else + status = ENOTSUP; + goto out; + #endif + gds_main_transport = t; + } +out: + return status; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/src/transports/mlx5-exp/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp new file mode 100644 index 0000000..85f97e2 --- /dev/null +++ b/src/transports/mlx5-exp/mlx5-exp.cpp @@ -0,0 +1,1259 @@ +#include +#include +#include + +#include "mlx5-exp.hpp" +#include "utils.hpp" +#include "archutils.h" +#include "mlnxutils.h" +#include "transport.hpp" + +//----------------------------------------------------------------------------- + +static void gds_mlx5_exp_init_ops(struct peer_op_wr *op, int count) +{ + int i = count; + while (--i) + op[i-1].next = &op[i]; + op[count-1].next = NULL; +} + +//----------------------------------------------------------------------------- + +/* + A) plain+membar: + WR32 + MEMBAR + WR32 + WR32 + + B) plain: + WR32 + WR32+PREBARRIER + WR32 + + C) sim64+membar: + WR32 + MEMBAR + INLCPY 8B + + D) sim64: + INLCPY 4B + POSTBARRIER + INLCPY 8B + + E) inlcpy+membar: + WR32 + MEMBAR + INLCPY XB + + F) inlcpy: + INLCPY 4B + POSTBARRIER + INLCPY 128B +*/ + +static int gds_mlx5_exp_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t &ops, int post_flags) +{ + int retcode = 0; + size_t n = 0; + bool prev_was_fence = false; + bool use_inlcpy_for_dword = false; + //size_t n_ops = ops.size(); + CUstreamBatchMemOpParams param; + + gds_dbg("n_ops=%zu\n", n_ops); + + if (!peer->has_memops) { + gds_err("CUDA MemOps are required\n"); + return EINVAL; + } + + // divert the request to the same engine handling 64bits + // to avoid out-of-order execution + // caveat: can't use membar if inlcpy is used for 4B writes (to simulate 8B writes) + if (peer->has_inlcpy) { + if (!peer->has_membar) + use_inlcpy_for_dword = true; // F + } + if (gds_simulate_write64()) { + if (!peer->has_membar) { + gds_warn_once("enabling use_inlcpy_for_dword\n"); + use_inlcpy_for_dword = true; // D + } + } + + for (; op && n < n_ops; op = op->next, ++n) { + //int flags = 0; + gds_dbg("op[%zu] type:%08x\n", n, op->type); + switch(op->type) { + case GDS_PEER_OP_FENCE: { + gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + + if (fence_op == GDS_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + //retcode = EINVAL; + break; + } + else { + if (!peer->has_membar) { + if (use_inlcpy_for_dword) { + assert(ops.size() > 0); + gds_dbg("patching previous param\n"); + gds_enable_barrier_for_inlcpy(&ops.back()); + } + else { + gds_dbg("recording fence event\n"); + prev_was_fence = true; + } + //retcode = 0; + } + else { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + int flags = 0; + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + flags = 
GDS_MEMBAR_DEFAULT | GDS_MEMBAR_MLX5; + } + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + flags = GDS_MEMBAR_SYS | GDS_MEMBAR_MLX5; + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + retcode = gds_fill_membar(peer, ops, flags); + } + } + break; + } + case GDS_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + int flags = 0; + gds_dbg("OP_STORE_DWORD dev_ptr=%llx data=%" PRIx32 "\n", dev_ptr, data); + if (use_inlcpy_for_dword) { // F || D + // membar may be out of order WRT inlcpy + if (peer->has_membar) { + gds_err("invalid feature combination, inlcpy + membar\n"); + retcode = EINVAL; + break; + } + // tail flush is set when following fence is met + // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); + } + else { // A || B || C || E + // can't guarantee ordering of write32+inlcpy unless + // a membar is there + // TODO: fix driver when !weak + if (peer->has_inlcpy && !peer->has_membar) { + gds_err("invalid feature combination, inlcpy needs membar\n"); + retcode = EINVAL; + break; + } + if (prev_was_fence) { + gds_dbg("using PRE_BARRIER as fence\n"); + flags |= GDS_WRITE_PRE_BARRIER; + prev_was_fence = false; + } + retcode = gds_fill_poke(peer, ops, dev_ptr, data, flags); + } + break; + } + case GDS_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + int flags = 0; + gds_dbg("OP_STORE_QWORD dev_ptr=%llx data=%" PRIx64 "\n", dev_ptr, data); + // C || D + if (gds_simulate_write64()) { + // simulate 64-bit poke by inline copy + if (!peer->has_membar) { + gds_err("invalid feature combination, inlcpy needs membar\n"); + retcode = EINVAL; + break; + } + + // tail flush is never useful here + //flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); + } + else if (peer->has_write64) { + retcode = gds_fill_poke64(peer, ops, dev_ptr, data, flags); + } + else { + uint32_t datalo = gds_qword_lo(op->wr.qword_va.data); + uint32_t datahi = gds_qword_hi(op->wr.qword_va.data); + + if (prev_was_fence) { + gds_dbg("enabling PRE_BARRIER\n"); + flags |= GDS_WRITE_PRE_BARRIER; + prev_was_fence = false; + } + retcode = gds_fill_poke(peer, ops, dev_ptr, datalo, flags); + + // get rid of the barrier, if there + flags &= ~GDS_WRITE_PRE_BARRIER; + + // advance to next DWORD + dev_ptr += sizeof(uint32_t); + retcode = gds_fill_poke(peer, ops, dev_ptr, datahi, flags); + } + + break; + } + case GDS_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + int flags = 0; + gds_dbg("OP_COPY_BLOCK dev_ptr=%llx src=%p len=%zu\n", dev_ptr, src, len); + // catching any other size here + if (!peer->has_inlcpy) { + gds_err("inline copy is not supported\n"); + retcode = EINVAL; + break; + } + // IB Verbs bug + assert(len <= GDS_GPU_MAX_INLINE_SIZE); + //if (desc->need_flush) { + // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + //} + retcode = gds_fill_inlcpy(peer, ops, dev_ptr, src, len, flags); + break; + } + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { + int poll_cond; + CUdeviceptr dev_ptr = 
range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + // TODO: properly handle a following fence instead of blidly flushing + int flags = 0; + if (!(post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH)) + flags |= GDS_WAIT_POST_FLUSH_REMOTE; + + gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%" PRIx32 " type=%" PRIx32 "\n", dev_ptr, data, (uint32_t)op->type); + + switch(op->type) { + case GDS_PEER_OP_POLL_NOR_DWORD: + poll_cond = GDS_WAIT_COND_NOR; + break; + case GDS_PEER_OP_POLL_GEQ_DWORD: + poll_cond = GDS_WAIT_COND_GEQ; + break; + case GDS_PEER_OP_POLL_AND_DWORD: + poll_cond = GDS_WAIT_COND_AND; + break; + default: + assert(!"cannot happen"); + retcode = EINVAL; + goto out; + } + retcode = gds_fill_poll(peer, ops, dev_ptr, data, poll_cond, flags); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + goto out; + } + } + + assert(n_ops == n); + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +static int gds_mlx5_exp_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) +{ + int retcode = 0; + size_t n = 0; + gds_dbg("n_ops=%zu op=%p post_flags=0x%x\n", n_ops, op, post_flags); + for (; op && n < n_ops; op = op->next, ++n) { + //int flags = 0; + gds_dbg("op[%zu]=%p\n", n, op); + //gds_dbg("op[%zu]=%p type:%08x\n", n, op, op->type); + switch(op->type) { + case GDS_PEER_OP_FENCE: { + gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + + if (fence_op == GDS_PEER_FENCE_OP_READ) { + gds_warnc(1, "nothing to do for read fences\n"); + //retcode = EINVAL; + break; + } + else { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); + retcode = EINVAL; + break; + } + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + wmb(); + } + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + wmb(); + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + } + break; + } + case GDS_PEER_OP_STORE_DWORD: { + uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); + uint32_t data = op->wr.dword_va.data; + // A || B || C || E + gds_dbg("STORE_DWORD ptr=%p data=%08" PRIx32 "\n", ptr, data); + gds_atomic_set(ptr, data); + break; + } + case GDS_PEER_OP_STORE_QWORD: { + uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); + uint64_t data = op->wr.qword_va.data; + gds_dbg("STORE_QWORD ptr=%p data=%016" PRIx64 "\n", ptr, data); + gds_atomic_set(ptr, data); + break; + } + case GDS_PEER_OP_COPY_BLOCK: { + uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); + uint64_t *src = (uint64_t*)op->wr.copy_op.src; + size_t n_bytes = op->wr.copy_op.len; + gds_dbg("COPY_BLOCK ptr=%p src=%p len=%zu\n", ptr, src, n_bytes); + gds_bf_copy(ptr, src, n_bytes); + break; + } + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + 
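+                // The poll conditions below spin on gds_atomic_get() until:
+                //   NOR: ~(data | value) != 0
+                //   AND:  (data & value) != 0
+                //   GEQ:  (int32_t)data - (int32_t)value >= 0, a signed
+                //         difference that stays correct across 32-bit
+                //         wrap-around (for distances below 2^31).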
case GDS_PEER_OP_POLL_NOR_DWORD: { + int poll_cond; + uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); + uint32_t value = op->wr.dword_va.data; + bool flush = true; + if (post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH) + flush = false; + gds_dbg("WAIT_32 dev_ptr=%p data=%" PRIx32 " type=%" PRIx32 "\n", ptr, value, (uint32_t)op->type); + bool done = false; + do { + uint32_t data = gds_atomic_get(ptr); + switch(op->type) { + case GDS_PEER_OP_POLL_NOR_DWORD: + done = (0 != ~(data | value)); + break; + case GDS_PEER_OP_POLL_GEQ_DWORD: + done = ((int32_t)data - (int32_t)value >= 0); + break; + case GDS_PEER_OP_POLL_AND_DWORD: + done = (0 != (data & value)); + break; + default: + gds_err("invalid op type %02x\n", op->type); + retcode = EINVAL; + goto out; + } + if (done) + break; + // TODO: more aggressive CPU relaxing needed here to avoid starving I/O fabric + arch_cpu_relax(); + } while(true); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + if (retcode) { + gds_err("error %d at entry n=%zu\n", retcode, n); + goto out; + } + } + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *_request) +{ + int retcode = 0; + const gds_mlx5_exp_send_request_t *request = to_gds_mexp_send_request(_request); + size_t n_ops = request->commit.entries; + peer_op_wr *op = request->commit.storage; + size_t n = 0; + + memset(mlx5_i, 0, sizeof(*mlx5_i)); + + for (; op && n < n_ops; op = op->next, ++n) { + switch(op->type) { + case GDS_PEER_OP_FENCE: { + gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + if (fence_op == GDS_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + break; + } + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + mlx5_i->membar = 1; + } + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + mlx5_i->membar_full = 1; + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + break; + } + case GDS_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); + if (n != 0) { + gds_err("store DWORD is not 1st op\n"); + retcode = EINVAL; + break; + } + mlx5_i->dbrec_ptr = (uint32_t*)dev_ptr; + mlx5_i->dbrec_value = data; + break; + } + case GDS_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); + if (n != 2) { + gds_err("store QWORD is not 3rd op\n"); + retcode = EINVAL; + break; + } + mlx5_i->db_ptr = (uint64_t*)dev_ptr; + mlx5_i->db_value = data; + break; + } + case 
GDS_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + gds_dbg("send inline detected\n"); + if (len < 8 || len > 64) { + gds_err("unexpected len %zu\n", len); + retcode = EINVAL; + break; + } + mlx5_i->db_ptr = (uint64_t*)dev_ptr; + mlx5_i->db_value = *(uint64_t*)src; + break; + } + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { + gds_err("unexpected polling op in send request\n"); + retcode = EINVAL; + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + break; + } + } + return retcode; +} + +//----------------------------------------------------------------------------- + +static ibv_exp_res_domain *gds_mlx5_exp_create_res_domain(struct ibv_context *context) +{ + if (!context) { + gds_err("invalid context"); + return NULL; + } + + ibv_exp_res_domain_init_attr res_domain_attr; + memset(&res_domain_attr, 0, sizeof(res_domain_attr)); + + res_domain_attr.comp_mask |= IBV_EXP_RES_DOMAIN_THREAD_MODEL; + res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; + + ibv_exp_res_domain *res_domain = ibv_exp_create_res_domain(context, &res_domain_attr); + if (!res_domain) { + gds_warn("Can't create resource domain\n"); + } + + return res_domain; +} + +//----------------------------------------------------------------------------- + +gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( + struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, gds_peer *peer, gds_peer_attr *peer_attr, gds_alloc_cq_flags_t flags, + struct ibv_exp_res_domain *res_domain) +{ + gds_mlx5_exp_cq_t *gmexpcq = NULL; + ibv_exp_cq_init_attr attr; + int ret = 0; + + assert(context); + assert(peer); + assert(peer_attr); + + gmexpcq = (gds_mlx5_exp_cq_t *)calloc(1, sizeof(gds_mlx5_exp_cq_t)); + if (!gmexpcq) { + gds_err("cannot allocate memory\n"); + return NULL; + } + + peer->alloc_type = gds_peer::CQ; + peer->alloc_flags = flags; + + attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT; + attr.flags = 0; // see ibv_exp_cq_create_flags + static_assert(sizeof(gds_peer_attr) == sizeof(struct ibv_exp_peer_direct_attr)); + attr.peer_direct_attrs = (struct ibv_exp_peer_direct_attr *)(peer_attr); + if (res_domain) { + gds_dbg("using peer->res_domain %p for CQ\n", res_domain); + attr.res_domain = res_domain; + attr.comp_mask |= IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN; + gmexpcq->res_domain = res_domain; + } + + int old_errno = errno; + gmexpcq->gcq.cq = ibv_exp_create_cq(context, cqe, cq_context, channel, comp_vector, &attr); + if (!gmexpcq->gcq.cq) { + gds_err("error %d in ibv_exp_create_cq, old errno %d\n", errno, old_errno); + return NULL; + } + + return gmexpcq; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq) +{ + int retcode = 0; + int ret; + + if (!gmexpcq) + return retcode; + + if (gmexpcq->gcq.cq) { + ret = ibv_destroy_cq(gmexpcq->gcq.cq); + if (ret) { + gds_err("error %d in destroy_cq\n", ret); + retcode = ret; + } + } + + // res_domain will be destroyed in gds_mlx5_exp_destroy_qp. 
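+        // (The TX and RX CQs share the single res_domain created in
+        // gds_mlx5_exp_create_qp, so it cannot be released per-CQ without
+        // invalidating the sibling CQ; teardown is deferred to
+        // gds_mlx5_exp_destroy_qp.)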
+ + free(gmexpcq); + + return retcode; +} + + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_destroy_qp(gds_qp_t *gqp) +{ + int retcode = 0; + int ret; + + if (!gqp) + return retcode; + + gds_mlx5_exp_qp_t *gmexpqp = to_gds_mexp_qp(gqp); + + if (gmexpqp->gqp.qp) { + ret = ibv_destroy_qp(gmexpqp->gqp.qp); + if (ret) { + gds_err("error %d in destroy_qp\n", ret); + retcode = ret; + } + } + + if (gmexpqp->gqp.send_cq) { + ret = gds_mlx5_exp_destroy_cq(to_gds_mexp_cq(gmexpqp->gqp.send_cq)); + if (ret) { + gds_err("error %d in destroy_cq send_cq\n", ret); + retcode = ret; + } + } + + if (gmexpqp->gqp.recv_cq) { + ret = gds_mlx5_exp_destroy_cq(to_gds_mexp_cq(gmexpqp->gqp.recv_cq)); + if (ret) { + gds_err("error %d in destroy_cq recv_cq\n", ret); + retcode = ret; + } + } + + if (gmexpqp->res_domain) { + struct ibv_exp_destroy_res_domain_attr attr = {0,}; //IBV_EXP_DESTROY_RES_DOMAIN_RESERVED + ret = ibv_exp_destroy_res_domain(gmexpqp->gqp.dev_context, gmexpqp->res_domain, &attr); + if (ret) { + gds_err("ibv_exp_destroy_res_domain error %d: %s\n", ret, strerror(ret)); + retcode = ret; + } + } + + free(gmexpqp); + + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_create_qp( + struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, + gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp) +{ + int ret = 0; + gds_mlx5_exp_qp_t *gmexpqp = NULL; + struct ibv_qp *qp = NULL; + gds_mlx5_exp_cq_t *rx_gmexpcq = NULL, *tx_gmexpcq = NULL; + struct ibv_exp_qp_init_attr exp_qp_attr = {0,}; + int old_errno = errno; + + assert(pd); + assert(context); + assert(qp_attr); + assert(peer); + assert(peer_attr); + + gmexpqp = (gds_mlx5_exp_qp_t *)calloc(1, sizeof(gds_mlx5_exp_qp_t)); + if (!gmexpqp) { + ret = ENOMEM; + gds_err("cannot allocate memory\n"); + goto err; + } + + gmexpqp->gqp.dev_context = context; + + gmexpqp->res_domain = gds_mlx5_exp_create_res_domain(context); + if (gmexpqp->res_domain) + gds_dbg("using res_domain %p\n", gmexpqp->res_domain); + else + gds_warn("NOT using res_domain\n"); + + tx_gmexpcq = gds_mlx5_exp_create_cq( + context, qp_attr->cap.max_send_wr, NULL, NULL, 0, peer, peer_attr, + (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, + gmexpqp->res_domain + ); + if (!tx_gmexpcq) { + ret = errno; + gds_err("error %d while creating TX CQ, old_errno=%d\n", ret, old_errno); + goto err; + } + + rx_gmexpcq = gds_mlx5_exp_create_cq( + context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, peer, peer_attr, + (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? 
GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, + gmexpqp->res_domain + ); + if (!rx_gmexpcq) { + ret = errno; + gds_err("error %d while creating RX CQ\n", ret); + goto err; + } + + // peer registration + peer->alloc_type = gds_peer::WQ; + peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_DBREC_DEFAULT; + if (flags & GDS_CREATE_QP_WQ_ON_GPU) { + gds_err("error, QP WQ on GPU is not supported yet\n"); + goto err; + } + if (flags & GDS_CREATE_QP_WQ_DBREC_ON_GPU) { + gds_warn("QP WQ DBREC on GPU\n"); + peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; + } + + exp_qp_attr.send_cq = tx_gmexpcq->gcq.cq; + exp_qp_attr.recv_cq = rx_gmexpcq->gcq.cq; + exp_qp_attr.pd = pd; + exp_qp_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_PEER_DIRECT; + static_assert(sizeof(gds_peer_attr) == sizeof(struct ibv_exp_peer_direct_attr)); + exp_qp_attr.peer_direct_attrs = (struct ibv_exp_peer_direct_attr *)peer_attr; + exp_qp_attr.qp_type = qp_attr->qp_type; + + assert(sizeof(exp_qp_attr.cap) == sizeof(qp_attr->cap)); + + memcpy(&exp_qp_attr.cap, &qp_attr->cap, sizeof(qp_attr->cap)); + + qp = ibv_exp_create_qp(context, &exp_qp_attr); + if (!qp) { + ret = EINVAL; + gds_err("error in ibv_exp_create_qp\n"); + goto err; + } + + tx_gmexpcq->gcq.cq = qp->send_cq; + rx_gmexpcq->gcq.cq = qp->recv_cq; + + gmexpqp->gqp.qp = qp; + gmexpqp->gqp.send_cq = &tx_gmexpcq->gcq; + gmexpqp->gqp.recv_cq = &rx_gmexpcq->gcq; + + gds_dbg("created gds_mlx5_exp_qp=%p\n", gmexpqp); + + *gqp = &gmexpqp->gqp; + + return 0; + +err: + if (gmexpqp) { + gds_dbg("destroying QP\n"); + gds_mlx5_exp_destroy_qp(&gmexpqp->gqp); + } + + return ret; +} + + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_prepare_send(gds_qp_t *gqp, gds_send_wr *p_ewr, + gds_send_wr **bad_ewr, + gds_send_request_t *_request) +{ + int ret = 0; + + gds_mlx5_exp_qp_t *gmexpqp; + gds_mlx5_exp_send_request_t *request; + + assert(gqp); + assert(_request); + + gmexpqp = to_gds_mexp_qp(gqp); + request = to_gds_mexp_send_request(_request); + + ret = ibv_post_send(gmexpqp->gqp.qp, p_ewr, bad_ewr); + if (ret) { + + if (ret == ENOMEM) { + // out of space error can happen too often to report + gds_dbg("ENOMEM error %d in ibv_post_send\n", ret); + } else { + gds_err("error %d in ibv_post_send\n", ret); + } + goto out; + } + + ret = ibv_exp_peer_commit_qp(gmexpqp->gqp.qp, &request->commit); + if (ret) { + gds_err("error %d in ibv_exp_peer_commit_qp\n", ret); + goto out; + } +out: + return ret; +} + +//----------------------------------------------------------------------------- + +void gds_mlx5_exp_init_send_info(gds_send_request_t *_info) +{ + gds_mlx5_exp_send_request_t *info; + + assert(_info); + info = to_gds_mexp_send_request(_info); + + gds_dbg("send_request=%p\n", info); + + info->commit.storage = info->wr; + info->commit.entries = sizeof(info->wr)/sizeof(info->wr[0]); + gds_mlx5_exp_init_ops(info->commit.storage, info->commit.entries); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_send_request_t *_info, gds_op_list_t &ops) +{ + gds_mlx5_exp_send_request_t *info; + + assert(peer); + assert(_info); + + info = to_gds_mexp_send_request(_info); + return gds_mlx5_exp_post_ops(peer, info->commit.entries, info->commit.storage, ops, 0); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_post_send_ops_on_cpu(gds_send_request_t *_info, int flags) +{ + gds_mlx5_exp_send_request_t 
*info; + + assert(_info); + + info = to_gds_mexp_send_request(_info); + return gds_mlx5_exp_post_ops_on_cpu(info->commit.entries, info->commit.storage, flags); +} + +//----------------------------------------------------------------------------- + +void gds_mlx5_exp_init_wait_request(gds_wait_request_t *_request, uint32_t offset) +{ + gds_mlx5_exp_wait_request_t *request; + + assert(_request); + request = to_gds_mexp_wait_request(_request); + + gds_dbg("wait_request=%p offset=%08x\n", request, offset); + request->peek.storage = request->wr; + request->peek.entries = sizeof(request->wr)/sizeof(request->wr[0]); + request->peek.whence = IBV_EXP_PEER_PEEK_ABSOLUTE; + request->peek.offset = offset; + gds_mlx5_exp_init_ops(request->peek.storage, request->peek.entries); +} + +//----------------------------------------------------------------------------- + +static void gds_mlx5_exp_dump_ops(struct peer_op_wr *op, size_t count) +{ + size_t n = 0; + for (; op; op = op->next, ++n) { + gds_dbg("op[%zu] type:%d\n", n, op->type); + switch(op->type) { + case IBV_EXP_PEER_OP_FENCE: { + gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + break; + } + case IBV_EXP_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + gds_dbg("STORE_QWORD data:%x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + op->wr.dword_va.data, op->wr.dword_va.target_id, + op->wr.dword_va.offset, dev_ptr); + break; + } + case IBV_EXP_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + gds_dbg("STORE_QWORD data:%" PRIx64 " target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + op->wr.qword_va.data, op->wr.qword_va.target_id, + op->wr.qword_va.offset, dev_ptr); + break; + } + case IBV_EXP_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + gds_dbg("COPY_BLOCK src:%p len:%zu target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + op->wr.copy_op.src, op->wr.copy_op.len, + op->wr.copy_op.target_id, op->wr.copy_op.offset, + dev_ptr); + break; + } + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + gds_dbg("%s data:%08x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + (op->type==IBV_EXP_PEER_OP_POLL_AND_DWORD) ? 
"POLL_AND_DW" : "POLL_NOR_SDW", + op->wr.dword_va.data, + op->wr.dword_va.target_id, + op->wr.dword_va.offset, + dev_ptr); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + break; + } + } + + assert(count == n); +} + +//----------------------------------------------------------------------------- + +void gds_mlx5_exp_dump_wait_request(gds_wait_request_t *_request, size_t idx) +{ + gds_mlx5_exp_wait_request_t *request; + struct ibv_exp_peer_peek *peek; + + assert(_request); + request = to_gds_mexp_wait_request(_request); + peek = &request->peek; + gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%" PRIx64 " comp_mask:%08x\n", + idx, peek->entries, peek->whence, peek->offset, + peek->peek_id, peek->comp_mask); + gds_mlx5_exp_dump_ops(peek->storage, peek->entries); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_prepare_wait_cq(gds_cq_t *gcq, gds_wait_request_t *_request, int flags) +{ + int retcode = 0; + gds_mlx5_exp_cq_t *mexpcq; + gds_mlx5_exp_wait_request_t *request; + + assert(gcq); + assert(_request); + + mexpcq = to_gds_mexp_cq(gcq); + request = to_gds_mexp_wait_request(_request); + + retcode = ibv_exp_peer_peek_cq(mexpcq->gcq.cq, &request->peek); + if (retcode == ENOSPC) { + // TODO: handle too few entries + gds_err("not enough ops in peer_peek_cq\n"); + goto out; + } else if (retcode) { + gds_err("error %d in peer_peek_cq\n", retcode); + goto out; + } + //gds_dump_wait_request(request, 1); + out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_append_wait_cq(gds_wait_request_t *_request, uint32_t *dw, uint32_t val) +{ + int ret = 0; + unsigned MAX_NUM_ENTRIES; + unsigned n; + struct peer_op_wr *wr; + gds_mlx5_exp_wait_request_t *request; + + assert(_request); + + request = to_gds_mexp_wait_request(_request); + MAX_NUM_ENTRIES = sizeof(request->wr) / sizeof(request->wr[0]); + n = request->peek.entries; + wr = request->peek.storage; + + if (n + 1 > MAX_NUM_ENTRIES) { + gds_err("no space left to stuff a poke\n"); + ret = ENOMEM; + goto out; + } + + // at least 1 op + assert(n); + assert(wr); + + for (; n; --n) + wr = wr->next; + + assert(wr); + + wr->type = IBV_EXP_PEER_OP_STORE_DWORD; + wr->wr.dword_va.data = val; + wr->wr.dword_va.target_id = 0; // direct mapping, offset IS the address + wr->wr.dword_va.offset = (ptrdiff_t)(dw-(uint32_t*)0); + + ++request->peek.entries; + +out: + return ret; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_abort_wait_cq(gds_cq_t *gcq, gds_wait_request_t *_request) +{ + struct ibv_exp_peer_abort_peek abort_ctx; + gds_mlx5_exp_cq_t *gmexpcq; + gds_mlx5_exp_wait_request_t *request; + + assert(gcq); + assert(_request); + + gmexpcq = to_gds_mexp_cq(gcq); + request = to_gds_mexp_wait_request(_request); + + abort_ctx.peek_id = request->peek.peek_id; + abort_ctx.comp_mask = 0; + return ibv_exp_peer_abort_peek_cq(gmexpcq->gcq.cq, &abort_ctx); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, gds_wait_request_t *_request, gds_op_list_t ¶ms, int flags) +{ + int ret = 0; + gds_mlx5_exp_wait_request_t *request; + + assert(peer); + assert(_request); + + request = to_gds_mexp_wait_request(_request); + + ret = gds_mlx5_exp_post_ops(peer, request->peek.entries, request->peek.storage, params, flags); + if (ret) + gds_err("error %d in 
gds_mlx5_exp_post_ops\n", ret); + + return ret; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_post_wait_descriptor(gds_wait_request_t *_request, int flags) +{ + int ret = 0; + gds_mlx5_exp_wait_request_t *request; + + assert(_request); + request = to_gds_mexp_wait_request(_request); + + ret = gds_mlx5_exp_post_ops_on_cpu(request->peek.entries, request->peek.storage, flags); + if (ret) + gds_err("error %d in gds_mlx5_exp_post_ops_on_cpu\n", ret); + + return ret; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *_request) +{ + int retcode = 0; + const gds_mlx5_exp_wait_request_t *request = to_gds_mexp_wait_request(_request); + size_t n_ops = request->peek.entries; + peer_op_wr *op = request->peek.storage; + size_t n = 0; + + memset(mlx5_i, 0, sizeof(*mlx5_i)); + + for (; op && n < n_ops; op = op->next, ++n) { + switch(op->type) { + case IBV_EXP_PEER_OP_FENCE: { + gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + break; + } + if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + case IBV_EXP_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); + if (n != 1) { + gds_err("store DWORD is not 2nd op\n"); + retcode = EINVAL; + break; + } + mlx5_i->flag_ptr = (uint32_t*)dev_ptr; + mlx5_i->flag_value = data; + break; + } + case IBV_EXP_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); + gds_err("unsupported QWORD op\n"); + retcode = EINVAL; + break; + } + case IBV_EXP_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + gds_err("unsupported COPY_BLOCK\n"); + retcode = EINVAL; + break; + } + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + + gds_dbg("OP_POLL_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); + + mlx5_i->cqe_ptr = (uint32_t *)dev_ptr; + mlx5_i->cqe_value = data; + + switch(op->type) { + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: + // GPU SMs can always do NOR + mlx5_i->cond = GDS_WAIT_COND_NOR; + break; + case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + mlx5_i->cond = GDS_WAIT_COND_GEQ; + break; + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + mlx5_i->cond = GDS_WAIT_COND_AND; + 
break; + default: + gds_err("unexpected op type\n"); + retcode = EINVAL; + goto err; + } + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + err: + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + break; + } + } + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_rollback_qp(gds_qp_t *gqp, gds_send_request_t *request) +{ + struct ibv_exp_rollback_ctx rollback; + int ret = 0; + enum ibv_exp_rollback_flags flag = IBV_EXP_ROLLBACK_ABORT_LATE; + gds_mlx5_exp_send_request_t *send_info; + + gds_mlx5_exp_qp_t *gmexpqp; + + assert(gqp); + assert(gqp->qp); + assert(request); + + gmexpqp = to_gds_mexp_qp(gqp); + send_info = to_gds_mexp_send_request(request); + + /* from ibv_exp_peer_commit call */ + rollback.rollback_id = send_info->commit.rollback_id; + /* from ibv_exp_rollback_flag */ + rollback.flags = flag; + /* Reserved for future expensions, must be 0 */ + rollback.comp_mask = 0; + gds_warn("Need to rollback WQE %lx\n", rollback.rollback_id); + ret = ibv_exp_rollback_qp(gmexpqp->gqp.qp, &rollback); + if (ret) + gds_err("error %d in ibv_exp_rollback_qp\n", ret); + +out: + return ret; +} + +//----------------------------------------------------------------------------- + +uint32_t gds_mlx5_exp_get_num_wait_request_entries(gds_wait_request_t *request) { + gds_mlx5_exp_wait_request_t *gmexp_request; + assert(request); + gmexp_request = to_gds_mexp_wait_request(request); + return gmexp_request->peek.entries; +} + +//----------------------------------------------------------------------------- + +uint32_t gds_mlx5_exp_get_num_send_request_entries(gds_send_request_t *request) { + gds_mlx5_exp_send_request_t *gmexp_request; + assert(request); + gmexp_request = to_gds_mexp_send_request(request); + return gmexp_request->commit.entries; +} + +//----------------------------------------------------------------------------- + +int gds_transport_mlx5_exp_init(gds_transport_t **transport) +{ + int status = 0; + + gds_transport_t *t = (gds_transport_t *)calloc(1, sizeof(gds_transport_t)); + if (!t) { + status = ENOMEM; + goto out; + } + + t->create_qp = gds_mlx5_exp_create_qp; + t->destroy_qp = gds_mlx5_exp_destroy_qp; + t->rollback_qp = gds_mlx5_exp_rollback_qp; + + t->init_send_info = gds_mlx5_exp_init_send_info; + t->post_send_ops = gds_mlx5_exp_post_send_ops; + t->post_send_ops_on_cpu = gds_mlx5_exp_post_send_ops_on_cpu; + t->prepare_send = gds_mlx5_exp_prepare_send; + t->get_send_descs = gds_mlx5_exp_get_send_descs; + t->get_num_send_request_entries = gds_mlx5_exp_get_num_send_request_entries; + + t->init_wait_request = gds_mlx5_exp_init_wait_request; + t->dump_wait_request = gds_mlx5_exp_dump_wait_request; + t->stream_post_wait_descriptor = gds_mlx5_exp_stream_post_wait_descriptor; + t->post_wait_descriptor = gds_mlx5_exp_post_wait_descriptor; + t->get_wait_descs = gds_mlx5_exp_get_wait_descs; + t->get_num_wait_request_entries = gds_mlx5_exp_get_num_wait_request_entries; + + t->prepare_wait_cq = gds_mlx5_exp_prepare_wait_cq; + t->append_wait_cq = gds_mlx5_exp_append_wait_cq; + t->abort_wait_cq = gds_mlx5_exp_abort_wait_cq; + + *transport = t; + +out: + return status; +} + diff --git a/src/transports/mlx5-exp/mlx5-exp.hpp b/src/transports/mlx5-exp/mlx5-exp.hpp new file mode 100644 index 0000000..861c12e --- /dev/null +++ b/src/transports/mlx5-exp/mlx5-exp.hpp @@ -0,0 +1,64 @@ +#include +#include +#include + +#include +#include +#include + +#include 
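+// The types below overlay exp-verbs-specific request state on top of the
+// opaque gds_send_request_t/gds_wait_request_t storage via plain casts
+// (see to_gds_mexp_send_request/to_gds_mexp_wait_request); the
+// static_asserts pin each wrapper to a 64-byte-aligned size no larger than
+// the generic type so that the overlay stays legal.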
+#include + +#include "objs.hpp" +#include "utils.hpp" + +typedef struct gds_mlx5_exp_cq { + gds_cq_t gcq; + ibv_exp_res_domain *res_domain; +} gds_mlx5_exp_cq_t; + +typedef struct gds_mlx5_exp_qp { + gds_qp_t gqp; + ibv_exp_res_domain *res_domain; +} gds_mlx5_exp_qp_t; + +typedef struct gds_mlx5_exp_send_request { + struct ibv_exp_peer_commit commit; + struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS]; + uint8_t pad1[32]; +} gds_mlx5_exp_send_request_t; +static_assert(sizeof(gds_mlx5_exp_send_request_t) % 64 == 0, "gds_mlx5_exp_send_request_t must be 64-byte aligned."); +static_assert(sizeof(gds_mlx5_exp_send_request_t) <= sizeof(gds_send_request_t), "The size of gds_mlx5_exp_send_request_t must be less than or equal to that of gds_send_request_t."); + +typedef struct gds_mlx5_exp_wait_request { + struct ibv_exp_peer_peek peek; + struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS]; + uint8_t pad1[24]; +} gds_mlx5_exp_wait_request_t; +static_assert(sizeof(gds_mlx5_exp_wait_request_t) % 64 == 0, "gds_mlx5_exp_wait_request_t must be 64-byte aligned."); +static_assert(sizeof(gds_mlx5_exp_wait_request_t) <= sizeof(gds_wait_request_t), "The size of gds_mlx5_exp_wait_request_t must be less than or equal to that of gds_wait_request_t."); + +static inline gds_mlx5_exp_cq_t *to_gds_mexp_cq(gds_cq_t *gcq) { + return container_of(gcq, gds_mlx5_exp_cq_t, gcq); +} + +static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) { + return container_of(gqp, gds_mlx5_exp_qp_t, gqp); +} + +static inline gds_mlx5_exp_send_request_t *to_gds_mexp_send_request(gds_send_request_t *gsreq) { + return (gds_mlx5_exp_send_request_t *)(gsreq); +} + +static inline const gds_mlx5_exp_send_request_t *to_gds_mexp_send_request(const gds_send_request_t *gsreq) { + return (const gds_mlx5_exp_send_request_t *)to_gds_mexp_send_request((const gds_send_request_t *)gsreq); +} + +static inline gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(gds_wait_request_t *gwreq) { + return (gds_mlx5_exp_wait_request_t *)(gwreq); +} + +static inline const gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(const gds_wait_request_t *gwreq) { + return (const gds_mlx5_exp_wait_request_t *)to_gds_mexp_wait_request((const gds_wait_request_t *)gwreq); +} + diff --git a/src/utils.hpp b/src/utils.hpp index b501bda..dccb125 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -31,6 +31,7 @@ #warning "__STDC_FORMAT_MACROS should be defined to pull definition of PRIx64, etc" #endif #include // to pull PRIx64 +#include // internal assert function @@ -191,6 +192,10 @@ typedef enum gds_alloc_qp_flags { #include +// TODO: use correct value +// TODO: make it dependent upon the particular GPU +const size_t GDS_GPU_MAX_INLINE_SIZE = 256; + typedef std::vector gds_op_list_t; struct gds_cq *gds_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags); @@ -205,9 +210,13 @@ struct gds_peer; int gds_fill_membar(gds_peer *peer, gds_op_list_t ¶m, int flags); int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t ¶m, void *ptr, const void *data, size_t n_bytes, int flags); +int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags); int gds_fill_poke(gds_peer *peer, gds_op_list_t ¶m, uint32_t *ptr, uint32_t value, int flags); +int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint32_t value, int flags); int gds_fill_poke64(gds_peer *peer, gds_op_list_t ¶m, uint64_t *ptr, uint64_t value, int flags); 
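+// The CUdeviceptr overloads added here feed the gds_op_list_t later flushed
+// by gds_stream_batch_ops(). A minimal sketch of what a 32-bit poke can
+// translate to (illustrative only; the real implementation lives in
+// gdsync.cpp, and the GDS_WRITE_PRE_BARRIER mapping shown is an assumption):
+//
+//     int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops,
+//                       CUdeviceptr addr, uint32_t value, int flags)
+//     {
+//             CUstreamBatchMemOpParams param;
+//             memset(&param, 0, sizeof(param));
+//             param.writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+//             param.writeValue.address = addr;
+//             param.writeValue.value = value;
+//             param.writeValue.flags = (flags & GDS_WRITE_PRE_BARRIER) ?
+//                     CU_STREAM_WRITE_VALUE_DEFAULT :  // ordered write
+//                     CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
+//             ops.push_back(param);
+//             return 0;
+//     }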
+int gds_fill_poke64(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags); int gds_fill_poll(gds_peer *peer, gds_op_list_t ¶m, uint32_t *ptr, uint32_t magic, int cond_flag, int flags); +int gds_fill_poll(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags); int gds_stream_batch_ops(gds_peer *peer, CUstream stream, gds_op_list_t ¶ms, int flags); @@ -216,10 +225,11 @@ enum gds_post_ops_flags { }; struct gds_peer; -int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t ¶ms, int post_flags = 0); -int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op, int post_flags = 0); gds_peer *peer_from_stream(CUstream stream); +bool gds_simulate_write64(); +void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param); + //----------------------------------------------------------------------------- /* diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index 63875bf..04370f5 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -495,7 +495,7 @@ static int pp_wait_cq(struct pingpong_context *ctx, int is_client) { int ret; if (ctx->peersync) { - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); + ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); } else { if (is_client) { do { @@ -542,23 +542,22 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; #if 0 if (IBV_QPT_UD != gds_qpt) { memset(&ewr, 0, sizeof(ewr)); ewr.num_sge = 1; - ewr.exp_send_flags = IBV_EXP_SEND_SIGNALED; - ewr.exp_opcode = IBV_EXP_WR_SEND; + ewr.send_flags = IBV_SEND_SIGNALED; + ewr.opcode = IBV_WR_SEND; ewr.wr_id = PINGPONG_SEND_WRID; ewr.sg_list = &list; ewr.next = NULL; @@ -580,23 +579,22 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; if (IBV_QPT_UD != gds_qpt) { memset(&ewr, 0, sizeof(ewr)); ewr.num_sge = 1; - ewr.exp_send_flags = IBV_EXP_SEND_SIGNALED; - ewr.exp_opcode = IBV_EXP_WR_SEND; + ewr.send_flags = IBV_SEND_SIGNALED; + ewr.opcode = IBV_WR_SEND; ewr.wr_id = PINGPONG_SEND_WRID; ewr.sg_list = &list; ewr.next = NULL; @@ -676,7 +674,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].tag = GDS_TAG_SEND; wdesc->descs[k].send = &wdesc->send_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); if (ret) { retcode = -ret; break; @@ -685,7 +683,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].tag = GDS_TAG_WAIT; wdesc->descs[k].wait = &wdesc->wait_tx_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); if (ret) { retcode = -ret; break; @@ -715,14 +713,14 
diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c
index 63875bf..04370f5 100644
--- a/tests/gds_kernel_latency.c
+++ b/tests/gds_kernel_latency.c
@@ -495,7 +495,7 @@ static int pp_wait_cq(struct pingpong_context *ctx, int is_client)
 {
         int ret;
         if (ctx->peersync) {
-                ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
+                ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
         } else {
                 if (is_client) {
                         do {
@@ -542,23 +542,22 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream
                 .wr_id = PINGPONG_SEND_WRID,
                 .sg_list = &list,
                 .num_sge = 1,
-                .exp_opcode = IBV_EXP_WR_SEND,
-                .exp_send_flags = IBV_EXP_SEND_SIGNALED,
+                .opcode = IBV_WR_SEND,
+                .send_flags = IBV_SEND_SIGNALED,
                 .wr = {
                         .ud = {
                                 .ah = ctx->ah,
                                 .remote_qpn = qpn,
                                 .remote_qkey = 0x11111111
                         }
-                },
-                .comp_mask = 0
+                }
         };
 #if 0
         if (IBV_QPT_UD != gds_qpt) {
                 memset(&ewr, 0, sizeof(ewr));
                 ewr.num_sge = 1;
-                ewr.exp_send_flags = IBV_EXP_SEND_SIGNALED;
-                ewr.exp_opcode = IBV_EXP_WR_SEND;
+                ewr.send_flags = IBV_SEND_SIGNALED;
+                ewr.opcode = IBV_WR_SEND;
                 ewr.wr_id = PINGPONG_SEND_WRID;
                 ewr.sg_list = &list;
                 ewr.next = NULL;
@@ -580,23 +579,22 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s
                 .wr_id = PINGPONG_SEND_WRID,
                 .sg_list = &list,
                 .num_sge = 1,
-                .exp_opcode = IBV_EXP_WR_SEND,
-                .exp_send_flags = IBV_EXP_SEND_SIGNALED,
+                .opcode = IBV_WR_SEND,
+                .send_flags = IBV_SEND_SIGNALED,
                 .wr = {
                         .ud = {
                                 .ah = ctx->ah,
                                 .remote_qpn = qpn,
                                 .remote_qkey = 0x11111111
                         }
-                },
-                .comp_mask = 0
+                }
         };

         if (IBV_QPT_UD != gds_qpt) {
                 memset(&ewr, 0, sizeof(ewr));
                 ewr.num_sge = 1;
-                ewr.exp_send_flags = IBV_EXP_SEND_SIGNALED;
-                ewr.exp_opcode = IBV_EXP_WR_SEND;
+                ewr.send_flags = IBV_SEND_SIGNALED;
+                ewr.opcode = IBV_WR_SEND;
                 ewr.wr_id = PINGPONG_SEND_WRID;
                 ewr.sg_list = &list;
                 ewr.next = NULL;
@@ -676,7 +674,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                         wdesc->descs[k].tag = GDS_TAG_SEND;
                         wdesc->descs[k].send = &wdesc->send_rq;
                         ++k;
-                        ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
+                        ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
                         if (ret) {
                                 retcode = -ret;
                                 break;
@@ -685,7 +683,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                         wdesc->descs[k].tag = GDS_TAG_WAIT;
                         wdesc->descs[k].wait = &wdesc->wait_tx_rq;
                         ++k;
-                        ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
+                        ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
                         if (ret) {
                                 retcode = -ret;
                                 break;
@@ -715,14 +713,14 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                                 retcode = -ret;
                                 break;
                         }
-                        ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->send_cq, 0);
+                        ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->send_cq, 0);
                         if (ret) {
                                 // TODO: rollback gpu send
                                 gpu_err("error %d in gds_stream_wait_cq\n", ret);
                                 retcode = -ret;
                                 break;
                         }
-                        ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
+                        ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
                         if (ret) {
                                 // TODO: rollback gpu send and wait send_cq
                                 gpu_err("[%d] error %d in gds_stream_wait_cq\n", my_rank, ret);
@@ -751,7 +749,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                 if (ctx->use_desc_apis) {
                         work_desc_t *wdesc = calloc(1, sizeof(*wdesc));
                         int k = 0;
-                        ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
+                        ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
                         if (ret) {
                                 retcode = -ret;
                                 break;
@@ -773,7 +771,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                                 CUCHECK(cuStreamAddCallback(gpu_stream, post_work_cb, wdesc, 0));
                         }
                 } else if (ctx->peersync) {
-                        ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
+                        ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
                         if (ret) {
                                 // TODO: rollback gpu send and wait send_cq
                                 gpu_err("error %d in gds_stream_wait_cq\n", ret);
@@ -806,7 +804,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                         wdesc->descs[k].tag = GDS_TAG_SEND;
                         wdesc->descs[k].send = &wdesc->send_rq;
                         ++k;
-                        ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
+                        ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
                         if (ret) {
                                 retcode = -ret;
                                 break;
@@ -835,7 +833,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                                 retcode = -ret;
                                 break;
                         }
-                        ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->send_cq, 0);
+                        ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->send_cq, 0);
                         if (ret) {
                                 // TODO: rollback gpu send
                                 gpu_err("error %d in gds_stream_wait_cq\n", ret);
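The hunks above are mechanical fallout of two interface changes: the experimental `IBV_EXP_*` work-request constants give way to their mainline `IBV_*` equivalents, and the `send_cq`/`recv_cq` members of `gds_qp` become pointers, so call sites drop the `&`. For orientation, here is a hedged sketch of the descriptor-based wait pattern the test exercises when `use_desc_apis` is set; the entry points and types are taken from the calls visible above, but the exact signatures are assumed:

```cpp
#include <gdsync.h>

// Sketch: enqueue a wait on the send CQ onto a CUDA stream via the descriptor API.
static int wait_on_send_cq(CUstream stream, struct gds_qp *qp)
{
        gds_wait_request_t wait_rq;
        gds_descriptor_t desc;
        int ret;

        // send_cq is now a pointer member, hence no '&' (cf. the hunks above).
        ret = gds_prepare_wait_cq(qp->send_cq, &wait_rq, 0);
        if (ret)
                return ret;

        desc.tag = GDS_TAG_WAIT;
        desc.wait = &wait_rq;

        // Post the prepared wait as a one-entry batch on the stream.
        return gds_stream_post_descriptors(stream, 1, &desc, 0);
}
```

The same tag/union scheme covers sends (`GDS_TAG_SEND` with `desc.send`), which is how the test packs a send and two CQ waits into a single `wdesc` batch.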
diff --git a/tests/gds_kernel_loopback_latency.c b/tests/gds_kernel_loopback_latency.c
index b2d209c..f6ccc32 100644
--- a/tests/gds_kernel_loopback_latency.c
+++ b/tests/gds_kernel_loopback_latency.c
@@ -511,16 +511,15 @@ static int pp_post_send(struct pingpong_context *ctx, uint32_t qpn)
                 .wr_id = PINGPONG_SEND_WRID,
                 .sg_list = &list,
                 .num_sge = 1,
-                .exp_opcode = IBV_EXP_WR_SEND,
-                .exp_send_flags = IBV_EXP_SEND_SIGNALED,
+                .opcode = IBV_WR_SEND,
+                .send_flags = IBV_SEND_SIGNALED,
                 .wr = {
                         .ud = {
                                 .ah = ctx->ah,
                                 .remote_qpn = qpn,
                                 .remote_qkey = 0x11111111
                         }
-                },
-                .comp_mask = 0
+                }
         };
         gds_send_wr *bad_ewr;
         return gds_post_send(ctx->gds_qp, &ewr, &bad_ewr);
@@ -538,16 +537,15 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream
                 .wr_id = PINGPONG_SEND_WRID,
                 .sg_list = &list,
                 .num_sge = 1,
-                .exp_opcode = IBV_EXP_WR_SEND,
-                .exp_send_flags = IBV_EXP_SEND_SIGNALED,
+                .opcode = IBV_WR_SEND,
+                .send_flags = IBV_SEND_SIGNALED,
                 .wr = {
                         .ud = {
                                 .ah = ctx->ah,
                                 .remote_qpn = qpn,
                                 .remote_qkey = 0x11111111
                         }
-                },
-                .comp_mask = 0
+                }
         };
         gds_send_wr *bad_ewr;
         return gds_stream_queue_send(*p_gpu_stream, ctx->gds_qp, &ewr, &bad_ewr);
@@ -565,16 +563,15 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s
                 .wr_id = PINGPONG_SEND_WRID,
                 .sg_list = &list,
                 .num_sge = 1,
-                .exp_opcode = IBV_EXP_WR_SEND,
-                .exp_send_flags = IBV_EXP_SEND_SIGNALED,
+                .opcode = IBV_WR_SEND,
+                .send_flags = IBV_SEND_SIGNALED,
                 .wr = {
                         .ud = {
                                 .ah = ctx->ah,
                                 .remote_qpn = qpn,
                                 .remote_qkey = 0x11111111
                         }
-                },
-                .comp_mask = 0
+                }
         };
         gds_send_wr *bad_ewr;
         //printf("gpu_post_send_on_stream\n");
@@ -655,7 +652,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                         wdesc->descs[k].send = &wdesc->send_rq;
                         ++k;

-                        ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
+                        ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0);
                         if (ret) {
                                 retcode = -ret;
                                 break;
@@ -665,7 +662,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                         wdesc->descs[k].tag = GDS_TAG_WAIT;
                         wdesc->descs[k].wait = &wdesc->wait_tx_rq;
                         ++k;
-                        ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
+                        ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0);
                         if (ret) {
                                 retcode = -ret;
                                 break;
@@ -697,7 +694,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                                 break;
                         }

-                        ret = gds_stream_wait_cq(gpu_stream_server, &ctx->gds_qp->send_cq, 0);
+                        ret = gds_stream_wait_cq(gpu_stream_server, ctx->gds_qp->send_cq, 0);
                         if (ret) {
                                 // TODO: rollback gpu send
                                 gpu_err("error %d in gds_stream_wait_cq\n", ret);
@@ -705,7 +702,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
                                 break;
                         }

-                        ret = gds_stream_wait_cq(gpu_stream_server, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
+                        ret = gds_stream_wait_cq(gpu_stream_server, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe);
                         if (ret) {
                                 // TODO: rollback gpu send and wait send_cq
                                 gpu_err("error %d in gds_stream_wait_cq\n", ret);
diff --git a/tests/gds_poll_lat.c b/tests/gds_poll_lat.c
index cf2147b..17c4fdb 100644
--- a/tests/gds_poll_lat.c
+++ b/tests/gds_poll_lat.c
@@ -11,7 +11,6 @@
 #include
 #include

-#include
 #include
 #include
 #include
diff --git a/tests/gds_sanity.cpp b/tests/gds_sanity.cpp
index 910032b..5394833 100644
--- a/tests/gds_sanity.cpp
+++ b/tests/gds_sanity.cpp
@@ -14,7 +14,6 @@
 #include
 #include

-#include
 #include
 #include
 #include
diff --git a/tests/gpu.cpp b/tests/gpu.cpp
index 6d7da67..8bee5e3 100644
--- a/tests/gpu.cpp
+++ b/tests/gpu.cpp
@@ -29,7 +29,6 @@
 #include
 #include

-#include

 #include "gdrapi.h"
 #include "gdsync.h"
diff --git a/tests/gpu.h b/tests/gpu.h
index 401e88f..77a6db4 100644
--- a/tests/gpu.h
+++ b/tests/gpu.h
@@ -27,9 +27,18 @@

 #pragma once

+#include
 #include
-#include

+#undef BEGIN_C_DECLS
+#undef END_C_DECLS
+#ifdef __cplusplus
+# define BEGIN_C_DECLS extern "C" {
+# define END_C_DECLS }
+#else
+# define BEGIN_C_DECLS
+# define END_C_DECLS
+#endif

 #ifdef USE_PROFILE
 #include
diff --git a/tests/pingpong.h b/tests/pingpong.h
index 32f020b..9cdc03e 100644
--- a/tests/pingpong.h
+++ b/tests/pingpong.h
@@ -34,7 +34,6 @@
 #define IBV_PINGPONG_H

 #include
-#include

 enum ibv_mtu pp_mtu_to_enum(int mtu);
 uint16_t pp_get_local_lid(struct ibv_context *context, int port);
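Finally, `tests/gpu.h` now defines the `BEGIN_C_DECLS`/`END_C_DECLS` linkage guards itself rather than inheriting them from the removed include. The pattern is the standard one for headers shared between C and C++ translation units; a small usage sketch (the prototype is hypothetical, for illustration only):

```cpp
// In C++ the macros wrap declarations in an extern "C" block; in plain C they
// expand to nothing, so the same header works for both compilers.
#undef BEGIN_C_DECLS
#undef END_C_DECLS
#ifdef __cplusplus
# define BEGIN_C_DECLS extern "C" {
# define END_C_DECLS }
#else
# define BEGIN_C_DECLS
# define END_C_DECLS
#endif

BEGIN_C_DECLS
// Prototypes here get C linkage, so tests/*.c and tests/*.cpp can share them.
int gpu_init(int gpu_id);   // hypothetical prototype, not from the diff
END_C_DECLS
```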