From ce78eb62a3684355829e159943235fc2c045b3ca Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Wed, 4 Aug 2021 01:20:19 -0400
Subject: [PATCH 01/50] Added --with-spectrum-mpi to configure

---
 Makefile.am  |  6 +++---
 configure.ac | 44 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index a66dfda..ac9a0cc 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -36,7 +36,7 @@ bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loop
 noinst_PROGRAMS = tests/rstest tests/wqtest
 
 tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp
-tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
+tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la $(MPILDFLAGS) $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
 
 tests_rstest_SOURCES = tests/rstest.cpp
 tests_rstest_LDADD =
@@ -45,10 +45,10 @@ tests_wqtest_SOURCES = tests/task_queue_test.cpp
 tests_wqtest_LDADD = $(PTHREAD_LIBS)
 
 tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu
-tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
+tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(MPILDFLAGS) $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
 
 tests_gds_sanity_SOURCES = tests/gds_sanity.cpp tests/gpu.cpp tests/gpu_kernels.cu
-tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
+tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(MPILDFLAGS) $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
 
 tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu
 tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
diff --git a/configure.ac b/configure.ac
index a79aed6..e20f313 100644
--- a/configure.ac
+++ b/configure.ac
@@ -93,25 +93,54 @@ else
     AC_SUBST(LIBGDSTOOLS)
 fi
 
-AC_ARG_WITH([mpi],
-    AC_HELP_STRING([--with-mpi], [ Set path to mpi installation ]))
-if test x$with_mpi = x || test x$with_mpi = xno; then
+AC_ARG_WITH([spectrum-mpi],
+    AC_HELP_STRING([--with-spectrum-mpi], [ Set path to Spectrum MPI installation ]))
+if test x$with_spectrum_mpi = x || test x$with_spectrum_mpi = xno; then
     # assuming system location
     mpi_home=/usr
-    MPICC=$with_home/bin/mpicc
-    MPICXX=$with_home/bin/mpic++
+    MPICC=${mpi_home}/bin/mpicc
+    MPICXX=${mpi_home}/bin/mpic++
+    MPILDFLAGS="-lmpi_ibm"
 else
-    if test -d $with_mpi; then
-        mpi_home=$with_mpi
+    if test -d $with_spectrum_mpi; then
+        mpi_home=$with_spectrum_mpi
         MPICC=${mpi_home}/bin/mpicc
         MPICXX=${mpi_home}/bin/mpic++
         CPPFLAGS="$CPPFLAGS -I${mpi_home}/include"
         LDFLAGS="$LDFLAGS -L${mpi_home}/lib -L${mpi_home}/lib64"
+        MPILDFLAGS="-lmpi_ibm"
     else
         echo "MPI dir does not exist"
     fi
 fi
 
+AC_ARG_WITH([mpi],
+    AC_HELP_STRING([--with-mpi], [ Set path to MPI installation ]))
+if test x$with_spectrum_mpi = x || test x$with_spectrum_mpi = xno; then
+    if test x$with_mpi = x || test x$with_mpi = xno; then
+        # assuming system location
+        mpi_home=/usr
+        MPICC=${mpi_home}/bin/mpicc
+        MPICXX=${mpi_home}/bin/mpic++
+        MPILDFLAGS="-lmpi"
+    else
+        if test -d $with_mpi; then
+            mpi_home=$with_mpi
+            MPICC=${mpi_home}/bin/mpicc
+
MPICXX=${mpi_home}/bin/mpic++ + CPPFLAGS="$CPPFLAGS -I${mpi_home}/include" + LDFLAGS="$LDFLAGS -L${mpi_home}/lib -L${mpi_home}/lib64" + MPILDFLAGS="-lmpi" + else + echo "MPI dir does not exist" + fi + fi +fi + +if test x$with_spectrum_mpi != x && test x$with_spectrum_mpi != xno && test x$with_mpi != x && test x$with_mpi != xno; then + AC_MSG_ERROR([--with-mpi and --with-spectrum-mpi are mutually exclusive.]) +fi + dnl Specify CUDA Location AC_ARG_WITH(cuda-toolkit, AC_HELP_STRING([--with-cuda-toolkit=CUDATKDIR], [ Specify CUDA toolkit installation directory (default: /usr/local/cuda)]), @@ -186,6 +215,7 @@ AC_MSG_NOTICE([Setting MPI_PATH = ${mpi_home} ]) AC_SUBST( MPI_PATH, [${mpi_home} ]) AC_SUBST( MPICC, [${MPICC} ]) AC_SUBST( MPICXX, [${MPICXX} ]) +AC_SUBST( MPILDFLAGS, [${MPILDFLAGS} ]) CPPFLAGS="$CPPFLAGS -I$CUDA_DRV_PATH/include -I$CUDA_PATH/include" LDFLAGS="$LDFLAGS -L$CUDA_DRV_PATH/lib64 -L$CUDA_DRV_PATH/lib -L$CUDA_PATH/lib64 -L$CUDA_PATH/lib" From 66409ac64785c884889e3d85df10a3a34bc2e8f9 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Wed, 4 Aug 2021 21:42:55 -0400 Subject: [PATCH 02/50] Introduced GDS_DRIVER_TYPE to gds_qp and gds_cq --- include/gdsync/core.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index 7ff0cbb..87a4cc3 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -40,26 +40,33 @@ ((((v) & 0x0000ffffU) >> 0 ) >= (unsigned)GDS_API_MINOR_VERSION) ) typedef enum gds_param { - GDS_PARAM_VERSION, - GDS_NUM_PARAMS + GDS_PARAM_VERSION, + GDS_NUM_PARAMS } gds_param_t; int gds_query_param(gds_param_t param, int *value); enum gds_create_qp_flags { - GDS_CREATE_QP_DEFAULT = 0, - GDS_CREATE_QP_WQ_ON_GPU = 1<<0, - GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1, - GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2, - GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5, + GDS_CREATE_QP_DEFAULT = 0, + GDS_CREATE_QP_WQ_ON_GPU = 1<<0, + GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1, + GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2, + GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5, }; typedef struct ibv_exp_qp_init_attr gds_qp_init_attr_t; typedef struct ibv_exp_send_wr gds_send_wr; +typedef enum gds_driver_type { + GDS_DRIVER_TYPE_MLX5_EXP = 0, + GDS_DRIVER_TYPE_MLX5_DV, + GDS_DRIVER_TYPE_MLX5_DEVX +} gds_driver_type_t; + struct gds_cq { struct ibv_cq *cq; uint32_t curr_offset; + gds_driver_type_t dtype; }; struct gds_qp { @@ -68,6 +75,7 @@ struct gds_qp { struct gds_cq recv_cq; struct ibv_exp_res_domain * res_domain; struct ibv_context *dev_context; + gds_driver_type_t dtype; }; /* \brief: Create a peer-enabled QP attached to the specified GPU id. 
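A note on the pattern above: the dtype tag is what later patches in this
series key on to recover a backend-specific wrapper from a generic
gds_cq/gds_qp pointer. Below is a minimal sketch of that downcast,
assuming only core.h as patched above; example_mlx5_exp_cq and
to_example_mlx5_exp_cq are illustrative names, not part of the series
(patch 06 introduces the real gds_mlx5_exp_cq_t with an equivalent
container_of-based helper):

        #include <assert.h>
        #include <stddef.h>
        #include <gdsync/core.h>

        /* A backend embeds struct gds_cq as its first member and recovers
         * its own type only after checking the dtype tag. */
        struct example_mlx5_exp_cq {
                struct gds_cq gcq;
                /* exp-verbs-specific state would go here */
        };

        static inline struct example_mlx5_exp_cq *
        to_example_mlx5_exp_cq(struct gds_cq *gcq)
        {
                assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
                return (struct example_mlx5_exp_cq *)
                        ((char *)gcq - offsetof(struct example_mlx5_exp_cq, gcq));
        }
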
From fdd2270a1a2736657c6385e75d0991b6146a6829 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Wed, 4 Aug 2021 21:43:15 -0400 Subject: [PATCH 03/50] Set gds_qp and gds_cq dtype to MLX5_EXP in the creation functions --- src/gdsync.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 90d5508..e863ec0 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1754,6 +1754,8 @@ gds_create_cq_internal(struct ibv_context *context, int cqe, return NULL; } + gcq->dtype = GDS_DRIVER_TYPE_MLX5_EXP; + return gcq; } @@ -1893,6 +1895,7 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gqp->send_cq.curr_offset = 0; gqp->recv_cq.cq = qp->recv_cq; gqp->recv_cq.curr_offset = 0; + gqp->dtype = GDS_DRIVER_TYPE_MLX5_EXP; gds_dbg("created gds_qp=%p\n", gqp); From 0073f813b84bf27579ba5d6868808fc349baef4d Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Wed, 4 Aug 2021 22:34:26 -0400 Subject: [PATCH 04/50] Changed the definition of gds_qp_init_attr_t and gds_send_wr. Also, removed res_domain from the gds_qp struct --- include/gdsync/core.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index 87a4cc3..4b61e86 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -54,8 +54,8 @@ enum gds_create_qp_flags { GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5, }; -typedef struct ibv_exp_qp_init_attr gds_qp_init_attr_t; -typedef struct ibv_exp_send_wr gds_send_wr; +typedef struct ibv_qp_init_attr gds_qp_init_attr_t; +typedef struct ibv_send_wr gds_send_wr; typedef enum gds_driver_type { GDS_DRIVER_TYPE_MLX5_EXP = 0, @@ -73,7 +73,6 @@ struct gds_qp { struct ibv_qp *qp; struct gds_cq send_cq; struct gds_cq recv_cq; - struct ibv_exp_res_domain * res_domain; struct ibv_context *dev_context; gds_driver_type_t dtype; }; From 78eadbb4c899d66c8073f4e8e26ed79ed6a15d9a Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 5 Aug 2021 01:26:25 -0400 Subject: [PATCH 05/50] Implemented gds_get_driver_type --- include/gdsync/core.h | 7 ++++--- src/utils.hpp | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index 4b61e86..c438594 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -58,7 +58,8 @@ typedef struct ibv_qp_init_attr gds_qp_init_attr_t; typedef struct ibv_send_wr gds_send_wr; typedef enum gds_driver_type { - GDS_DRIVER_TYPE_MLX5_EXP = 0, + GDS_DRIVER_TYPE_UNSUPPORTED = 0, + GDS_DRIVER_TYPE_MLX5_EXP, GDS_DRIVER_TYPE_MLX5_DV, GDS_DRIVER_TYPE_MLX5_DEVX } gds_driver_type_t; @@ -71,8 +72,8 @@ struct gds_cq { struct gds_qp { struct ibv_qp *qp; - struct gds_cq send_cq; - struct gds_cq recv_cq; + struct gds_cq *send_cq; + struct gds_cq *recv_cq; struct ibv_context *dev_context; gds_driver_type_t dtype; }; diff --git a/src/utils.hpp b/src/utils.hpp index b501bda..de2aaa8 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -222,6 +222,22 @@ gds_peer *peer_from_stream(CUstream stream); //----------------------------------------------------------------------------- +/* \brief: Get the underlying driver associated with the ibdev. + * + */ +static inline gds_driver_type gds_get_driver_type(struct ibv_device *ibdev) +{ + const char *dev_name = ibv_get_device_name(ibdev); + + // Heuristically guess the driver by the device name. + // Until we find a better way to do so... 
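+        // (A note on the heuristic: ibv_get_device_name() typically reports
+        // names such as "mlx5_0", so matching on the "mlx5" substring is an
+        // assumption about the naming convention; anything else falls
+        // through to GDS_DRIVER_TYPE_UNSUPPORTED below.)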
+ if (strstr(dev_name, "mlx5") != NULL) + return GDS_DRIVER_TYPE_MLX5_EXP; + return GDS_DRIVER_TYPE_UNSUPPORTED; +} + +//----------------------------------------------------------------------------- + /* * Local variables: * c-indent-level: 8 From aade2abf15285314c88841a9a3b7c8d0948f0afc Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 5 Aug 2021 01:27:08 -0400 Subject: [PATCH 06/50] Initial implementation of mlx5-exp.cpp/hpp and moved create/destroy qp/cq functions to mlx5-exp --- src/gdsync.cpp | 125 ------------------------ src/mlx5-exp.cpp | 245 +++++++++++++++++++++++++++++++++++++++++++++++ src/mlx5-exp.hpp | 20 ++++ 3 files changed, 265 insertions(+), 125 deletions(-) create mode 100644 src/mlx5-exp.cpp create mode 100644 src/mlx5-exp.hpp diff --git a/src/gdsync.cpp b/src/gdsync.cpp index e863ec0..5a14c51 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1678,131 +1678,6 @@ gds_peer *peer_from_stream(CUstream stream) //----------------------------------------------------------------------------- -static ibv_exp_res_domain *gds_create_res_domain(struct ibv_context *context) -{ - if (!context) { - gds_err("invalid context"); - return NULL; - } - - ibv_exp_res_domain_init_attr res_domain_attr; - memset(&res_domain_attr, 0, sizeof(res_domain_attr)); - - res_domain_attr.comp_mask |= IBV_EXP_RES_DOMAIN_THREAD_MODEL; - res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; - - ibv_exp_res_domain *res_domain = ibv_exp_create_res_domain(context, &res_domain_attr); - if (!res_domain) { - gds_warn("Can't create resource domain\n"); - } - - return res_domain; -} - -//----------------------------------------------------------------------------- - -static struct gds_cq * -gds_create_cq_internal(struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, - struct ibv_exp_res_domain * res_domain) -{ - struct gds_cq *gcq = NULL; - ibv_exp_cq_init_attr attr; - gds_peer *peer = NULL; - gds_peer_attr *peer_attr = NULL; - int ret=0; - - if(!context) - { - gds_dbg("Invalid input context\n"); - return NULL; - } - - gcq = (struct gds_cq*)calloc(1, sizeof(struct gds_cq)); - if (!gcq) { - gds_err("cannot allocate memory\n"); - return NULL; - } - - //Here we need to recover peer and peer_attr pointers to set alloc_type and alloc_flags - //before ibv_exp_create_cq - ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); - if (ret) { - gds_err("error %d while registering GPU peer\n", ret); - return NULL; - } - assert(peer); - assert(peer_attr); - - peer->alloc_type = gds_peer::CQ; - peer->alloc_flags = flags; - - attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT; - attr.flags = 0; // see ibv_exp_cq_create_flags - attr.peer_direct_attrs = peer_attr; - if (res_domain) { - gds_dbg("using peer->res_domain %p for CQ\n", res_domain); - attr.res_domain = res_domain; - attr.comp_mask |= IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN; - } - - int old_errno = errno; - gcq->cq = ibv_exp_create_cq(context, cqe, cq_context, channel, comp_vector, &attr); - if (!gcq->cq) { - gds_err("error %d in ibv_exp_create_cq, old errno %d\n", errno, old_errno); - return NULL; - } - - gcq->dtype = GDS_DRIVER_TYPE_MLX5_EXP; - - return gcq; -} - -//Note: general create cq function, not really used for now! 
-struct gds_cq * -gds_create_cq(struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags) -{ - int ret = 0; - struct gds_cq *gcq = NULL; - //TODO: leak of res_domain - struct ibv_exp_res_domain * res_domain; - gds_dbg("cqe=%d gpu_id=%d cq_flags=%08x\n", cqe, gpu_id, flags); - - gds_peer *peer = NULL; - gds_peer_attr *peer_attr = NULL; - ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); - if (ret) { - gds_err("error %d while registering GPU peer\n", ret); - return NULL; - } - assert(peer); - assert(peer_attr); - - peer->alloc_type = gds_peer::CQ; - peer->alloc_flags = flags; - - res_domain = gds_create_res_domain(context); - if (res_domain) - gds_dbg("using res_domain %p\n", res_domain); - else - gds_warn("NOT using res_domain\n"); - - - gcq = gds_create_cq_internal(context, cqe, cq_context, channel, comp_vector, gpu_id, flags, res_domain); - - if (!gcq) { - gds_err("error in gds_create_cq_internal\n"); - return NULL; - } - - return gcq; -} - -//----------------------------------------------------------------------------- - struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) { diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp new file mode 100644 index 0000000..9e4efe4 --- /dev/null +++ b/src/mlx5-exp.cpp @@ -0,0 +1,245 @@ +#include +#include +#include + +#include "mlx5-exp.hpp" +#include "utils.hpp" + +static ibv_exp_res_domain *gds_mlx5_exp_create_res_domain(struct ibv_context *context) +{ + if (!context) { + gds_err("invalid context"); + return NULL; + } + + ibv_exp_res_domain_init_attr res_domain_attr; + memset(&res_domain_attr, 0, sizeof(res_domain_attr)); + + res_domain_attr.comp_mask |= IBV_EXP_RES_DOMAIN_THREAD_MODEL; + res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; + + ibv_exp_res_domain *res_domain = ibv_exp_create_res_domain(context, &res_domain_attr); + if (!res_domain) { + gds_warn("Can't create resource domain\n"); + } + + return res_domain; +} + +//----------------------------------------------------------------------------- + +static struct gds_mlx5_exp_cq_t * +gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, + struct ibv_exp_res_domain *res_domain) +{ + struct gds_mlx5_exp_cq_t *gmexpcq = NULL; + ibv_exp_cq_init_attr attr; + gds_peer *peer = NULL; + gds_peer_attr *peer_attr = NULL; + int ret = 0; + + if (!context) + { + gds_dbg("Invalid input context\n"); + return NULL; + } + + gmexpcq = (gds_mlx5_exp_cq_t *)calloc(1, sizeof(gds_mlx5_exp_cq_t)); + if (!gmexpcq) { + gds_err("cannot allocate memory\n"); + return NULL; + } + + //Here we need to recover peer and peer_attr pointers to set alloc_type and alloc_flags + //before ibv_exp_create_cq + ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); + if (ret) { + gds_err("error %d while registering GPU peer\n", ret); + return NULL; + } + assert(peer); + assert(peer_attr); + + peer->alloc_type = gds_peer::CQ; + peer->alloc_flags = flags; + + attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT; + attr.flags = 0; // see ibv_exp_cq_create_flags + attr.peer_direct_attrs = peer_attr; + if (res_domain) { + gds_dbg("using peer->res_domain %p for CQ\n", res_domain); + attr.res_domain = res_domain; + attr.comp_mask |= IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN; + gmexpcq->res_domain = res_domain; + } + + int old_errno = errno; + 
gmexpcq->gcq.cq = ibv_exp_create_cq(context, cqe, cq_context, channel, comp_vector, &attr); + if (!gmexpcq->gcq.cq) { + gds_err("error %d in ibv_exp_create_cq, old errno %d\n", errno, old_errno); + return NULL; + } + + gmexpcq->gcq.dtype = GDS_DRIVER_TYPE_MLX5_EXP; + + return gmexpcq; +} + +//----------------------------------------------------------------------------- + +struct gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context *context, + gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) +{ + int ret = 0; + gds_mlx5_exp_qp_t *gmexpqp = NULL; + struct ibv_qp *qp = NULL; + gds_mlx5_exp_cq_t *rx_gmexpcq = NULL, *tx_gmexpcq = NULL; + gds_peer *peer = NULL; + gds_peer_attr *peer_attr = NULL; + struct ibv_qp_init_attr exp_qp_attr = {0,}; + int old_errno = errno; + + assert(pd); + assert(context); + assert(qp_attr); + + gmexpqp = (gds_mlx5_exp_qp_t *)calloc(1, sizeof(struct gds_qp)); + if (!gqp) { + gds_err("cannot allocate memory\n"); + return NULL; + } + gmexpqp->gqp.dtype = GDS_DRIVER_TYPE_MLX5_EXP; + + gmexpqp->gqp.dev_context = context; + + // peer registration + gds_dbg("before gds_register_peer_ex\n"); + ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); + if (ret) { + gds_err("error %d in gds_register_peer_ex\n", ret); + goto err; + } + + gmexpqp->res_domain = gds_create_res_domain(context); + if (gmexpqp->res_domain) + gds_dbg("using res_domain %p\n", gmexpqp->res_domain); + else + gds_warn("NOT using res_domain\n"); + + tx_gmexpcq = gds_mlx5_exp_create_cq(context, qp_attr->cap.max_send_wr, NULL, NULL, 0, gpu_id, + (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, + gmexpqp->res_domain); + if (!tx_gmexpcq) { + ret = errno; + gds_err("error %d while creating TX CQ, old_errno=%d\n", ret, old_errno); + goto err; + } + + rx_gmexpcq = gds_mlx5_exp_create_cq(context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, gpu_id, + (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? 
GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, + gmexpqp->res_domain); + if (!rx_gmexpcq) { + ret = errno; + gds_err("error %d while creating RX CQ\n", ret); + goto err; + } + + // peer registration + peer->alloc_type = gds_peer::WQ; + peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_DBREC_DEFAULT; + if (flags & GDS_CREATE_QP_WQ_ON_GPU) { + gds_err("error, QP WQ on GPU is not supported yet\n"); + goto err; + } + if (flags & GDS_CREATE_QP_WQ_DBREC_ON_GPU) { + gds_warn("QP WQ DBREC on GPU\n"); + peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; + } + + exp_qp_attr = { + .send_cq = tx_gmexpcq->gcq.cq, + .recv_cq = rx_gmexpcq->gcq.cq, + .pd = pd, + .comp_mask = IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_PEER_DIRECT, + .peer_direct_attrs = peer_attr, + .qp_type = qp_attr->qp_type + }; + + assert(sizeof(exp_qp_attr.cap) == sizeof(qp_attr->cap)); + + memcpy(&exp_qp_attr.cap, &qp_attr->cap, sizeof(qp_attr->cap)); + + qp = ibv_exp_create_qp(context, qp_attr); + if (!qp) { + ret = EINVAL; + gds_err("error in ibv_exp_create_qp\n"); + goto err; + } + + gmexpqp->gqp.qp = qp; + gmexpqp->gqp.send_cq = tx_gmexpcq->gcq; + gmexpqp->gqp.recv_cq = rx_gmexpcq->gcq; + + gds_dbg("created gds_mlx5_exp_qp=%p\n", gmexpqp); + + return gmexpqp; + +err: + gds_dbg("destroying QP\n"); + gds_mlx5_exp_destroy_qp(gmexpqp); + + return NULL; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp) +{ + int retcode = 0; + int ret; + + if (!gmexpqp) + return retcode; + + assert(gmexpqp->gqp.dtype == GDS_DRIVER_TYPE_MLX5_EXP); + + if (gmexpqp->gqp.qp) { + ret = ibv_destroy_qp(gmexpqp->gqp.qp); + if (ret) { + gds_err("error %d in destroy_qp\n", ret); + retcode = ret; + } + } + + if (gmexpqp->gqp.send_cq) { + ret = gds_destroy_cq(gmexpqp->gqp.send_cq); + if (ret) { + gds_err("error %d in destroy_cq send_cq\n", ret); + retcode = ret; + } + } + + if (gmexpqp->gqp.recv_cq) { + ret = gds_destroy_cq(gmexpqp->gqp.recv_cq); + if (ret) { + gds_err("error %d in destroy_cq recv_cq\n", ret); + retcode = ret; + } + } + + if (gmexpqp->res_domain) { + struct ibv_exp_destroy_res_domain_attr attr = {0,}; //IBV_EXP_DESTROY_RES_DOMAIN_RESERVED + ret = ibv_exp_destroy_res_domain(gmexpqp->gqp.dev_context, gmexpqp->res_domain, &attr); + if (ret) { + gds_err("ibv_exp_destroy_res_domain error %d: %s\n", ret, strerror(ret)); + retcode = ret; + } + } + + free(gmexpqp); + + return retcode; +} + diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp new file mode 100644 index 0000000..214342a --- /dev/null +++ b/src/mlx5-exp.hpp @@ -0,0 +1,20 @@ +typedef struct gds_mlx5_exp_cq { + gds_cq_t gcq; + ibv_exp_res_domain *res_domain +} gds_mlx5_exp_cq_t; + +typedef struct gds_mlx5_exp_qp { + gds_qp_t gqp; + ibv_exp_res_domain *res_domain +} gds_mlx5_exp_qp_t; + +static inline gds_mlx5_exp_cq_t *to_gds_mexp_cq(gds_cq_t *gcq) { + assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); + return container_of(gcq, gds_mlx5_exp_cq_t, gcq); +} + +static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) { + assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); + return container_of(gqp, gds_mlx5_exp_qp_t, gqp); +} + From 5e789db4f759d8b0ce94bbf8f6834aa7ce1f7588 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 5 Aug 2021 02:12:31 -0400 Subject: [PATCH 07/50] Implemented gds_mlx5_exp_destroy_cq --- src/mlx5-exp.cpp | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp index 9e4efe4..47dc80a 100644 --- 
a/src/mlx5-exp.cpp +++ b/src/mlx5-exp.cpp @@ -28,8 +28,7 @@ static ibv_exp_res_domain *gds_mlx5_exp_create_res_domain(struct ibv_context *co //----------------------------------------------------------------------------- -static struct gds_mlx5_exp_cq_t * -gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, +gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, struct ibv_exp_res_domain *res_domain) @@ -89,7 +88,7 @@ gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, //----------------------------------------------------------------------------- -struct gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context *context, +gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) { int ret = 0; @@ -243,3 +242,29 @@ int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp) return retcode; } +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq) +{ + int retcode = 0; + int ret; + + if (!gmexpcq) + return retcode; + + assert(gmexpcq->gcq.dtype == GDS_DRIVER_TYPE_MLX5_EXP); + + if (gmexpcq->gcq.cq) { + ret = ibv_destroy_cq(gmexpcq->gcq.cq); + if (ret) { + gds_err("error %d in destroy_cq\n", ret); + retcode = ret; + } + } + + // res_domain will be destroyed in gds_mlx5_exp_destroy_qp. + + free(gmexpcq); + + return retcode; +} From 2a8ac7d93bc93e961626c5f4e240945d53a7042b Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 5 Aug 2021 02:19:03 -0400 Subject: [PATCH 08/50] Implemented gds_destroy_qp and gds_destroy_cq by connecting to gds_mlx5_exp_* --- src/gdsync.cpp | 61 ++++++++++++++++++++---------------------------- src/mlx5-exp.hpp | 13 +++++++++++ 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 5a14c51..862c0da 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1785,51 +1785,40 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, //----------------------------------------------------------------------------- -int gds_destroy_qp(struct gds_qp *gqp) +int gds_destroy_cq(struct gds_cq *gcq) { int retcode = 0; int ret; - if(!gqp) return retcode; + if (!gcq) + return retcode; - if(gqp->qp) - { - ret = ibv_destroy_qp(gqp->qp); - if (ret) { - gds_err("error %d in destroy_qp\n", ret); - retcode = ret; - } - } + // Currently, we support only exp-verbs. 
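+        // (The MLX5_DV and MLX5_DEVX values of gds_driver_type_t have no
+        // destroy path yet; once they do, this is where they would be
+        // dispatched.)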
+ assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); - if(gqp->send_cq.cq) - { - ret = ibv_destroy_cq(gqp->send_cq.cq); - if (ret) { - gds_err("error %d in destroy_cq send_cq\n", ret); - retcode = ret; - } - } + gds_mlx5_exp_cq_t *gmexpcq = to_gds_mexp_cq(gcq); - if(gqp->recv_cq.cq) - { - ret = ibv_destroy_cq(gqp->recv_cq.cq); - if (ret) { - gds_err("error %d in destroy_cq recv_cq\n", ret); - retcode = ret; - } - } + retcode = gds_mlx5_exp_destroy_cq(gmexpcq); - if(gqp->res_domain) { - struct ibv_exp_destroy_res_domain_attr attr; //IBV_EXP_DESTROY_RES_DOMAIN_RESERVED - attr.comp_mask=0; - ret = ibv_exp_destroy_res_domain(gqp->dev_context, gqp->res_domain, &attr); - if (ret) { - gds_err("ibv_exp_destroy_res_domain error %d: %s\n", ret, strerror(ret)); - retcode = ret; - } - } + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_destroy_qp(struct gds_qp *gqp) +{ + int retcode = 0; + int ret; + + if (!gqp) + return retcode; + + // Currently, we support only exp-verbs. + assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP); + + gds_mlx5_exp_qp_t *gmexpqp = to_gds_mexp_qp(gqp); - free(gqp); + retcode = gds_mlx5_exp_destroy_qp(gmexpqp); return retcode; } diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp index 214342a..0f525ee 100644 --- a/src/mlx5-exp.hpp +++ b/src/mlx5-exp.hpp @@ -1,3 +1,6 @@ +#include +#include + typedef struct gds_mlx5_exp_cq { gds_cq_t gcq; ibv_exp_res_domain *res_domain @@ -18,3 +21,13 @@ static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) { return container_of(gqp, gds_mlx5_exp_qp_t, gqp); } +gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, + struct ibv_exp_res_domain *res_domain); + +gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context *context, + gds_qp_init_attr_t *qp_attr, int gpu_id, int flags); + +int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq); +int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp); From a9351f2bfabd7c1d44adf29c4cba10f73c615744 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 5 Aug 2021 03:11:13 -0400 Subject: [PATCH 09/50] Reimplemented gds_create_qp by connecting to gds_mlx5_exp_create_qp --- src/gdsync.cpp | 80 ++++++++---------------------------------------- src/mlx5-exp.cpp | 60 +++++++++++++++--------------------- src/mlx5-exp.hpp | 14 +++++---- src/utils.hpp | 4 +++ 4 files changed, 48 insertions(+), 110 deletions(-) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 862c0da..56d2743 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1682,16 +1682,16 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) { int ret = 0; - struct gds_qp *gqp = NULL; - struct ibv_qp *qp = NULL; - struct gds_cq *rx_gcq = NULL, *tx_gcq = NULL; + gds_mlx5_exp_qp_t *gmexpqp = NULL; gds_peer *peer = NULL; gds_peer_attr *peer_attr = NULL; + gds_driver_type dtype; int old_errno = errno; gds_dbg("pd=%p context=%p gpu_id=%d flags=%08x current errno=%d\n", pd, context, gpu_id, flags, errno); assert(pd); assert(context); + assert(context->device); assert(qp_attr); if (flags & ~(GDS_CREATE_QP_WQ_ON_GPU|GDS_CREATE_QP_TX_CQ_ON_GPU|GDS_CREATE_QP_RX_CQ_ON_GPU|GDS_CREATE_QP_WQ_DBREC_ON_GPU)) { @@ -1699,87 +1699,31 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, return NULL; } - gqp = (struct gds_qp*)calloc(1, sizeof(struct 
gds_qp)); - if (!gqp) { - gds_err("cannot allocate memory\n"); - return NULL; - } - - gqp->dev_context=context; - // peer registration gds_dbg("before gds_register_peer_ex\n"); ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); if (ret) { - gds_err("error %d in gds_register_peer_ex\n", ret); - goto err; - } - - gqp->res_domain = gds_create_res_domain(context); - if (gqp->res_domain) - gds_dbg("using gqp->res_domain %p\n", gqp->res_domain); - else - gds_warn("NOT using gqp->res_domain\n"); - - tx_gcq = gds_create_cq_internal(context, qp_attr->cap.max_send_wr, NULL, NULL, 0, gpu_id, - (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, - gqp->res_domain); - if (!tx_gcq) { - ret = errno; - gds_err("error %d while creating TX CQ, old_errno=%d\n", ret, old_errno); + gds_err("error %d in gds_register_peer_ex\n", ret); goto err; } - rx_gcq = gds_create_cq_internal(context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, gpu_id, - (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, - gqp->res_domain); - if (!rx_gcq) { - ret = errno; - gds_err("error %d while creating RX CQ\n", ret); + dtype = gds_get_driver_type(context->device); + if (dtype != GDS_DRIVER_TYPE_MLX5_EXP) { + gds_err("Unsupported IB device\n"); goto err; } - // peer registration - qp_attr->send_cq = tx_gcq->cq; - qp_attr->recv_cq = rx_gcq->cq; - qp_attr->pd = pd; - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_PD; - - peer->alloc_type = gds_peer::WQ; - peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_DBREC_DEFAULT; - if (flags & GDS_CREATE_QP_WQ_ON_GPU) { - gds_err("error, QP WQ on GPU is not supported yet\n"); - goto err; - } - if (flags & GDS_CREATE_QP_WQ_DBREC_ON_GPU) { - gds_warn("QP WQ DBREC on GPU\n"); - peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; - } - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_PEER_DIRECT; - qp_attr->peer_direct_attrs = peer_attr; - - qp = ibv_exp_create_qp(context, qp_attr); - if (!qp) { - ret = EINVAL; - gds_err("error in ibv_exp_create_qp\n"); + gmexpqp = gds_mlx5_exp_create_qp(pd, context, qp_attr, peer, peer_attr, flags); + if (!gmexpqp) { + gds_err("Error in gds_mlx5_exp_create_qp.\n"); goto err; } - gqp->qp = qp; - gqp->send_cq.cq = qp->send_cq; - gqp->send_cq.curr_offset = 0; - gqp->recv_cq.cq = qp->recv_cq; - gqp->recv_cq.curr_offset = 0; - gqp->dtype = GDS_DRIVER_TYPE_MLX5_EXP; - - gds_dbg("created gds_qp=%p\n", gqp); + gds_dbg("created gds_qp=%p\n", gmexpqp->gqp); - return gqp; + return gmexpqp->gqp; err: - gds_dbg("destroying QP\n"); - gds_destroy_qp(gqp); - return NULL; } diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp index 47dc80a..b39fa16 100644 --- a/src/mlx5-exp.cpp +++ b/src/mlx5-exp.cpp @@ -28,10 +28,11 @@ static ibv_exp_res_domain *gds_mlx5_exp_create_res_domain(struct ibv_context *co //----------------------------------------------------------------------------- -gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, - struct ibv_exp_res_domain *res_domain) +gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( + struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, gds_peer *peer, gds_peer_attr *peer_attr, gds_alloc_cq_flags_t flags, + struct ibv_exp_res_domain *res_domain) { struct gds_mlx5_exp_cq_t *gmexpcq = NULL; ibv_exp_cq_init_attr attr; @@ -39,11 +40,9 @@ gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int 
cqe, gds_peer_attr *peer_attr = NULL; int ret = 0; - if (!context) - { - gds_dbg("Invalid input context\n"); - return NULL; - } + assert(context); + assert(peer); + assert(peer_attr); gmexpcq = (gds_mlx5_exp_cq_t *)calloc(1, sizeof(gds_mlx5_exp_cq_t)); if (!gmexpcq) { @@ -51,16 +50,6 @@ gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, return NULL; } - //Here we need to recover peer and peer_attr pointers to set alloc_type and alloc_flags - //before ibv_exp_create_cq - ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); - if (ret) { - gds_err("error %d while registering GPU peer\n", ret); - return NULL; - } - assert(peer); - assert(peer_attr); - peer->alloc_type = gds_peer::CQ; peer->alloc_flags = flags; @@ -88,8 +77,9 @@ gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, //----------------------------------------------------------------------------- -gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context *context, - gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) +gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( + struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, + gds_peer *peer, gds_peer_attr *peer_attr, int flags) { int ret = 0; gds_mlx5_exp_qp_t *gmexpqp = NULL; @@ -103,8 +93,10 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context assert(pd); assert(context); assert(qp_attr); + assert(peer); + assert(peer_attr); - gmexpqp = (gds_mlx5_exp_qp_t *)calloc(1, sizeof(struct gds_qp)); + gmexpqp = (gds_mlx5_exp_qp_t *)calloc(1, sizeof(gds_mlx5_exp_qp_t)); if (!gqp) { gds_err("cannot allocate memory\n"); return NULL; @@ -113,32 +105,28 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context gmexpqp->gqp.dev_context = context; - // peer registration - gds_dbg("before gds_register_peer_ex\n"); - ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr); - if (ret) { - gds_err("error %d in gds_register_peer_ex\n", ret); - goto err; - } - gmexpqp->res_domain = gds_create_res_domain(context); if (gmexpqp->res_domain) gds_dbg("using res_domain %p\n", gmexpqp->res_domain); else gds_warn("NOT using res_domain\n"); - tx_gmexpcq = gds_mlx5_exp_create_cq(context, qp_attr->cap.max_send_wr, NULL, NULL, 0, gpu_id, - (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, - gmexpqp->res_domain); + tx_gmexpcq = gds_mlx5_exp_create_cq( + context, qp_attr->cap.max_send_wr, NULL, NULL, 0, peer, peer_attr, + (flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, + gmexpqp->res_domain + ); if (!tx_gmexpcq) { ret = errno; gds_err("error %d while creating TX CQ, old_errno=%d\n", ret, old_errno); goto err; } - rx_gmexpcq = gds_mlx5_exp_create_cq(context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, gpu_id, - (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, - gmexpqp->res_domain); + rx_gmexpcq = gds_mlx5_exp_create_cq( + context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, peer, peer_attr, + (flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? 
GDS_ALLOC_CQ_ON_GPU : GDS_ALLOC_CQ_DEFAULT, + gmexpqp->res_domain + ); if (!rx_gmexpcq) { ret = errno; gds_err("error %d while creating RX CQ\n", ret); diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp index 0f525ee..20e94f3 100644 --- a/src/mlx5-exp.hpp +++ b/src/mlx5-exp.hpp @@ -21,13 +21,15 @@ static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) { return container_of(gqp, gds_mlx5_exp_qp_t, gqp); } -gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags, - struct ibv_exp_res_domain *res_domain); +gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( + struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, gds_peer *peer, gds_peer_attr *peer_attr, gds_alloc_cq_flags_t flags, + struct ibv_exp_res_domain *res_domain); -gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp(struct ibv_pd *pd, struct ibv_context *context, - gds_qp_init_attr_t *qp_attr, int gpu_id, int flags); +gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( + struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, + gds_peer *peer, gds_peer_attr *peer_attr, int flags); int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq); int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp); diff --git a/src/utils.hpp b/src/utils.hpp index de2aaa8..1bacbb6 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -238,6 +238,10 @@ static inline gds_driver_type gds_get_driver_type(struct ibv_device *ibdev) //----------------------------------------------------------------------------- +int gds_destroy_cq(struct gds_cq *gcq); + +//----------------------------------------------------------------------------- + /* * Local variables: * c-indent-level: 8 From e18866864b7da32f97bd2a73900cab7b552c453c Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 5 Aug 2021 03:16:09 -0400 Subject: [PATCH 10/50] Moved gds_send_wr from exp-verbs to ib-verbs in gds_kernel_latency.c --- tests/gds_kernel_latency.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index 63875bf..428dadb 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -542,23 +542,22 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; #if 0 if (IBV_QPT_UD != gds_qpt) { memset(&ewr, 0, sizeof(ewr)); ewr.num_sge = 1; - ewr.exp_send_flags = IBV_EXP_SEND_SIGNALED; - ewr.exp_opcode = IBV_EXP_WR_SEND; + ewr.send_flags = IBV_SEND_SIGNALED; + ewr.opcode = IBV_WR_SEND; ewr.wr_id = PINGPONG_SEND_WRID; ewr.sg_list = &list; ewr.next = NULL; @@ -580,23 +579,22 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; if (IBV_QPT_UD != gds_qpt) { memset(&ewr, 0, sizeof(ewr)); ewr.num_sge = 1; - ewr.exp_send_flags = IBV_EXP_SEND_SIGNALED; - 
ewr.exp_opcode = IBV_EXP_WR_SEND;
+        ewr.send_flags = IBV_SEND_SIGNALED;
+        ewr.opcode = IBV_WR_SEND;
         ewr.wr_id = PINGPONG_SEND_WRID;
         ewr.sg_list = &list;
         ewr.next = NULL;

From 07d03a035c53876f29e10ca4e7608ba96399d270 Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Thu, 5 Aug 2021 03:20:27 -0400
Subject: [PATCH 11/50] Fixed compile issues in gdsync.cpp and mlx5-exp.hpp

---
 include/gdsync/core.h | 8 ++++----
 src/gdsync.cpp        | 3 ++-
 src/mlx5-exp.hpp      | 8 +++++---
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/include/gdsync/core.h b/include/gdsync/core.h
index c438594..a74299d 100644
--- a/include/gdsync/core.h
+++ b/include/gdsync/core.h
@@ -64,19 +64,19 @@ typedef enum gds_driver_type {
         GDS_DRIVER_TYPE_MLX5_DEVX
 } gds_driver_type_t;
 
-struct gds_cq {
+typedef struct gds_cq {
         struct ibv_cq *cq;
         uint32_t curr_offset;
         gds_driver_type_t dtype;
-};
+} gds_cq_t;
 
-struct gds_qp {
+typedef struct gds_qp {
         struct ibv_qp *qp;
         struct gds_cq *send_cq;
         struct gds_cq *recv_cq;
         struct ibv_context *dev_context;
         gds_driver_type_t dtype;
-};
+} gds_qp_t;
 
 /* \brief: Create a peer-enabled QP attached to the specified GPU id.
  *
diff --git a/src/gdsync.cpp b/src/gdsync.cpp
index 56d2743..23907fe 100644
--- a/src/gdsync.cpp
+++ b/src/gdsync.cpp
@@ -43,6 +43,7 @@
 #include "archutils.h"
 #include "mlnxutils.h"
 #include "task_queue.hpp"
+#include "mlx5-exp.hpp"
 
 //-----------------------------------------------------------------------------
 
@@ -1721,7 +1722,7 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context,
 
-    gds_dbg("created gds_qp=%p\n", gmexpqp->gqp);
-    return gmexpqp->gqp;
+    gds_dbg("created gds_qp=%p\n", &gmexpqp->gqp);
+    return &gmexpqp->gqp;
 
 err:
     return NULL;
diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp
index 20e94f3..60bb1fd 100644
--- a/src/mlx5-exp.hpp
+++ b/src/mlx5-exp.hpp
@@ -1,14 +1,16 @@
 #include 
 #include 
+#include 
+
 typedef struct gds_mlx5_exp_cq {
     gds_cq_t gcq;
-    ibv_exp_res_domain *res_domain
+    ibv_exp_res_domain *res_domain;
 } gds_mlx5_exp_cq_t;
 
 typedef struct gds_mlx5_exp_qp {
     gds_qp_t gqp;
-    ibv_exp_res_domain *res_domain
+    ibv_exp_res_domain *res_domain;
 } gds_mlx5_exp_qp_t;
 
 static inline gds_mlx5_exp_cq_t *to_gds_mexp_cq(gds_cq_t *gcq) {
@@ -17,7 +19,7 @@ static inline gds_mlx5_exp_cq_t *to_gds_mexp_cq(gds_cq_t *gcq) {
 }
 
 static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) {
-    assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
+    assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
     return container_of(gqp, gds_mlx5_exp_qp_t, gqp);
 }
 

From 173841f3174272eca05d55f7a8c838e7ecbaa01b Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Fri, 6 Aug 2021 00:54:30 -0400
Subject: [PATCH 12/50] Added mlx5-exp.cpp to the compile list

---
 Makefile.am | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index ac9a0cc..27fce57 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,13 +20,13 @@ EXTRA_DIST = autogen.sh
 include_HEADERS = include/gdsync.h
 
 libgdsyncincludedir = $(includedir)/gdsync
-libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h
+libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h
 
 src_libgdsync_la_CFLAGS = $(AM_CFLAGS)
-src_libgdsync_la_SOURCES = src/gdsync.cpp src/memmgr.cpp src/mem.cpp src/objs.cpp src/apis.cpp src/mlx5.cpp include/gdsync.h
+src_libgdsync_la_SOURCES = src/gdsync.cpp src/memmgr.cpp src/mem.cpp src/objs.cpp src/apis.cpp src/mlx5.cpp src/mlx5-exp.cpp include/gdsync.h
 src_libgdsync_la_LDFLAGS = -version-info 
@VERSION_INFO@ -noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h +noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h src/mlx5-exp.hpp # if enabled at configure time From 8d1ed503001c40dd6120b4d61c83f07b4395f631 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 6 Aug 2021 00:55:35 -0400 Subject: [PATCH 13/50] Moved gds_prepare_send to gds_mlx5_exp_* and fixed compile errors in mlx5-exp.* --- src/apis.cpp | 34 ++++++++++--------------- src/mlx5-exp.cpp | 64 ++++++++++++++++++++++++++++++++++-------------- src/mlx5-exp.hpp | 11 +++++++++ 3 files changed, 69 insertions(+), 40 deletions(-) diff --git a/src/apis.cpp b/src/apis.cpp index cd532d7..0801771 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -51,6 +51,7 @@ #include "utils.hpp" #include "archutils.h" #include "mlnxutils.h" +#include "mlx5-exp.hpp" //----------------------------------------------------------------------------- @@ -171,33 +172,24 @@ int gds_post_recv(struct gds_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr //----------------------------------------------------------------------------- -int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, +int gds_prepare_send(struct gds_qp *gqp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request) { int ret = 0; + gds_mlx5_exp_qp_t *gmexpqp; + gds_init_send_info(request); - assert(qp); - assert(qp->qp); - ret = ibv_exp_post_send(qp->qp, p_ewr, bad_ewr); - if (ret) { + assert(gqp); + assert(gqp->qp); + assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP); + + gmexpqp = to_gds_mexp_qp(gqp); + + ret = gds_mlx5_exp_prepare_send(gmexpqp, p_ewr, bad_ewr, request); + if (ret) + gds_err("Error %d in gds_mlx5_exp_prepare_send.\n", ret); - if (ret == ENOMEM) { - // out of space error can happen too often to report - gds_dbg("ENOMEM error %d in ibv_exp_post_send\n", ret); - } else { - gds_err("error %d in ibv_exp_post_send\n", ret); - } - goto out; - } - - ret = ibv_exp_peer_commit_qp(qp->qp, &request->commit); - if (ret) { - gds_err("error %d in ibv_exp_peer_commit_qp\n", ret); - //gds_wait_kernel(); - goto out; - } -out: return ret; } diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp index b39fa16..f76cf12 100644 --- a/src/mlx5-exp.cpp +++ b/src/mlx5-exp.cpp @@ -34,10 +34,8 @@ gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( int comp_vector, gds_peer *peer, gds_peer_attr *peer_attr, gds_alloc_cq_flags_t flags, struct ibv_exp_res_domain *res_domain) { - struct gds_mlx5_exp_cq_t *gmexpcq = NULL; + gds_mlx5_exp_cq_t *gmexpcq = NULL; ibv_exp_cq_init_attr attr; - gds_peer *peer = NULL; - gds_peer_attr *peer_attr = NULL; int ret = 0; assert(context); @@ -85,9 +83,7 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( gds_mlx5_exp_qp_t *gmexpqp = NULL; struct ibv_qp *qp = NULL; gds_mlx5_exp_cq_t *rx_gmexpcq = NULL, *tx_gmexpcq = NULL; - gds_peer *peer = NULL; - gds_peer_attr *peer_attr = NULL; - struct ibv_qp_init_attr exp_qp_attr = {0,}; + struct ibv_exp_qp_init_attr exp_qp_attr = {0,}; int old_errno = errno; assert(pd); @@ -97,7 +93,7 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( assert(peer_attr); gmexpqp = (gds_mlx5_exp_qp_t *)calloc(1, sizeof(gds_mlx5_exp_qp_t)); - if (!gqp) { + if (!gmexpqp) { gds_err("cannot allocate memory\n"); return NULL; } @@ -105,7 +101,7 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( gmexpqp->gqp.dev_context = context; - gmexpqp->res_domain = gds_create_res_domain(context); + gmexpqp->res_domain = 
gds_mlx5_exp_create_res_domain(context); if (gmexpqp->res_domain) gds_dbg("using res_domain %p\n", gmexpqp->res_domain); else @@ -145,29 +141,30 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; } - exp_qp_attr = { - .send_cq = tx_gmexpcq->gcq.cq, - .recv_cq = rx_gmexpcq->gcq.cq, - .pd = pd, - .comp_mask = IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_PEER_DIRECT, - .peer_direct_attrs = peer_attr, - .qp_type = qp_attr->qp_type - }; + exp_qp_attr.send_cq = tx_gmexpcq->gcq.cq; + exp_qp_attr.recv_cq = rx_gmexpcq->gcq.cq; + exp_qp_attr.pd = pd; + exp_qp_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_PEER_DIRECT; + exp_qp_attr.peer_direct_attrs = peer_attr; + exp_qp_attr.qp_type = qp_attr->qp_type; assert(sizeof(exp_qp_attr.cap) == sizeof(qp_attr->cap)); memcpy(&exp_qp_attr.cap, &qp_attr->cap, sizeof(qp_attr->cap)); - qp = ibv_exp_create_qp(context, qp_attr); + qp = ibv_exp_create_qp(context, &exp_qp_attr); if (!qp) { ret = EINVAL; gds_err("error in ibv_exp_create_qp\n"); goto err; } + tx_gmexpcq->gcq.cq = qp->send_cq; + rx_gmexpcq->gcq.cq = qp->recv_cq; + gmexpqp->gqp.qp = qp; - gmexpqp->gqp.send_cq = tx_gmexpcq->gcq; - gmexpqp->gqp.recv_cq = rx_gmexpcq->gcq; + gmexpqp->gqp.send_cq = &tx_gmexpcq->gcq; + gmexpqp->gqp.recv_cq = &rx_gmexpcq->gcq; gds_dbg("created gds_mlx5_exp_qp=%p\n", gmexpqp); @@ -256,3 +253,32 @@ int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq) return retcode; } + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, + gds_send_wr **bad_ewr, + gds_send_request_t *request) +{ + int ret = 0; + ret = ibv_post_send(gmexpqp->gqp.qp, p_ewr, bad_ewr); + if (ret) { + + if (ret == ENOMEM) { + // out of space error can happen too often to report + gds_dbg("ENOMEM error %d in ibv_post_send\n", ret); + } else { + gds_err("error %d in ibv_post_send\n", ret); + } + goto out; + } + + ret = ibv_exp_peer_commit_qp(gmexpqp->gqp.qp, &request->commit); + if (ret) { + gds_err("error %d in ibv_exp_peer_commit_qp\n", ret); + goto out; + } +out: + return ret; +} + diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp index 60bb1fd..d289c83 100644 --- a/src/mlx5-exp.hpp +++ b/src/mlx5-exp.hpp @@ -1,8 +1,15 @@ +#include +#include +#include + #include #include #include +#include "objs.hpp" +#include "utils.hpp" + typedef struct gds_mlx5_exp_cq { gds_cq_t gcq; ibv_exp_res_domain *res_domain; @@ -35,3 +42,7 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq); int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp); + +int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, + gds_send_wr **bad_ewr, + gds_send_request_t *request); From 10e164ec794c24df2a997a4f8db69385f7256ca9 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 6 Aug 2021 00:56:18 -0400 Subject: [PATCH 14/50] Modified gds_kernel_* applications to fit the new API/structs --- tests/gds_kernel_latency.c | 18 +++++++++--------- tests/gds_kernel_loopback_latency.c | 29 +++++++++++++---------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index 428dadb..04370f5 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -495,7 +495,7 @@ static int pp_wait_cq(struct pingpong_context *ctx, int is_client) { int ret; if (ctx->peersync) { - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, 
ctx->consume_rx_cqe); + ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); } else { if (is_client) { do { @@ -674,7 +674,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].tag = GDS_TAG_SEND; wdesc->descs[k].send = &wdesc->send_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); if (ret) { retcode = -ret; break; @@ -683,7 +683,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].tag = GDS_TAG_WAIT; wdesc->descs[k].wait = &wdesc->wait_tx_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); if (ret) { retcode = -ret; break; @@ -713,14 +713,14 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin retcode = -ret; break; } - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->send_cq, 0); + ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->send_cq, 0); if (ret) { // TODO: rollback gpu send gpu_err("error %d in gds_stream_wait_cq\n", ret); retcode = -ret; break; } - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); + ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); if (ret) { // TODO: rollback gpu send and wait send_cq gpu_err("[%d] error %d in gds_stream_wait_cq\n", my_rank, ret); @@ -749,7 +749,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin if (ctx->use_desc_apis) { work_desc_t *wdesc = calloc(1, sizeof(*wdesc)); int k = 0; - ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); if (ret) { retcode = -ret; break; @@ -771,7 +771,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin CUCHECK(cuStreamAddCallback(gpu_stream, post_work_cb, wdesc, 0)); } } else if (ctx->peersync) { - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); + ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); if (ret) { // TODO: rollback gpu send and wait send_cq gpu_err("error %d in gds_stream_wait_cq\n", ret); @@ -804,7 +804,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].tag = GDS_TAG_SEND; wdesc->descs[k].send = &wdesc->send_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); if (ret) { retcode = -ret; break; @@ -833,7 +833,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin retcode = -ret; break; } - ret = gds_stream_wait_cq(gpu_stream, &ctx->gds_qp->send_cq, 0); + ret = gds_stream_wait_cq(gpu_stream, ctx->gds_qp->send_cq, 0); if (ret) { // TODO: rollback gpu send gpu_err("error %d in gds_stream_wait_cq\n", ret); diff --git a/tests/gds_kernel_loopback_latency.c b/tests/gds_kernel_loopback_latency.c index b2d209c..f6ccc32 100644 --- a/tests/gds_kernel_loopback_latency.c +++ b/tests/gds_kernel_loopback_latency.c @@ -511,16 +511,15 @@ static int pp_post_send(struct pingpong_context *ctx, uint32_t qpn) .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = 
IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; gds_send_wr *bad_ewr; return gds_post_send(ctx->gds_qp, &ewr, &bad_ewr); @@ -538,16 +537,15 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, uint32_t qpn, CUstream .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; gds_send_wr *bad_ewr; return gds_stream_queue_send(*p_gpu_stream, ctx->gds_qp, &ewr, &bad_ewr); @@ -565,16 +563,15 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, uint32_t qpn, gds_s .wr_id = PINGPONG_SEND_WRID, .sg_list = &list, .num_sge = 1, - .exp_opcode = IBV_EXP_WR_SEND, - .exp_send_flags = IBV_EXP_SEND_SIGNALED, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, .wr = { .ud = { .ah = ctx->ah, .remote_qpn = qpn, .remote_qkey = 0x11111111 } - }, - .comp_mask = 0 + } }; gds_send_wr *bad_ewr; //printf("gpu_post_send_on_stream\n"); @@ -655,7 +652,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].send = &wdesc->send_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->send_cq, &wdesc->wait_tx_rq, 0); if (ret) { retcode = -ret; break; @@ -665,7 +662,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin wdesc->descs[k].wait = &wdesc->wait_tx_rq; ++k; - ret = gds_prepare_wait_cq(&ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); + ret = gds_prepare_wait_cq(ctx->gds_qp->recv_cq, &wdesc->wait_rx_rq, 0); if (ret) { retcode = -ret; break; @@ -697,7 +694,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin break; } - ret = gds_stream_wait_cq(gpu_stream_server, &ctx->gds_qp->send_cq, 0); + ret = gds_stream_wait_cq(gpu_stream_server, ctx->gds_qp->send_cq, 0); if (ret) { // TODO: rollback gpu send gpu_err("error %d in gds_stream_wait_cq\n", ret); @@ -705,7 +702,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin break; } - ret = gds_stream_wait_cq(gpu_stream_server, &ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); + ret = gds_stream_wait_cq(gpu_stream_server, ctx->gds_qp->recv_cq, ctx->consume_rx_cqe); if (ret) { // TODO: rollback gpu send and wait send_cq gpu_err("error %d in gds_stream_wait_cq\n", ret); From b1cab0511afe6efe5227e00ed7e790e064884394 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Wed, 11 Aug 2021 04:06:38 -0400 Subject: [PATCH 15/50] Moved gds_wait_request to gds_mlx5_exp_wait_request and made the former an opaque struct --- include/gdsync/core.h | 7 +++++-- src/mlx5-exp.hpp | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index a74299d..30fc977 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -175,8 +175,11 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req */ typedef struct gds_wait_request { - struct ibv_exp_peer_peek peek; - struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS]; + gds_driver_type_t dtype; + uint8_t pad0[4]; + uint8_t reserved0[40]; + uint8_t reserved1[56 * GDS_WAIT_INFO_MAX_OPS]; + uint8_t pad1[16]; } gds_wait_request_t; /** diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp index 
d289c83..a92e0e6 100644
--- a/src/mlx5-exp.hpp
+++ b/src/mlx5-exp.hpp
@@ -6,6 +6,7 @@
 #include 
 
 #include 
+#include 
 
 #include "objs.hpp"
 #include "utils.hpp"
@@ -20,6 +21,18 @@ typedef struct gds_mlx5_exp_qp {
     ibv_exp_res_domain *res_domain;
 } gds_mlx5_exp_qp_t;
 
+typedef struct gds_mlx5_exp_wait_request {
+    gds_driver_type_t dtype;
+    uint8_t pad0[4];
+    struct ibv_exp_peer_peek peek;
+    struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS];
+    uint8_t pad1[16];
+} gds_mlx5_exp_wait_request_t;
+
+static_assert(sizeof(gds_mlx5_exp_wait_request_t) % 64 == 0, "The size of gds_mlx5_exp_wait_request_t must be a multiple of 64 bytes.");
+static_assert(sizeof(gds_mlx5_exp_wait_request_t) <= sizeof(gds_wait_request_t), "The size of gds_mlx5_exp_wait_request_t must be less than or equal to that of gds_wait_request_t.");
+static_assert(offsetof(gds_mlx5_exp_wait_request_t, dtype) == offsetof(gds_wait_request_t, dtype), "dtype of gds_mlx5_exp_wait_request_t and gds_wait_request_t must be at the same offset.");
+
 static inline gds_mlx5_exp_cq_t *to_gds_mexp_cq(gds_cq_t *gcq) {
     assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
     return container_of(gcq, gds_mlx5_exp_cq_t, gcq);
@@ -30,6 +43,19 @@ static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) {
     return container_of(gqp, gds_mlx5_exp_qp_t, gqp);
 }
 
+static inline gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(gds_wait_request_t *gwreq) {
+    assert(gwreq->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
+    return (gds_mlx5_exp_wait_request_t *)(gwreq);
+}
+
+static inline const gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(const gds_wait_request_t *gwreq) {
+    return to_gds_mexp_wait_request(const_cast<gds_wait_request_t *>(gwreq));
+}
+
+static inline uint32_t gds_mlx5_exp_get_num_wait_request_entries(gds_mlx5_exp_wait_request_t *gmexp_request) {
+    return gmexp_request->peek.entries;
+}
+
 gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq(
     struct ibv_context *context, int cqe,
     void *cq_context, struct ibv_comp_channel *channel,
     int comp_vector, gds_peer *peer, gds_peer_attr *peer_attr, gds_alloc_cq_flags_t flags,
     struct ibv_exp_res_domain *res_domain);
@@ -46,3 +72,12 @@ int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp);
 int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr,
     gds_send_wr **bad_ewr,
     gds_send_request_t *request);
+
+void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset);
+void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t count);
+int gds_mlx5_exp_prepare_wait_cq(gds_mlx5_exp_cq_t *mexpcq, gds_mlx5_exp_wait_request_t *request, int flags);
+int gds_mlx5_exp_append_wait_cq(gds_mlx5_exp_wait_request_t *request, uint32_t *dw, uint32_t val);
+int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_request_t *request);
+int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, gds_mlx5_exp_wait_request_t *request, gds_op_list_t &params, int flags);
+int gds_mlx5_exp_post_wait_descriptor(gds_mlx5_exp_wait_request_t *request, int flags);
+int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp_wait_request_t *request);

From 17b246f2ec485f97d324a36f6937b8c02dd646a4 Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Wed, 11 Aug 2021 04:07:49 -0400
Subject: [PATCH 16/50] Moved gds_wait_request related functions to mlx5-exp

---
 src/apis.cpp     |  88 ++++++---------
 src/gdsync.cpp   |  71 +-----------
 src/mlx5-exp.cpp | 280 +++++++++++++++++++++++++++++++++++++++++++++++
 src/mlx5.cpp     | 104 +-----------------
 src/utils.hpp    |   2 +
 5 files changed, 323 insertions(+), 222 deletions(-)

diff --git a/src/apis.cpp b/src/apis.cpp
index 0801771..e9ad2ed 100644
--- a/src/apis.cpp +++ b/src/apis.cpp @@ -56,7 +56,7 @@ //----------------------------------------------------------------------------- -static void gds_init_ops(struct peer_op_wr *op, int count) +void gds_init_ops(struct peer_op_wr *op, int count) { int i = count; while (--i) @@ -80,13 +80,15 @@ static void gds_init_send_info(gds_send_request_t *info) static void gds_init_wait_request(gds_wait_request_t *request, uint32_t offset) { + gds_mlx5_exp_wait_request_t *gmexp_request; gds_dbg("wait_request=%p offset=%08x\n", request, offset); memset(request, 0, sizeof(*request)); - request->peek.storage = request->wr; - request->peek.entries = sizeof(request->wr)/sizeof(request->wr[0]); - request->peek.whence = IBV_EXP_PEER_PEEK_ABSOLUTE; - request->peek.offset = offset; - gds_init_ops(request->peek.storage, request->peek.entries); + + request->dtype = GDS_DRIVER_TYPE_MLX5_EXP; + + gmexp_request = to_gds_mexp_wait_request(request); + + gds_mlx5_exp_init_wait_request(gmexp_request, offset); } //----------------------------------------------------------------------------- @@ -273,7 +275,9 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags) { - int retcode = 0; + gds_mlx5_exp_cq_t *gmexpcq; + gds_mlx5_exp_wait_request_t *gmexp_request; + if (flags != 0) { gds_err("invalid flags != 0\n"); return EINVAL; @@ -281,51 +285,19 @@ int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flag gds_init_wait_request(request, cq->curr_offset++); - retcode = ibv_exp_peer_peek_cq(cq->cq, &request->peek); - if (retcode == -ENOSPC) { - // TODO: handle too few entries - gds_err("not enough ops in peer_peek_cq\n"); - goto out; - } else if (retcode) { - gds_err("error %d in peer_peek_cq\n", retcode); - goto out; - } - //gds_dump_wait_request(request, 1); - out: - return retcode; + gmexpcq = to_gds_mexp_cq(cq); + gmexp_request = to_gds_mexp_wait_request(request); + + return gds_mlx5_exp_prepare_wait_cq(gmexpcq, gmexp_request, flags); } //----------------------------------------------------------------------------- int gds_append_wait_cq(gds_wait_request_t *request, uint32_t *dw, uint32_t val) { - int ret = 0; - unsigned MAX_NUM_ENTRIES = sizeof(request->wr)/sizeof(request->wr[0]); - unsigned n = request->peek.entries; - struct peer_op_wr *wr = request->peek.storage; - - if (n + 1 > MAX_NUM_ENTRIES) { - gds_err("no space left to stuff a poke\n"); - ret = ENOMEM; - goto out; - } + gds_mlx5_exp_wait_request_t *gmexp_request = to_gds_mexp_wait_request(request); - // at least 1 op - assert(n); - assert(wr); - - for (; n; --n) wr = wr->next; - assert(wr); - - wr->type = IBV_EXP_PEER_OP_STORE_DWORD; - wr->wr.dword_va.data = val; - wr->wr.dword_va.target_id = 0; // direct mapping, offset IS the address - wr->wr.dword_va.offset = (ptrdiff_t)(dw-(uint32_t*)0); - - ++request->peek.entries; - - out: - return ret; + return gds_mlx5_exp_append_wait_cq(gmexp_request, dw, val); } //----------------------------------------------------------------------------- @@ -346,12 +318,16 @@ int gds_stream_post_wait_cq_all(CUstream stream, int count, gds_wait_request_t * static int gds_abort_wait_cq(struct gds_cq *cq, gds_wait_request_t *request) { + gds_mlx5_exp_cq_t *gmexpcq; + gds_mlx5_exp_wait_request_t *gmexp_request; + assert(cq); assert(request); - struct ibv_exp_peer_abort_peek abort_ctx; - abort_ctx.peek_id = request->peek.peek_id; - abort_ctx.comp_mask = 0; - return ibv_exp_peer_abort_peek_cq(cq->cq, 
&abort_ctx); + + gmexpcq = to_gds_mexp_cq(cq); + gmexp_request = to_gds_mexp_wait_request(request); + + return gds_mlx5_exp_abort_wait_cq(gmexpcq, gmexp_request); } //----------------------------------------------------------------------------- @@ -549,7 +525,7 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem n_mem_ops += desc->send->commit.entries + 2; // extra space, ugly break; case GDS_TAG_WAIT: - n_mem_ops += desc->wait->peek.entries + 2; // ditto + n_mem_ops += gds_mlx5_exp_get_num_wait_request_entries(to_gds_mexp_wait_request(desc->wait)) + 2; // ditto break; case GDS_TAG_WAIT_VALUE32: case GDS_TAG_WRITE_VALUE32: @@ -618,15 +594,15 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ break; } case GDS_TAG_WAIT: { - gds_wait_request_t *wreq = desc->wait; + gds_mlx5_exp_wait_request_t *wreq = to_gds_mexp_wait_request(desc->wait); int flags = 0; if (move_flush && i != last_wait) { gds_dbg("discarding FLUSH!\n"); flags = GDS_POST_OPS_DISCARD_WAIT_FLUSH; } - retcode = gds_post_ops(peer, wreq->peek.entries, wreq->peek.storage, params, flags); + retcode = gds_mlx5_exp_stream_post_wait_descriptor(peer, wreq, params, flags); if (retcode) { - gds_err("error %d in gds_post_ops\n", retcode); + gds_err("error %d in gds_mlx5_exp_stream_post_wait_descriptor\n", retcode); ret = retcode; goto out; } @@ -697,10 +673,10 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) } case GDS_TAG_WAIT: { gds_dbg("desc[%zu] WAIT\n", i); - gds_wait_request_t *wreq = desc->wait; - retcode = gds_post_ops_on_cpu(wreq->peek.entries, wreq->peek.storage, flags); + gds_mlx5_exp_wait_request_t *wreq = to_gds_mexp_wait_request(desc->wait); + retcode = gds_mlx5_exp_post_wait_descriptor(wreq, flags); if (retcode) { - gds_err("error %d in gds_post_ops_on_cpu\n", retcode); + gds_err("error %d in gds_mlx5_exp_post_wait_descriptor\n", retcode); ret = retcode; goto out; } diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 23907fe..670bc45 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1169,73 +1169,14 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin //----------------------------------------------------------------------------- -static void gds_dump_ops(struct peer_op_wr *op, size_t count) -{ - size_t n = 0; - for (; op; op = op->next, ++n) { - gds_dbg("op[%zu] type:%d\n", n, op->type); - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - gds_dbg("STORE_QWORD data:%x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - op->wr.dword_va.data, op->wr.dword_va.target_id, - op->wr.dword_va.offset, dev_ptr); - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - gds_dbg("STORE_QWORD data:%" PRIx64 " target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - op->wr.qword_va.data, op->wr.qword_va.target_id, - op->wr.qword_va.offset, dev_ptr); - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - gds_dbg("COPY_BLOCK src:%p len:%zu target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - op->wr.copy_op.src, op->wr.copy_op.len, - op->wr.copy_op.target_id, op->wr.copy_op.offset, - dev_ptr); - break; - } - case 
IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - gds_dbg("%s data:%08x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", - (op->type==IBV_EXP_PEER_OP_POLL_AND_DWORD) ? "POLL_AND_DW" : "POLL_NOR_SDW", - op->wr.dword_va.data, - op->wr.dword_va.target_id, - op->wr.dword_va.offset, - dev_ptr); - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - break; - } - } - - assert(count == n); -} - -//----------------------------------------------------------------------------- - void gds_dump_wait_request(gds_wait_request_t *request, size_t count) { - for (size_t j=0; j<count; ++j) { - struct ibv_exp_peer_peek *peek = &request[j].peek; - gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%" PRIx64 " comp_mask:%08x\n", - j, peek->entries, peek->whence, peek->offset, - peek->peek_id, peek->comp_mask); - gds_dump_ops(peek->storage, peek->entries); - } + gds_mlx5_exp_wait_request_t *gmexp_request; + if (count == 0) + return; + + gmexp_request = to_gds_mexp_wait_request(request); + gds_mlx5_exp_dump_wait_request(gmexp_request, count); } //----------------------------------------------------------------------------- diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp index f76cf12..8d7e939 100644 --- a/src/mlx5-exp.cpp +++ b/src/mlx5-exp.cpp @@ -282,3 +282,283 @@ int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, return ret; } +//----------------------------------------------------------------------------- + +void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset) +{ + gds_dbg("wait_request=%p offset=%08x\n", request, offset); + request->peek.storage = request->wr; + request->peek.entries = sizeof(request->wr)/sizeof(request->wr[0]); + request->peek.whence = IBV_EXP_PEER_PEEK_ABSOLUTE; + request->peek.offset = offset; + gds_init_ops(request->peek.storage, request->peek.entries); +} + +//----------------------------------------------------------------------------- + +static void gds_mlx5_exp_dump_ops(struct peer_op_wr *op, size_t count) +{ + size_t n = 0; + for (; op; op = op->next, ++n) { + gds_dbg("op[%zu] type:%d\n", n, op->type); + switch(op->type) { + case IBV_EXP_PEER_OP_FENCE: { + gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + break; + } + case IBV_EXP_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + gds_dbg("STORE_DWORD data:%x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + op->wr.dword_va.data, op->wr.dword_va.target_id, + op->wr.dword_va.offset, dev_ptr); + break; + } + case IBV_EXP_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + gds_dbg("STORE_QWORD data:%" PRIx64 " target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + op->wr.qword_va.data, op->wr.qword_va.target_id, + op->wr.qword_va.offset, dev_ptr); + break; + } + case IBV_EXP_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + gds_dbg("COPY_BLOCK src:%p len:%zu target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + op->wr.copy_op.src, op->wr.copy_op.len, + op->wr.copy_op.target_id, op->wr.copy_op.offset, + dev_ptr); + break; + } + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + gds_dbg("%s data:%08x target_id:%" PRIx64 " offset:%zu dev_ptr=%llx\n", + (op->type==IBV_EXP_PEER_OP_POLL_AND_DWORD) ?
"POLL_AND_DW" : "POLL_NOR_DW", + op->wr.dword_va.data, + op->wr.dword_va.target_id, + op->wr.dword_va.offset, + dev_ptr); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + break; + } + } + + assert(count == n); +} + +//----------------------------------------------------------------------------- + +void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t count) +{ + for (size_t j = 0; j < count; ++j) { + struct ibv_exp_peer_peek *peek = &request[j].peek; + gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%" PRIx64 " comp_mask:%08x\n", + j, peek->entries, peek->whence, peek->offset, + peek->peek_id, peek->comp_mask); + gds_mlx5_exp_dump_ops(peek->storage, peek->entries); + } +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_prepare_wait_cq(gds_mlx5_exp_cq_t *mexpcq, gds_mlx5_exp_wait_request_t *request, int flags) +{ + int retcode = 0; + + retcode = ibv_exp_peer_peek_cq(mexpcq->gcq.cq, &request->peek); + if (retcode == ENOSPC) { + // TODO: handle too few entries + gds_err("not enough ops in peer_peek_cq\n"); + goto out; + } else if (retcode) { + gds_err("error %d in peer_peek_cq\n", retcode); + goto out; + } + //gds_dump_wait_request(request, 1); + out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_append_wait_cq(gds_mlx5_exp_wait_request_t *request, uint32_t *dw, uint32_t val) +{ + int ret = 0; + unsigned MAX_NUM_ENTRIES = sizeof(request->wr) / sizeof(request->wr[0]); + unsigned n = request->peek.entries; + struct peer_op_wr *wr = request->peek.storage; + + if (n + 1 > MAX_NUM_ENTRIES) { + gds_err("no space left to stuff a poke\n"); + ret = ENOMEM; + goto out; + } + + // at least 1 op + assert(n); + assert(wr); + + for (; n; --n) + wr = wr->next; + + assert(wr); + + wr->type = IBV_EXP_PEER_OP_STORE_DWORD; + wr->wr.dword_va.data = val; + wr->wr.dword_va.target_id = 0; // direct mapping, offset IS the address + wr->wr.dword_va.offset = (ptrdiff_t)(dw-(uint32_t*)0); + + ++request->peek.entries; + +out: + return ret; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_request_t *request) +{ + struct ibv_exp_peer_abort_peek abort_ctx; + abort_ctx.peek_id = request->peek.peek_id; + abort_ctx.comp_mask = 0; + return ibv_exp_peer_abort_peek_cq(gmexpcq->gcq.cq, &abort_ctx); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, gds_mlx5_exp_wait_request_t *request, gds_op_list_t &params, int flags) +{ + int ret = 0; + + ret = gds_post_ops(peer, request->peek.entries, request->peek.storage, params, flags); + if (ret) + gds_err("error %d in gds_post_ops\n", ret); + + return ret; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_post_wait_descriptor(gds_mlx5_exp_wait_request_t *request, int flags) +{ + int ret = 0; + + ret = gds_post_ops_on_cpu(request->peek.entries, request->peek.storage, flags); + if (ret) + gds_err("error %d in gds_post_ops_on_cpu\n", ret); + + return ret; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp_wait_request_t *request) +{ + int retcode = 0; + size_t n_ops = request->peek.entries; + peer_op_wr *op =
request->peek.storage; + size_t n = 0; + + memset(mlx5_i, 0, sizeof(*mlx5_i)); + + for (; op && n < n_ops; op = op->next, ++n) { + switch(op->type) { + case IBV_EXP_PEER_OP_FENCE: { + gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + break; + } + if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + case IBV_EXP_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); + if (n != 1) { + gds_err("store DWORD is not 2nd op\n"); + retcode = EINVAL; + break; + } + mlx5_i->flag_ptr = (uint32_t*)dev_ptr; + mlx5_i->flag_value = data; + break; + } + case IBV_EXP_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); + gds_err("unsupported QWORD op\n"); + retcode = EINVAL; + break; + } + case IBV_EXP_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + gds_err("unsupported COPY_BLOCK\n"); + retcode = EINVAL; + break; + } + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + + gds_dbg("OP_POLL_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); + + mlx5_i->cqe_ptr = (uint32_t *)dev_ptr; + mlx5_i->cqe_value = data; + + switch(op->type) { + case IBV_EXP_PEER_OP_POLL_NOR_DWORD: + // GPU SMs can always do NOR + mlx5_i->cond = GDS_WAIT_COND_NOR; + break; + case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + mlx5_i->cond = GDS_WAIT_COND_GEQ; + break; + case IBV_EXP_PEER_OP_POLL_AND_DWORD: + mlx5_i->cond = GDS_WAIT_COND_AND; + break; + default: + gds_err("unexpected op type\n"); + retcode = EINVAL; + goto err; + } + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + err: + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + break; + } + } + return retcode; +} diff --git a/src/mlx5.cpp b/src/mlx5.cpp index a2c7b39..b026374 100644 --- a/src/mlx5.cpp +++ b/src/mlx5.cpp @@ -40,6 +40,7 @@ //#include "mem.hpp" #include "objs.hpp" #include "utils.hpp" +#include "mlx5-exp.hpp" #if 0 union { uint64_t qw; uint32_t dw[2]; } db_val; @@ -180,107 +181,8 @@ int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_ml int gds_mlx5_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *request) { - int retcode = 0; - size_t n_ops = request->peek.entries; - peer_op_wr *op = request->peek.storage; - 
size_t n = 0; - - memset(mlx5_i, 0, sizeof(*mlx5_i)); - - for (; op && n < n_ops; op = op->next, ++n) { - switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { - gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { - gds_dbg("nothing to do for read fences\n"); - break; - } - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from fence\n"); - retcode = EINVAL; - break; - } - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - case IBV_EXP_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); - if (n != 1) { - gds_err("store DWORD is not 2nd op\n"); - retcode = EINVAL; - break; - } - mlx5_i->flag_ptr = (uint32_t*)dev_ptr; - mlx5_i->flag_value = data; - break; - } - case IBV_EXP_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - uint64_t data = op->wr.qword_va.data; - gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); - gds_err("unsupported QWORD op\n"); - retcode = EINVAL; - break; - } - case IBV_EXP_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - size_t len = op->wr.copy_op.len; - void *src = op->wr.copy_op.src; - gds_err("unsupported COPY_BLOCK\n"); - retcode = EINVAL; - break; - } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - - gds_dbg("OP_POLL_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); - - mlx5_i->cqe_ptr = (uint32_t *)dev_ptr; - mlx5_i->cqe_value = data; - - switch(op->type) { - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: - // GPU SMs can always do NOR - mlx5_i->cond = GDS_WAIT_COND_NOR; - break; - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - mlx5_i->cond = GDS_WAIT_COND_GEQ; - break; - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - mlx5_i->cond = GDS_WAIT_COND_AND; - break; - default: - gds_err("unexpected op type\n"); - retcode = EINVAL; - goto err; - } - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - err: - if (retcode) { - gds_err("error in fill func at entry n=%zu\n", n); - break; - } - } - return retcode; + const gds_mlx5_exp_wait_request_t *gmexp_request = to_gds_mexp_wait_request(request); + return gds_mlx5_exp_get_wait_descs(mlx5_i, gmexp_request); } //----------------------------------------------------------------------------- diff --git a/src/utils.hpp b/src/utils.hpp index 1bacbb6..c5d0774 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -240,6 +240,8 @@ static inline gds_driver_type gds_get_driver_type(struct ibv_device *ibdev) int gds_destroy_cq(struct gds_cq *gcq); +void gds_init_ops(struct peer_op_wr *op, int count); + 
//----------------------------------------------------------------------------- /* From 6be27ac863da603a2d7928b07e59750b6e02f5d0 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 04:41:13 -0400 Subject: [PATCH 17/50] Changed the definition of gds_send_request_t --- include/gdsync/core.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index 30fc977..fa213d3 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -161,9 +161,13 @@ enum { */ typedef struct gds_send_request { - struct ibv_exp_peer_commit commit; - struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS]; + gds_driver_type_t dtype; + uint8_t pad0[4]; + uint8_t reserved0[32]; + uint8_t reserved1[56 * GDS_SEND_INFO_MAX_OPS]; + uint8_t pad1[24]; } gds_send_request_t; +static_assert(sizeof(gds_send_request_t) % 64 == 0, "gds_send_request_t must be 64-byte aligned."); int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request); int gds_stream_post_send(CUstream stream, gds_send_request_t *request); @@ -181,6 +185,7 @@ typedef struct gds_wait_request { uint8_t reserved1[56 * GDS_WAIT_INFO_MAX_OPS]; uint8_t pad1[16]; } gds_wait_request_t; +static_assert(sizeof(gds_wait_request_t) % 64 == 0, "gds_wait_request_t must be 64-byte aligned."); /** * Initializes a wait request out of the next heading CQE, which is kept in From f02657839f251cd89d2ba4bc37e511cc464bfac1 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 04:41:56 -0400 Subject: [PATCH 18/50] Added gds_mlx5_exp_send_request_t definition and supported functions --- src/mlx5-exp.hpp | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp index a92e0e6..7e4ba14 100644 --- a/src/mlx5-exp.hpp +++ b/src/mlx5-exp.hpp @@ -21,6 +21,17 @@ typedef struct gds_mlx5_exp_qp { ibv_exp_res_domain *res_domain; } gds_mlx5_exp_qp_t; +typedef struct gds_mlx5_exp_send_request { + gds_driver_type_t dtype; + uint8_t pad0[4]; + struct ibv_exp_peer_commit commit; + struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS]; + uint8_t pad1[24]; +} gds_mlx5_exp_send_request_t; +static_assert(sizeof(gds_mlx5_exp_send_request_t) % 64 == 0, "gds_mlx5_exp_send_request_t must be 64-byte aligned."); +static_assert(sizeof(gds_mlx5_exp_send_request_t) <= sizeof(gds_send_request_t), "The size of gds_mlx5_exp_send_request_t must be less than or equal to that of gds_send_request_t."); +static_assert(offsetof(gds_mlx5_exp_send_request_t, dtype) == offsetof(gds_send_request_t, dtype), "dtype of gds_mlx5_exp_send_request_t and gds_send_request_t must be at the same offset."); + typedef struct gds_mlx5_exp_wait_request { gds_driver_type_t dtype; uint8_t pad0[4]; @@ -28,7 +39,6 @@ typedef struct gds_mlx5_exp_wait_request { struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS]; uint8_t pad1[16]; } gds_mlx5_exp_wait_request_t; - static_assert(sizeof(gds_mlx5_exp_wait_request_t) % 64 == 0, "gds_mlx5_exp_wait_request_t must be 64-byte aligned."); static_assert(sizeof(gds_mlx5_exp_wait_request_t) <= sizeof(gds_wait_request_t), "The size of gds_mlx5_exp_wait_request_t must be less than or equal to that of gds_wait_request_t."); static_assert(offsetof(gds_mlx5_exp_wait_request_t, dtype) == offsetof(gds_wait_request_t, dtype), "dtype of gds_mlx5_exp_wait_request_t and gds_wait_request_t must be at the same offset."); @@ -43,6 +53,15 @@ static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) { return container_of(gqp, 
gds_mlx5_exp_qp_t, gqp); } +static inline gds_mlx5_exp_send_request_t *to_gds_mexp_send_request(gds_send_request_t *gsreq) { + assert(gsreq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); + return (gds_mlx5_exp_send_request_t *)(gsreq); +} + +static inline const gds_mlx5_exp_send_request_t *to_gds_mexp_send_request(const gds_send_request_t *gsreq) { + return (const gds_mlx5_exp_send_request_t *)to_gds_mexp_send_request((const gds_send_request_t *)gsreq); +} + static inline gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(gds_wait_request_t *gwreq) { assert(gwreq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); return (gds_mlx5_exp_wait_request_t *)(gwreq); @@ -56,6 +75,10 @@ static inline uint32_t gds_mlx5_exp_get_num_wait_request_entries(gds_mlx5_exp_wa return gmexp_request->peek.entries; } +static inline uint32_t gds_mlx5_exp_get_num_send_request_entries(gds_mlx5_exp_send_request_t *gmexp_request) { + return gmexp_request->commit.entries; +} + gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, @@ -71,7 +94,12 @@ int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp); int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, - gds_send_request_t *request); + gds_mlx5_exp_send_request_t *request); + + +void gds_mlx5_exp_init_send_info(gds_mlx5_exp_send_request_t *info); +int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_mlx5_exp_send_request_t *info, gds_op_list_t &ops); +int gds_mlx5_exp_post_send_ops_on_cpu(gds_mlx5_exp_send_request_t *info, int flags = 0); void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset); void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t count); @@ -81,3 +109,5 @@ int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_req int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, gds_mlx5_exp_wait_request_t *request, gds_op_list_t &params, int flags); int gds_mlx5_exp_post_wait_descriptor(gds_mlx5_exp_wait_request_t *request, int flags); int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp_wait_request_t *request); + +int gds_mlx5_exp_rollback_qp(gds_mlx5_exp_qp_t *gmexpqp, gds_mlx5_exp_send_request_t *send_info); From 92b09f04923211e3bf6f968f369e4c4b3bc38dd6 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 04:42:33 -0400 Subject: [PATCH 19/50] Moved APIs that use gds_send_request_t to mlx5-exp.cpp --- src/apis.cpp | 58 ++++++++++++++++++------------------------------ src/gdsync.cpp | 6 +++-- src/mlx5-exp.cpp | 54 +++++++++++++++++++++++++++++++++++++++++++- src/mlx5.cpp | 3 ++- 4 files changed, 81 insertions(+), 40 deletions(-) diff --git a/src/apis.cpp b/src/apis.cpp index e9ad2ed..23577b0 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -68,12 +68,15 @@ void gds_init_ops(struct peer_op_wr *op, int count) static void gds_init_send_info(gds_send_request_t *info) { + gds_mlx5_exp_send_request_t *gmexp_info; gds_dbg("send_request=%p\n", info); memset(info, 0, sizeof(*info)); - info->commit.storage = info->wr; - info->commit.entries = sizeof(info->wr)/sizeof(info->wr[0]); - gds_init_ops(info->commit.storage, info->commit.entries); + info->dtype = GDS_DRIVER_TYPE_MLX5_EXP; + + gmexp_info = to_gds_mexp_send_request(info); + + gds_mlx5_exp_init_send_info(gmexp_info); } //----------------------------------------------------------------------------- @@ -93,37 +96,18 @@ static void gds_init_wait_request(gds_wait_request_t *request,
uint32_t offset) //----------------------------------------------------------------------------- -static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t * send_info, enum ibv_exp_rollback_flags flag) +static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t *send_info) { - struct ibv_exp_rollback_ctx rollback; - int ret=0; + gds_mlx5_exp_qp_t *gmexpqp; + gds_mlx5_exp_send_request_t *gmexp_sreq; assert(qp); - assert(qp->qp); assert(send_info); - if( - flag != IBV_EXP_ROLLBACK_ABORT_UNCOMMITED && - flag != IBV_EXP_ROLLBACK_ABORT_LATE - ) - { - gds_err("erroneous ibv_exp_rollback_flags flag input value\n"); - ret=EINVAL; - goto out; - } - - /* from ibv_exp_peer_commit call */ - rollback.rollback_id = send_info->commit.rollback_id; - /* from ibv_exp_rollback_flag */ - rollback.flags = flag; - /* Reserved for future expensions, must be 0 */ - rollback.comp_mask = 0; - gds_warn("Need to rollback WQE %lx\n", rollback.rollback_id); - ret = ibv_exp_rollback_qp(qp->qp, &rollback); - if(ret) - gds_err("error %d in ibv_exp_rollback_qp\n", ret); -out: - return ret; + gmexpqp = to_gds_mexp_qp(qp); + gmexp_sreq = to_gds_mexp_send_request(send_info); + + return gds_mlx5_exp_rollback_qp(gmexpqp, gmexp_sreq); } //----------------------------------------------------------------------------- @@ -141,7 +125,7 @@ int gds_post_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr) ret = gds_post_pokes_on_cpu(1, &send_info, NULL, 0); if (ret) { gds_err("error %d in gds_post_pokes_on_cpu\n", ret); - ret_roll = gds_rollback_qp(qp, &send_info, IBV_EXP_ROLLBACK_ABORT_LATE); + ret_roll = gds_rollback_qp(qp, &send_info); if (ret_roll) { gds_err("error %d in gds_rollback_qp\n", ret_roll); } @@ -180,6 +164,7 @@ int gds_prepare_send(struct gds_qp *gqp, gds_send_wr *p_ewr, { int ret = 0; gds_mlx5_exp_qp_t *gmexpqp; + gds_mlx5_exp_send_request_t *sreq; gds_init_send_info(request); assert(gqp); @@ -187,8 +172,9 @@ int gds_prepare_send(struct gds_qp *gqp, gds_send_wr *p_ewr, assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP); gmexpqp = to_gds_mexp_qp(gqp); + sreq = to_gds_mexp_send_request(request); - ret = gds_mlx5_exp_prepare_send(gmexpqp, p_ewr, bad_ewr, request); + ret = gds_mlx5_exp_prepare_send(gmexpqp, p_ewr, bad_ewr, sreq); if (ret) gds_err("Error %d in gds_mlx5_exp_prepare_send.\n", ret); @@ -522,7 +508,7 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem gds_descriptor_t *desc = descs + i; switch(desc->tag) { case GDS_TAG_SEND: - n_mem_ops += desc->send->commit.entries + 2; // extra space, ugly + n_mem_ops += gds_mlx5_exp_get_num_send_request_entries(to_gds_mexp_send_request(desc->send)) + 2; // extra space, ugly break; case GDS_TAG_WAIT: n_mem_ops += gds_mlx5_exp_get_num_wait_request_entries(to_gds_mexp_wait_request(desc->wait)) + 2; // ditto @@ -584,8 +570,8 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ gds_descriptor_t *desc = descs + i; switch(desc->tag) { case GDS_TAG_SEND: { - gds_send_request_t *sreq = desc->send; - retcode = gds_post_ops(peer, sreq->commit.entries, sreq->commit.storage, params); + gds_mlx5_exp_send_request_t *sreq = to_gds_mexp_send_request(desc->send); + retcode = gds_mlx5_exp_post_send_ops(peer, sreq, params); if (retcode) { gds_err("error %d in gds_post_ops\n", retcode); ret = retcode; @@ -662,8 +648,8 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) switch(desc->tag) { case GDS_TAG_SEND: { gds_dbg("desc[%zu] SEND\n", i); - gds_send_request_t *sreq = desc->send; 
- retcode = gds_post_ops_on_cpu(sreq->commit.entries, sreq->commit.storage, flags); + gds_mlx5_exp_send_request_t *sreq = to_gds_mexp_send_request(desc->send); + retcode = gds_mlx5_exp_post_send_ops_on_cpu(sreq, flags); if (retcode) { gds_err("error %d in gds_post_ops_on_cpu\n", retcode); ret = retcode; diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 670bc45..9f1ddde 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -997,8 +997,9 @@ int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_ } for (int j=0; jgqp.qp, p_ewr, bad_ewr); @@ -284,6 +284,31 @@ int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, //----------------------------------------------------------------------------- +void gds_mlx5_exp_init_send_info(gds_mlx5_exp_send_request_t *info) +{ + gds_dbg("send_request=%p\n", info); + + info->commit.storage = info->wr; + info->commit.entries = sizeof(info->wr)/sizeof(info->wr[0]); + gds_init_ops(info->commit.storage, info->commit.entries); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_mlx5_exp_send_request_t *info, gds_op_list_t &ops) +{ + return gds_post_ops(peer, info->commit.entries, info->commit.storage, ops, 0); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_post_send_ops_on_cpu(gds_mlx5_exp_send_request_t *info, int flags) +{ + return gds_post_ops_on_cpu(info->commit.entries, info->commit.storage, flags); +} + +//----------------------------------------------------------------------------- + void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset) { gds_dbg("wait_request=%p offset=%08x\n", request, offset); @@ -562,3 +587,30 @@ int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp } return retcode; } + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_rollback_qp(gds_mlx5_exp_qp_t *gmexpqp, gds_mlx5_exp_send_request_t *send_info) +{ + struct ibv_exp_rollback_ctx rollback; + int ret = 0; + enum ibv_exp_rollback_flags flag = IBV_EXP_ROLLBACK_ABORT_LATE; + + assert(gmexpqp); + assert(gmexpqp->gqp.qp); + assert(send_info); + + /* from ibv_exp_peer_commit call */ + rollback.rollback_id = send_info->commit.rollback_id; + /* from ibv_exp_rollback_flag */ + rollback.flags = flag; + /* Reserved for future extensions, must be 0 */ + rollback.comp_mask = 0; + gds_warn("Need to rollback WQE %lx\n", rollback.rollback_id); + ret = ibv_exp_rollback_qp(gmexpqp->gqp.qp, &rollback); + if (ret) + gds_err("error %d in ibv_exp_rollback_qp\n", ret); + +out: + return ret; +} diff --git a/src/mlx5.cpp b/src/mlx5.cpp index b026374..526f544 100644 --- a/src/mlx5.cpp +++ b/src/mlx5.cpp @@ -52,9 +52,10 @@ //----------------------------------------------------------------------------- -int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *request) +int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *_request) { int retcode = 0; + const gds_mlx5_exp_send_request_t *request = to_gds_mexp_send_request(_request); size_t n_ops = request->commit.entries; peer_op_wr *op = request->commit.storage; size_t n = 0; From e088b01ef8c7c6cb95998d7ebc002103ba381d2d Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 21:46:09 -0400 Subject: [PATCH 20/50] Removed include verbs_exp.h and peer_ops.h from public header files ---
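Note on patches 15-19 above: the public gds_send_request_t / gds_wait_request_t become fixed-size, 64-byte-multiple byte blobs tagged by a leading dtype, and the driver-private gds_mlx5_exp_* structs overlay that same storage in place, with static_asserts guarding the size bound and the dtype offset. Below is a minimal, self-contained sketch of this overlay pattern; the names (public_request, private_request, DRIVER_MLX5_EXP, to_private) are hypothetical stand-ins, not the library's actual definitions.

    // Sketch only: illustrates the opaque-blob + checked-overlay pattern
    // used by gds_wait_request_t / gds_mlx5_exp_wait_request_t above.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    enum driver_type : uint32_t { DRIVER_MLX5_EXP = 0 };

    struct public_request {            // public, opaque view (hypothetical)
        driver_type dtype;             // discriminator, shared by all views
        uint8_t     reserved[60];      // room for any driver's private layout
    };
    static_assert(sizeof(public_request) % 64 == 0, "keep a 64-byte multiple");

    struct private_request {           // driver-private view (hypothetical)
        driver_type dtype;
        uint32_t    entries;           // driver-specific payload lives here
    };
    static_assert(sizeof(private_request) <= sizeof(public_request),
                  "private view must fit inside the public blob");
    static_assert(offsetof(private_request, dtype) == offsetof(public_request, dtype),
                  "dtype must sit at the same offset in both views");

    // Checked downcast, in the spirit of to_gds_mexp_wait_request().
    static private_request *to_private(public_request *r)
    {
        assert(r->dtype == DRIVER_MLX5_EXP);
        return reinterpret_cast<private_request *>(r);
    }

    int main()
    {
        public_request r;
        std::memset(&r, 0, sizeof(r));  // like gds_init_wait_request()
        r.dtype = DRIVER_MLX5_EXP;
        to_private(&r)->entries = 4;    // driver code uses only its own view
        return 0;
    }

Keeping dtype at a fixed, asserted offset is what lets the checked downcasts dispatch on the driver type before reinterpreting the reserved storage.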
include/gdsync.h | 2 -- src/gdsync.cpp | 3 +++ src/objs.hpp | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/gdsync.h b/include/gdsync.h index 7d6a45b..f2ed858 100644 --- a/include/gdsync.h +++ b/include/gdsync.h @@ -33,8 +33,6 @@ */ #include -#include <infiniband/verbs_exp.h> -#include <infiniband/peer_ops.h> #include #include diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 9f1ddde..f951a54 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -33,6 +33,9 @@ #include #include +#include <infiniband/verbs_exp.h> +#include <infiniband/peer_ops.h> + #include #include diff --git a/src/objs.hpp b/src/objs.hpp index 796b6bd..a57e288 100644 --- a/src/objs.hpp +++ b/src/objs.hpp @@ -27,6 +27,9 @@ #pragma once +#include <infiniband/verbs_exp.h> +#include <infiniband/peer_ops.h> + static const size_t max_gpus = 16; typedef struct ibv_exp_peer_direct_attr gds_peer_attr; From 942aa8975f55c2518982dd1d69c463d29a3a360f Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 23:14:15 -0400 Subject: [PATCH 21/50] Replaced structs related to ibv_exp_ with gds_* structs --- src/gdsync.cpp | 8 +-- src/mlx5-exp.cpp | 6 +- src/objs.hpp | 165 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 171 insertions(+), 8 deletions(-) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index f951a54..2697dce 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1226,7 +1226,7 @@ int gds_stream_post_wait_cq_multi(CUstream stream, int count, gds_wait_request_t // If NULL returned then buffer will be allocated in system memory // by ibverbs driver. -static struct ibv_exp_peer_buf *gds_buf_alloc(ibv_exp_peer_buf_alloc_attr *attr) +static gds_peer_buf_t *gds_buf_alloc(gds_peer_buf_alloc_attr_t *attr) { assert(attr); gds_peer *peer = peer_from_id(attr->peer_id); @@ -1238,7 +1238,7 @@ static struct ibv_exp_peer_buf *gds_buf_alloc(ibv_exp_peer_buf_alloc_attr *attr) return peer->buf_alloc(peer->alloc_type, attr->length, attr->dir, attr->alignment, peer->alloc_flags); } -static int gds_buf_release(struct ibv_exp_peer_buf *pb) +static int gds_buf_release(gds_peer_buf_t *pb) { gds_dbg("freeing pb=%p\n", pb); gds_buf *buf = static_cast<gds_buf *>(pb); @@ -1247,14 +1247,14 @@ static int gds_buf_release(struct ibv_exp_peer_buf *pb) return 0; } -static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, struct ibv_exp_peer_buf *pb) +static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, gds_peer_buf_t *pb) { gds_peer *peer = peer_from_id(peer_id); gds_range *range = NULL; gds_dbg("start=%p length=%zu peer_id=%" PRIx64 " peer_buf=%p\n", start, length, peer_id, pb); - if (IBV_EXP_PEER_IOMEMORY == pb) { + if (GDS_PEER_IOMEMORY == pb) { // register as IOMEM range = peer->register_range(start, length, GDS_MEMORY_IO); } diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp index dfaf91c..b75f3cb 100644 --- a/src/mlx5-exp.cpp +++ b/src/mlx5-exp.cpp @@ -53,7 +53,8 @@ attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT; attr.flags = 0; // see ibv_exp_cq_create_flags - attr.peer_direct_attrs = peer_attr; + static_assert(sizeof(gds_peer_attr) == sizeof(struct
ibv_exp_peer_direct_attr)); + exp_qp_attr.peer_direct_attrs = (struct ibv_exp_peer_direct_attr *)peer_attr; exp_qp_attr.qp_type = qp_attr->qp_type; assert(sizeof(exp_qp_attr.cap) == sizeof(qp_attr->cap)); diff --git a/src/objs.hpp b/src/objs.hpp index a57e288..e524155 100644 --- a/src/objs.hpp +++ b/src/objs.hpp @@ -32,11 +32,172 @@ static const size_t max_gpus = 16; -typedef struct ibv_exp_peer_direct_attr gds_peer_attr; +/** + * Compatible with enum ibv_exp_peer_op + */ +enum gds_peer_op { + GDS_PEER_OP_RESERVED1 = 1, + + GDS_PEER_OP_FENCE = 0, + + GDS_PEER_OP_STORE_DWORD = 4, + GDS_PEER_OP_STORE_QWORD = 2, + GDS_PEER_OP_COPY_BLOCK = 3, + + GDS_PEER_OP_POLL_AND_DWORD = 12, + GDS_PEER_OP_POLL_NOR_DWORD = 13, + GDS_PEER_OP_POLL_GEQ_DWORD = 14, +}; + +/** + * Compatible with enum ibv_exp_peer_op_caps + */ +enum gds_peer_op_caps { + GDS_PEER_OP_FENCE_CAP = (1 << GDS_PEER_OP_FENCE), + GDS_PEER_OP_STORE_DWORD_CAP = (1 << GDS_PEER_OP_STORE_DWORD), + GDS_PEER_OP_STORE_QWORD_CAP = (1 << GDS_PEER_OP_STORE_QWORD), + GDS_PEER_OP_COPY_BLOCK_CAP = (1 << GDS_PEER_OP_COPY_BLOCK), + GDS_PEER_OP_POLL_AND_DWORD_CAP + = (1 << GDS_PEER_OP_POLL_AND_DWORD), + GDS_PEER_OP_POLL_NOR_DWORD_CAP + = (1 << GDS_PEER_OP_POLL_NOR_DWORD), + GDS_PEER_OP_POLL_GEQ_DWORD_CAP + = (1 << GDS_PEER_OP_POLL_GEQ_DWORD), +}; + + +/** + * Compatible with enum ibv_exp_peer_fence + */ +enum gds_peer_fence { + GDS_PEER_FENCE_OP_READ = (1 << 0), + GDS_PEER_FENCE_OP_WRITE = (1 << 1), + GDS_PEER_FENCE_FROM_CPU = (1 << 2), + GDS_PEER_FENCE_FROM_HCA = (1 << 3), + GDS_PEER_FENCE_MEM_SYS = (1 << 4), + GDS_PEER_FENCE_MEM_PEER = (1 << 5), +}; + +/** + * Indicate HW entities supposed to access memory buffer: + * GDS_PEER_DIRECTION_FROM_X means X writes to the buffer + * GDS_PEER_DIRECTION_TO_Y means Y read from the buffer + * + * Compatible with enum ibv_exp_peer_direction + */ +enum gds_peer_direction { + GDS_PEER_DIRECTION_FROM_CPU = (1 << 0), + GDS_PEER_DIRECTION_FROM_HCA = (1 << 1), + GDS_PEER_DIRECTION_FROM_PEER = (1 << 2), + GDS_PEER_DIRECTION_TO_CPU = (1 << 3), + GDS_PEER_DIRECTION_TO_HCA = (1 << 4), + GDS_PEER_DIRECTION_TO_PEER = (1 << 5), +}; + +/** + * Compatible with enum ibv_exp_peer_direct_attr_mask + */ +enum gds_peer_direct_attr_mask { + GDS_PEER_DIRECT_VERSION = (1 << 0) /* Must be set */ +}; + +/** + * Compatible with IBV_EXP_PEER_IOMEMORY + */ +#define GDS_PEER_IOMEMORY ((struct gds_buf *)-1UL) + +/** + * Compatible with struct ibv_exp_peer_buf_alloc_attr + */ +typedef struct gds_peer_buf_alloc_attr { + size_t length; + /* Bitmask from enum gds_peer_direction */ + uint32_t dir; + /* The ID of the peer device which will be + * * accessing the allocated buffer + * */ + uint64_t peer_id; + /* Data alignment */ + uint32_t alignment; + /* Reserved for future extensions, must be 0 */ + uint32_t comp_mask; +} gds_peer_buf_alloc_attr_t; + + +/** + * Compatible with struct ibv_exp_peer_buf + */ +typedef struct gds_peer_buf { + void *addr; + size_t length; + /* Reserved for future extensions, must be 0 */ + uint32_t comp_mask; +} gds_peer_buf_t; + +/** + * Compatible with struct ibv_exp_peer_direct_attr + */ +typedef struct { + /* Unique ID per peer device. + * Used to identify specific HW devices where relevant. + */ + uint64_t peer_id; + /* buf_alloc callback should return gds_peer_buf_t with buffer + * of at least attr->length. + * @attr: description of desired buffer + * + * Buffer should be mapped in the application address space + * for read/write (depends on attr->dir value). 
+ * attr->dir value is supposed to indicate the expected directions + * of access to the buffer, to allow optimization by the peer driver. + * If NULL returned then buffer will be allocated in system memory + * by ibverbs driver. + */ + gds_peer_buf_t *(*buf_alloc)(gds_peer_buf_alloc_attr_t *attr); + /* If buffer was allocated by buf_alloc then buf_release will be + * called to release it. + * @pb: struct returned by buf_alloc + * + * buf_release is responsible for releasing everything allocated by + * buf_alloc. + * Return 0 on success. + */ + int (*buf_release)(gds_peer_buf_t *pb); + /* register_va callback should register virtual address from the + * application as an area the peer is allowed to access. + * @start: pointer to beginning of region in virtual space + * @length: length of region + * @peer_id: the ID of the peer device which will be accessing + * the region. + * @pb: if registering a buffer that was returned from buf_alloc(), + * pb is the struct that was returned. If registering io memory area, + * pb is GDS_PEER_IOMEMORY. Otherwise - NULL + * + * Return id of registered address on success, 0 on failure. + */ + uint64_t (*register_va)(void *start, size_t length, uint64_t peer_id, + gds_peer_buf_t *pb); + /* If virtual address was registered with register_va then + * unregister_va will be called to unregister it. + * @target_id: id returned by register_va + * @peer_id: the ID of the peer device passed to register_va + * + * Return 0 on success. + */ + int (*unregister_va)(uint64_t target_id, uint64_t peer_id); + /* Bitmask from gds_peer_op_caps */ + uint64_t caps; + /* Maximal length of DMA operation the peer can do in copy-block */ + size_t peer_dma_op_map_len; + /* From gds_peer_direct_attr_mask */ + uint32_t comp_mask; + /* Feature version, must be 1 */ + uint32_t version; +} gds_peer_attr; struct gds_peer; -struct gds_buf: ibv_exp_peer_buf { +struct gds_buf: gds_peer_buf_t { gds_peer *peer; CUdeviceptr peer_addr; void *handle; From 35db8665b4c485fdf2de4ddd7db2796c97664abd Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 23:19:54 -0400 Subject: [PATCH 22/50] Removed include verbs_exp.h and peer_ops.h from objs.hpp --- src/mlx5-exp.hpp | 1 + src/objs.cpp | 8 ++++---- src/objs.hpp | 3 --- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp index 7e4ba14..cb43147 100644 --- a/src/mlx5-exp.hpp +++ b/src/mlx5-exp.hpp @@ -4,6 +4,7 @@ #include #include +#include #include #include diff --git a/src/objs.cpp b/src/objs.cpp index 1dac250..9c9f6ff 100644 --- a/src/objs.cpp +++ b/src/objs.cpp @@ -71,7 +71,7 @@ gds_buf *gds_peer::buf_alloc_cq(size_t length, uint32_t dir, uint32_t alignment, { gds_buf *buf = NULL; switch (dir) { - case (IBV_EXP_PEER_DIRECTION_FROM_HCA|IBV_EXP_PEER_DIRECTION_TO_PEER|IBV_EXP_PEER_DIRECTION_TO_CPU): + case (GDS_PEER_DIRECTION_FROM_HCA|GDS_PEER_DIRECTION_TO_PEER|GDS_PEER_DIRECTION_TO_CPU): // CQ buf if (GDS_ALLOC_CQ_ON_GPU == (flags & GDS_ALLOC_CQ_MASK)) { gds_dbg("allocating CQ on GPU mem\n"); @@ -80,14 +80,14 @@ gds_buf *gds_peer::buf_alloc_cq(size_t length, uint32_t dir, uint32_t alignment, gds_dbg("allocating CQ on Host mem\n"); } break; - case (IBV_EXP_PEER_DIRECTION_FROM_PEER|IBV_EXP_PEER_DIRECTION_TO_CPU): + case (GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_CPU): // CQ peer buf, helper buffer // on SYSMEM for the near future // GPU does a store to the 'busy' field as part of the peek_cq task // CPU polls on that field gds_dbg("allocating CQ peer buf on Host mem\n"); break; - case
(IBV_EXP_PEER_DIRECTION_FROM_PEER|IBV_EXP_PEER_DIRECTION_TO_HCA): + case (GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_HCA): gds_dbg("allocating CQ dbrec on Host mem\n"); break; default: @@ -101,7 +101,7 @@ gds_buf *gds_peer::buf_alloc_wq(size_t length, uint32_t dir, uint32_t alignment, { gds_buf *buf = NULL; switch (dir) { - case IBV_EXP_PEER_DIRECTION_FROM_PEER|IBV_EXP_PEER_DIRECTION_TO_HCA: + case GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_HCA: // dbrec if (GDS_ALLOC_DBREC_ON_GPU == (flags & GDS_ALLOC_DBREC_MASK)) { gds_dbg("allocating DBREC on GPU mem\n"); diff --git a/src/objs.hpp b/src/objs.hpp index e524155..71f4ea4 100644 --- a/src/objs.hpp +++ b/src/objs.hpp @@ -27,9 +27,6 @@ #pragma once -#include <infiniband/verbs_exp.h> -#include <infiniband/peer_ops.h> - static const size_t max_gpus = 16; From 24a0a20abb727f3c07491055b8154018427abfe0 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 9 Sep 2021 23:20:52 -0400 Subject: [PATCH 23/50] Removed include verbs_exp.h from objs.cpp --- src/objs.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/objs.cpp b/src/objs.cpp index 9c9f6ff..475c9d3 100644 --- a/src/objs.cpp +++ b/src/objs.cpp @@ -39,7 +39,6 @@ using namespace std; #include -#include <infiniband/verbs_exp.h> #include #include "gdsync.h" From 5d7c328a1240b3e81d9e632f699b9f310a4f880f Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 10 Sep 2021 01:17:58 -0400 Subject: [PATCH 24/50] Removed include verbs_exp.h and peer_ops.h from gdsync.cpp --- src/gdsync.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 2697dce..86d277d 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -33,9 +33,6 @@ #include #include -#include <infiniband/verbs_exp.h> -#include <infiniband/peer_ops.h> - #include #include From bf1a35807915f938e22d2c8acb95d2a3bf505cb9 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 10 Sep 2021 01:19:50 -0400 Subject: [PATCH 25/50] Replaced all IBV_EXP_ enum with GDS_ in gdsync.cpp --- src/gdsync.cpp | 86 +++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 86d277d..1af7ed6 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -783,13 +783,13 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis //int flags = 0; gds_dbg("op[%zu] type:%08x\n", n, op->type); switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { + case GDS_PEER_OP_FENCE: { gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + if (fence_op == GDS_PEER_FENCE_OP_READ) { gds_dbg("nothing to do for read fences\n"); //retcode = EINVAL; break; @@ -808,17 +808,17 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis //retcode = 0; } else { - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { gds_err("unexpected from fence\n"); retcode = EINVAL; break; } int flags = 0; - if
(fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); flags = GDS_MEMBAR_DEFAULT | GDS_MEMBAR_MLX5; } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); flags = GDS_MEMBAR_SYS | GDS_MEMBAR_MLX5; } @@ -832,7 +832,7 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis } break; } - case IBV_EXP_PEER_OP_STORE_DWORD: { + case GDS_PEER_OP_STORE_DWORD: { CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + op->wr.dword_va.offset; uint32_t data = op->wr.dword_va.data; @@ -867,7 +867,7 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis } break; } - case IBV_EXP_PEER_OP_STORE_QWORD: { + case GDS_PEER_OP_STORE_QWORD: { CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + op->wr.qword_va.offset; uint64_t data = op->wr.qword_va.data; @@ -910,7 +910,7 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis break; } - case IBV_EXP_PEER_OP_COPY_BLOCK: { + case GDS_PEER_OP_COPY_BLOCK: { CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + op->wr.copy_op.offset; size_t len = op->wr.copy_op.len; @@ -931,9 +931,9 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis retcode = gds_fill_inlcpy(peer, ops, dev_ptr, src, len, flags); break; } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { int poll_cond; CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + op->wr.dword_va.offset; @@ -946,13 +946,13 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%" PRIx32 " type=%" PRIx32 "\n", dev_ptr, data, (uint32_t)op->type); switch(op->type) { - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: poll_cond = GDS_WAIT_COND_NOR; break; - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: poll_cond = GDS_WAIT_COND_GEQ; break; - case IBV_EXP_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_AND_DWORD: poll_cond = GDS_WAIT_COND_AND; break; default: @@ -1036,28 +1036,28 @@ int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) gds_dbg("op[%zu]=%p\n", n, op); //gds_dbg("op[%zu]=%p type:%08x\n", n, op, op->type); switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { + case GDS_PEER_OP_FENCE: { gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + if (fence_op == GDS_PEER_FENCE_OP_READ) { gds_warnc(1, "nothing to do for read fences\n"); //retcode = EINVAL; break; } else { - if (fence_from != 
IBV_EXP_PEER_FENCE_FROM_HCA) { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); retcode = EINVAL; break; } - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); wmb(); } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); wmb(); } @@ -1069,7 +1069,7 @@ int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) } break; } - case IBV_EXP_PEER_OP_STORE_DWORD: { + case GDS_PEER_OP_STORE_DWORD: { uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); uint32_t data = op->wr.dword_va.data; // A || B || C || E @@ -1077,14 +1077,14 @@ int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) gds_atomic_set(ptr, data); break; } - case IBV_EXP_PEER_OP_STORE_QWORD: { + case GDS_PEER_OP_STORE_QWORD: { uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); uint64_t data = op->wr.qword_va.data; gds_dbg("STORE_QWORD ptr=%p data=%016" PRIx64 "\n", ptr, data); gds_atomic_set(ptr, data); break; } - case IBV_EXP_PEER_OP_COPY_BLOCK: { + case GDS_PEER_OP_COPY_BLOCK: { uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); uint64_t *src = (uint64_t*)op->wr.copy_op.src; size_t n_bytes = op->wr.copy_op.len; @@ -1092,9 +1092,9 @@ int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) gds_bf_copy(ptr, src, n_bytes); break; } - case IBV_EXP_PEER_OP_POLL_AND_DWORD: - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: { + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { int poll_cond; uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); uint32_t value = op->wr.dword_va.data; @@ -1106,13 +1106,13 @@ int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) do { uint32_t data = gds_atomic_get(ptr); switch(op->type) { - case IBV_EXP_PEER_OP_POLL_NOR_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: done = (0 != ~(data | value)); break; - case IBV_EXP_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: done = ((int32_t)data - (int32_t)value >= 0); break; - case IBV_EXP_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_AND_DWORD: done = (0 != (data & value)); break; default: @@ -1453,28 +1453,28 @@ static void gds_init_peer(gds_peer *peer, CUdevice dev, int gpu_id) peer->attr.register_va = gds_register_va; peer->attr.unregister_va = gds_unregister_va; - peer->attr.caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP | - IBV_EXP_PEER_OP_STORE_QWORD_CAP | - IBV_EXP_PEER_OP_FENCE_CAP | - IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP ); + peer->attr.caps = ( GDS_PEER_OP_STORE_DWORD_CAP | + GDS_PEER_OP_STORE_QWORD_CAP | + GDS_PEER_OP_FENCE_CAP | + GDS_PEER_OP_POLL_AND_DWORD_CAP ); if (peer->has_wait_nor) { gds_dbg("enabling NOR feature\n"); - peer->attr.caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP; + peer->attr.caps |= GDS_PEER_OP_POLL_NOR_DWORD_CAP; } else - peer->attr.caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP; + peer->attr.caps |= GDS_PEER_OP_POLL_GEQ_DWORD_CAP; if (peer->has_inlcpy) { gds_dbg("enabling COPY BLOCK feature\n"); - peer->attr.caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP; + peer->attr.caps |= GDS_PEER_OP_COPY_BLOCK_CAP; } else if 
(peer->has_write64 || gds_simulate_write64()) { gds_dbg("enabling STORE QWORD feature\n"); - peer->attr.caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP; + peer->attr.caps |= GDS_PEER_OP_STORE_QWORD_CAP; } gds_dbg("caps=%016lx\n", peer->attr.caps); peer->attr.peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE; - peer->attr.comp_mask = IBV_EXP_PEER_DIRECT_VERSION; + peer->attr.comp_mask = GDS_PEER_DIRECT_VERSION; peer->attr.version = 1; peer->tq = new task_queue; From f95c72d9ecf982f2e4714ee0ec1eb10356bc7513 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 10 Sep 2021 01:28:53 -0400 Subject: [PATCH 26/50] Replaced all IBV_EXP_ enum with GDS_ in mlx5.cpp --- src/mlx5.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/mlx5.cpp b/src/mlx5.cpp index 526f544..c67a2f6 100644 --- a/src/mlx5.cpp +++ b/src/mlx5.cpp @@ -64,25 +64,25 @@ int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request for (; op && n < n_ops; op = op->next, ++n) { switch(op->type) { - case IBV_EXP_PEER_OP_FENCE: { + case GDS_PEER_OP_FENCE: { gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER)); - if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) { + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + if (fence_op == GDS_PEER_FENCE_OP_READ) { gds_dbg("nothing to do for read fences\n"); break; } - if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { gds_err("unexpected from fence\n"); retcode = EINVAL; break; } - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); mlx5_i->membar = 1; } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); mlx5_i->membar_full = 1; } @@ -93,7 +93,7 @@ int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request } break; } - case IBV_EXP_PEER_OP_STORE_DWORD: { + case GDS_PEER_OP_STORE_DWORD: { CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + op->wr.dword_va.offset; uint32_t data = op->wr.dword_va.data; @@ -107,7 +107,7 @@ int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request mlx5_i->dbrec_value = data; break; } - case IBV_EXP_PEER_OP_STORE_QWORD: { + case GDS_PEER_OP_STORE_QWORD: { CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + op->wr.qword_va.offset; uint64_t data = op->wr.qword_va.data; @@ -121,7 +121,7 @@ int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request mlx5_i->db_value = data; break; } - case IBV_EXP_PEER_OP_COPY_BLOCK: { + case GDS_PEER_OP_COPY_BLOCK: { CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + op->wr.copy_op.offset; size_t len = op->wr.copy_op.len; @@ -136,9 +136,9 @@ int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request mlx5_i->db_value = *(uint64_t*)src; break; 
                }
-               case IBV_EXP_PEER_OP_POLL_AND_DWORD:
-               case IBV_EXP_PEER_OP_POLL_GEQ_DWORD:
-               case IBV_EXP_PEER_OP_POLL_NOR_DWORD: {
+               case GDS_PEER_OP_POLL_AND_DWORD:
+               case GDS_PEER_OP_POLL_GEQ_DWORD:
+               case GDS_PEER_OP_POLL_NOR_DWORD: {
                        gds_err("unexpected polling op in send request\n");
                        retcode = EINVAL;
                        break;

From 6a23931a2dcdf5c321eb7705f937bba22be7502b Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Fri, 10 Sep 2021 01:31:04 -0400
Subject: [PATCH 27/50] Replaced all IBV_EXP_ enum with GDS_ enum in
 gdsync_debug_hostregister_bug.cpp

---
 src/gdsync_debug_hostregister_bug.cpp | 56 +++++++++++++--------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/gdsync_debug_hostregister_bug.cpp b/src/gdsync_debug_hostregister_bug.cpp
index 1e36d08..63d0f3b 100644
--- a/src/gdsync_debug_hostregister_bug.cpp
+++ b/src/gdsync_debug_hostregister_bug.cpp
@@ -704,11 +704,11 @@ static int gds_post_ops(size_t n_ops, struct peer_op_wr *op, CUstreamBatchMemOpP
         switch(op->type) {
         case IBV_PEER_OP_FENCE: {
                 gds_dbg("OP_FENCE: fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags);
-                uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE));
-                uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA));
-                uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER));
+                uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE));
+                uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA));
+                uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER));
 
-                if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) {
+                if (fence_op == GDS_PEER_FENCE_OP_READ) {
                         gds_dbg("nothing to do for read fences\n");
                         //retcode = EINVAL;
                         break;
@@ -727,17 +727,17 @@
                         //retcode = 0;
                 }
                 else {
-                        if (fence_from != IBV_EXP_PEER_FENCE_FROM_HCA) {
+                        if (fence_from != GDS_PEER_FENCE_FROM_HCA) {
                                 gds_err("unexpected from fence\n");
                                 retcode = EINVAL;
                                 break;
                         }
                         int flags = 0;
-                        if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) {
+                        if (fence_mem == GDS_PEER_FENCE_MEM_PEER) {
                                 gds_dbg("using light membar\n");
                                 flags = GDS_MEMBAR_DEFAULT;
                         }
-                        else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) {
+                        else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) {
                                 gds_dbg("using heavy membar\n");
                                 flags = GDS_MEMBAR_SYS;
                         }
@@ -973,26 +973,26 @@ static int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op)
         switch(op->type) {
         case IBV_PEER_OP_FENCE: {
                 gds_dbg("fence_flags=%"PRIu64"\n", op->wr.fence.fence_flags);
-                uint32_t fence_op = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_OP_READ|IBV_EXP_PEER_FENCE_OP_WRITE));
-                uint32_t fence_from = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_FROM_CPU|IBV_EXP_PEER_FENCE_FROM_HCA));
-                uint32_t fence_mem = (op->wr.fence.fence_flags & (IBV_EXP_PEER_FENCE_MEM_SYS|IBV_EXP_PEER_FENCE_MEM_PEER));
+                uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE));
+                uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA));
+                uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER));
 
-                if (fence_op == IBV_EXP_PEER_FENCE_OP_READ) {
+                if (fence_op == GDS_PEER_FENCE_OP_READ) {
                         gds_warnc(1, "nothing to do for read fences\n");
                         //retcode = EINVAL;
                         break;
                 }
                 else {
-                        if (fence_from != 
IBV_EXP_PEER_FENCE_FROM_HCA) { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); retcode = EINVAL; break; } - if (fence_mem == IBV_EXP_PEER_FENCE_MEM_PEER) { + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { gds_dbg("using light membar\n"); wmb(); } - else if (fence_mem == IBV_EXP_PEER_FENCE_MEM_SYS) { + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { gds_dbg("using heavy membar\n"); wmb(); } @@ -1322,7 +1322,7 @@ static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, st gds_dbg("start=%p length=%zu peer_id=%"PRIx64" peer_buf=%p\n", start, length, peer_id, pb); - if (IBV_EXP_PEER_IOMEMORY == pb) { + if (GDS_PEER_IOMEMORY == pb) { // register as IOMEM range = peer->register_range(start, length, GDS_MEMORY_IO); } @@ -1375,25 +1375,25 @@ static void gds_init_peer_attr(gds_peer_attr *attr, gds_peer *peer) attr->register_va = gds_register_va; attr->unregister_va = gds_unregister_va; - attr->caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP | - IBV_EXP_PEER_OP_STORE_QWORD_CAP | - IBV_EXP_PEER_OP_FENCE_CAP | - IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP ); + attr->caps = ( GDS_PEER_OP_STORE_DWORD_CAP | + GDS_PEER_OP_STORE_QWORD_CAP | + GDS_PEER_OP_FENCE_CAP | + GDS_PEER_OP_POLL_AND_DWORD_CAP ); if (gpu_does_support_nor(peer)) - attr->caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP; + attr->caps |= GDS_PEER_OP_POLL_NOR_DWORD_CAP; else - attr->caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP; + attr->caps |= GDS_PEER_OP_POLL_GEQ_DWORD_CAP; if (gds_enable_inlcpy()) { - attr->caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP; + attr->caps |= GDS_PEER_OP_COPY_BLOCK_CAP; } else if (gds_enable_write64() || gds_simulate_write64()) { - attr->caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP; + attr->caps |= GDS_PEER_OP_STORE_QWORD_CAP; } gds_dbg("caps=%016lx\n", attr->caps); attr->peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE; - attr->comp_mask = IBV_EXP_PEER_DIRECT_VERSION; + attr->comp_mask = GDS_PEER_DIRECT_VERSION; attr->version = 1; gds_dbg("peer_attr: peer_id=%"PRIx64"\n", attr->peer_id); @@ -1536,13 +1536,13 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds // the CQE without updating the tracking variables if (flags & GDS_CREATE_QP_GPU_INVALIDATE_RX_CQ) { gds_warn("IGNORE_RQ_OVERFLOW\n"); - qp_attr->exp_create_flags |= IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW; - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + qp_attr->exp_create_flags |= GDS_QP_CREATE_IGNORE_RQ_OVERFLOW; + qp_attr->comp_mask |= GDS_QP_INIT_ATTR_CREATE_FLAGS; } if (flags & GDS_CREATE_QP_GPU_INVALIDATE_TX_CQ) { gds_warn("IGNORE_SQ_OVERFLOW\n"); - qp_attr->exp_create_flags |= IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW; - qp_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + qp_attr->exp_create_flags |= GDS_QP_CREATE_IGNORE_SQ_OVERFLOW; + qp_attr->comp_mask |= GDS_QP_INIT_ATTR_CREATE_FLAGS; } gds_dbg("before gds_register_peer_ex\n"); From ab745dea2d5772c9a12fabc6587bfed85832985f Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 10 Sep 2021 01:37:10 -0400 Subject: [PATCH 28/50] Fixed bugs in gds_dump_wait_request --- src/gdsync.cpp | 12 +++++++----- src/mlx5-exp.cpp | 14 ++++++-------- src/mlx5-exp.hpp | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 1af7ed6..b01cd36 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1173,12 +1173,14 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin void gds_dump_wait_request(gds_wait_request_t *request, size_t count) { - 
gds_mlx5_exp_wait_request_t *gmexp_request;
-        if (count == 0)
-                return;
+        for (size_t j = 0; j < count; ++j) {
+                gds_mlx5_exp_wait_request_t *gmexp_request;
+                if (count == 0)
+                        return;
 
-        gmexp_request = to_gds_mexp_wait_request(request);
-        gds_mlx5_exp_dump_wait_request(gmexp_request, count);
+                gmexp_request = to_gds_mexp_wait_request(&request[j]);
+                gds_mlx5_exp_dump_wait_request(gmexp_request, j);
+        }
 }
 
 //-----------------------------------------------------------------------------
diff --git a/src/mlx5-exp.cpp b/src/mlx5-exp.cpp
index b75f3cb..7806438 100644
--- a/src/mlx5-exp.cpp
+++ b/src/mlx5-exp.cpp
@@ -381,15 +381,13 @@ static void gds_mlx5_exp_dump_ops(struct peer_op_wr *op, size_t count)
 
 //-----------------------------------------------------------------------------
 
-void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t count)
+void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t idx)
 {
-        for (size_t j = 0; j < count; ++j) {
-                struct ibv_exp_peer_peek *peek = &request[j].peek;
-                gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%" PRIx64 " comp_mask:%08x\n",
-                        j, peek->entries, peek->whence, peek->offset,
-                        peek->peek_id, peek->comp_mask);
-                gds_mlx5_exp_dump_ops(peek->storage, peek->entries);
-        }
+        struct ibv_exp_peer_peek *peek = &request->peek;
+        gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%" PRIx64 " comp_mask:%08x\n",
+                idx, peek->entries, peek->whence, peek->offset,
+                peek->peek_id, peek->comp_mask);
+        gds_mlx5_exp_dump_ops(peek->storage, peek->entries);
 }
 
 //-----------------------------------------------------------------------------
diff --git a/src/mlx5-exp.hpp b/src/mlx5-exp.hpp
index cb43147..1d67137 100644
--- a/src/mlx5-exp.hpp
+++ b/src/mlx5-exp.hpp
@@ -103,7 +103,7 @@ int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_mlx5_exp_send_request_t *info
 int gds_mlx5_exp_post_send_ops_on_cpu(gds_mlx5_exp_send_request_t *info, int flags = 0);
 
 void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset);
-void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t count);
+void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t idx);
 int gds_mlx5_exp_prepare_wait_cq(gds_mlx5_exp_cq_t *mexpcq, gds_mlx5_exp_wait_request_t *request, int flags);
 int gds_mlx5_exp_append_wait_cq(gds_mlx5_exp_wait_request_t *request, uint32_t *dw, uint32_t val);
 int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_request_t *request);
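Restating the fix above in isolation: gds_dump_wait_request used to convert the
array's base pointer once and hand the whole count to the driver-specific
helper; the fix moves the iteration into the generic wrapper, converting each
element and passing its own index down. A minimal sketch (the wrapper name
dump_all_wait_requests is hypothetical, and it assumes the library's internal
headers for the other identifiers):

        /* Hypothetical illustration: convert and dump one element at a time
         * instead of dumping from the base pointer `count` times. */
        static void dump_all_wait_requests(gds_wait_request_t *requests, size_t count)
        {
                for (size_t j = 0; j < count; ++j)
                        gds_mlx5_exp_dump_wait_request(
                                to_gds_mexp_wait_request(&requests[j]), j);
        }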
From f43f905e8b4dfa2d7fcb027ab02cbdc1b0a2a2c0 Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Fri, 10 Sep 2021 01:43:44 -0400
Subject: [PATCH 29/50] Updated some functions in
 gdsync_debug_hostregister_bug.cpp to reflect changes in gdsync.cpp

---
 src/gdsync_debug_hostregister_bug.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/gdsync_debug_hostregister_bug.cpp b/src/gdsync_debug_hostregister_bug.cpp
index 63d0f3b..2537a74 100644
--- a/src/gdsync_debug_hostregister_bug.cpp
+++ b/src/gdsync_debug_hostregister_bug.cpp
@@ -41,6 +41,7 @@
 #include "objs.hpp"
 #include "archutils.h"
 #include "mlnxutils.h"
+#include "mlx5-exp.hpp"
 
 //-----------------------------------------------------------------------------
 
@@ -1233,12 +1234,13 @@ static void gds_dump_ops(struct peer_op_wr *op, size_t count)
 
 void gds_dump_wait_request(gds_wait_request_t *request, size_t count)
 {
-        for (size_t j=0; j<count; ++j) {
-                struct ibv_exp_peer_peek *peek = &request[j].peek;
-                gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%"PRIx64" comp_mask:%08x\n",
-                        j, peek->entries, peek->whence, peek->offset,
-                        peek->peek_id, peek->comp_mask);
-                gds_dump_ops(peek->storage, peek->entries);
+        for (size_t j = 0; j < count; ++j) {
+                gds_mlx5_exp_wait_request_t *gmexp_request;
+                if (count == 0)
+                        return;
+
+                gmexp_request = to_gds_mexp_wait_request(&request[j]);
+                gds_mlx5_exp_dump_wait_request(gmexp_request, j);
         }
 }
 
@@ -1315,7 +1317,7 @@ static int gds_buf_release(struct ibv_peer_buf *pb)
         return 0;
 }
 
-static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, struct ibv_exp_peer_buf *pb)
+static uint64_t gds_register_va(void *start, size_t length, uint64_t peer_id, gds_peer_buf_t *pb)
 {
         gds_peer *peer = peer_from_id(peer_id);
         gds_range *range = NULL;

From aeb1ed7da448b826d83a8b7dd71877f213058c83 Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Fri, 10 Sep 2021 01:52:03 -0400
Subject: [PATCH 30/50] Removed include verbs_exp.h and peer_ops.h from all
 files except mlx5-exp.*

---
 src/apis.cpp         | 1 -
 src/mem.cpp          | 1 -
 src/memmgr.cpp       | 1 -
 tests/gds_poll_lat.c | 1 -
 tests/gds_sanity.cpp | 1 -
 tests/gpu.cpp        | 1 -
 tests/gpu.h          | 1 -
 tests/pingpong.h     | 1 -
 8 files changed, 8 deletions(-)

diff --git a/src/apis.cpp b/src/apis.cpp
index 23577b0..69e491c 100644
--- a/src/apis.cpp
+++ b/src/apis.cpp
@@ -40,7 +40,6 @@
 //using namespace std;
 
 //#include
-//#include
 //#include
 
 #include "gdsync.h"
diff --git a/src/mem.cpp b/src/mem.cpp
index 59a6af6..7cf3602 100644
--- a/src/mem.cpp
+++ b/src/mem.cpp
@@ -40,7 +40,6 @@
 using namespace std;
 
 #include
-#include
 #include
 
 #include "gdsync.h"
diff --git a/src/memmgr.cpp b/src/memmgr.cpp
index ab3e490..6d6a21c 100644
--- a/src/memmgr.cpp
+++ b/src/memmgr.cpp
@@ -40,7 +40,6 @@
 using namespace std;
 
 #include
-#include
 #include
 
 #include "gdsync.h"
diff --git a/tests/gds_poll_lat.c b/tests/gds_poll_lat.c
index cf2147b..17c4fdb 100644
--- a/tests/gds_poll_lat.c
+++ b/tests/gds_poll_lat.c
@@ -11,7 +11,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/tests/gds_sanity.cpp b/tests/gds_sanity.cpp
index 910032b..5394833 100644
--- a/tests/gds_sanity.cpp
+++ b/tests/gds_sanity.cpp
@@ -14,7 +14,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/tests/gpu.cpp b/tests/gpu.cpp
index 6d7da67..8bee5e3 100644
--- a/tests/gpu.cpp
+++ b/tests/gpu.cpp
@@ -29,7 +29,6 @@
 #include
 #include
-#include
 
 #include "gdrapi.h"
 #include "gdsync.h"
diff --git a/tests/gpu.h b/tests/gpu.h
index 401e88f..0fde885 100644
--- a/tests/gpu.h
+++ b/tests/gpu.h
@@ -28,7 +28,6 @@
 #pragma once
 
 #include
-#include
 
 #ifdef USE_PROFILE
diff --git a/tests/pingpong.h b/tests/pingpong.h
index 32f020b..9cdc03e 100644
--- a/tests/pingpong.h
+++ b/tests/pingpong.h
@@ -34,7 +34,6 @@
 #define IBV_PINGPONG_H
 
 #include
-#include
 
 enum ibv_mtu pp_mtu_to_enum(int mtu);
 uint16_t pp_get_local_lid(struct ibv_context *context, int port);
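The next two patches make the exp-verbs dependency optional: configure defines
HAVE_EXP_VERBS (and the COMPILE_EXP_VERBS automake conditional) only when both
the exp-verbs library symbol and the peer-direct header are detected. A minimal
sketch of how a source file could key off that define (the probe function is
hypothetical, and this assumes configure emits config.h via AC_CONFIG_HEADERS):

        #include "config.h"   /* generated by configure; may define HAVE_EXP_VERBS */

        /* Hypothetical probe: nonzero when an exp-verbs transport was compiled in. */
        static inline int gds_have_exp_verbs_transport(void)
        {
        #ifdef HAVE_EXP_VERBS
                return 1;
        #else
                return 0;
        #endif
        }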
From 59d20f5ef2ae8d6b7c4b5debad472fe28a55bfe9 Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Fri, 10 Sep 2021 00:36:27 -0700
Subject: [PATCH 31/50] Updated configure.ac and Makefile.am to make exp-verbs
 optional

---
 Makefile.am  |  9 +++++++--
 configure.ac | 17 +++++++++++++----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 27fce57..84a8085 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -23,10 +23,15 @@ libgdsyncincludedir = $(includedir)/gdsync
 libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h
 
 src_libgdsync_la_CFLAGS = $(AM_CFLAGS)
-src_libgdsync_la_SOURCES = src/gdsync.cpp src/memmgr.cpp src/mem.cpp src/objs.cpp src/apis.cpp src/mlx5.cpp src/mlx5-exp.cpp include/gdsync.h
+src_libgdsync_la_SOURCES = src/gdsync.cpp src/memmgr.cpp src/mem.cpp src/objs.cpp src/apis.cpp src/mlx5.cpp include/gdsync.h
 src_libgdsync_la_LDFLAGS = -version-info @VERSION_INFO@
 
-noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h src/mlx5-exp.hpp
+noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h
+
+if COMPILE_EXP_VERBS
+src_libgdsync_la_SOURCES += src/mlx5-exp.cpp
+noinst_HEADERS += src/mlx5-exp.hpp
+endif
 
 # if enabled at configure time
 
diff --git a/configure.ac b/configure.ac
index e20f313..dab8d6a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,11 +190,20 @@ dnl Checks for Verbs support
 AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
     AC_MSG_ERROR([ibv_get_device_list() not found.  libgdsync requires libibverbs.]))
-AC_CHECK_LIB(ibverbs, ibv_exp_create_qp,
-    AC_MSG_ERROR([ibv_exp_create_qp not found.  libgdsync requires verbs extension support.]))
+AC_CHECK_LIB(ibverbs, ibv_exp_create_qp, [have_exp_verbs=1])
+AC_CHECK_HEADER([infiniband/peer_ops.h], [have_peer_ops=1], [],
+[[
+#include
+]])
+
+if test "x$have_exp_verbs" != "x" && test "x$have_peer_ops" != "x"; then
+    AC_DEFINE([HAVE_EXP_VERBS], [1], [Define if exp-verbs exists.])
+    $enable_exp_verbs=1
+else
+    AC_MSG_WARN([This version of libgdsync cannot be used without exp-verbs.])
+fi
+AM_CONDITIONAL([COMPILE_EXP_VERBS], [test "x$enable_exp_verbs" != "x"])
 
-AC_CHECK_HEADER(infiniband/peer_ops.h, [],
-    AC_MSG_ERROR([<infiniband/peer_ops.h> not found.  libgdsync requires verbs peer-direct support.]))
 AC_HEADER_STDC
 
 dnl Checks for typedefs, structures, and compiler characteristics.

From bfb683bd2f8e5e21e462999d2e7a115cf46fdb8f Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Fri, 10 Sep 2021 03:58:09 -0400
Subject: [PATCH 32/50] Fixed bug in configure.ac related to checking for
 exp-verbs

---
 configure.ac | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index dab8d6a..34bc650 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,7 +190,8 @@ dnl Checks for Verbs support
 AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
     AC_MSG_ERROR([ibv_get_device_list() not found.  libgdsync requires libibverbs.]))
-AC_CHECK_LIB(ibverbs, ibv_exp_create_qp, [have_exp_verbs=1])
+dnl ibv_exp_create_qp is an inline function. So, we check for exp_cmd instead.
+AC_CHECK_LIB(ibverbs, ibv_exp_cmd_create_qp, [have_exp_verbs=1])
 AC_CHECK_HEADER([infiniband/peer_ops.h], [have_peer_ops=1], [],
 [[
 #include
 ]])
@@ -198,7 +199,7 @@ AC_CHECK_HEADER([infiniband/peer_ops.h], [have_peer_ops=1], [],
 
 if test "x$have_exp_verbs" != "x" && test "x$have_peer_ops" != "x"; then
     AC_DEFINE([HAVE_EXP_VERBS], [1], [Define if exp-verbs exists.])
-    $enable_exp_verbs=1
+    enable_exp_verbs=1
 else
     AC_MSG_WARN([This version of libgdsync cannot be used without exp-verbs.])
 fi

From e61a4057921fed209d9082906600774b6266ee1e Mon Sep 17 00:00:00 2001
From: Pak Markthub
Date: Thu, 14 Oct 2021 23:08:59 -0400
Subject: [PATCH 33/50] Defined gds_transport_t

---
 src/transport.hpp | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 src/transport.hpp

diff --git a/src/transport.hpp b/src/transport.hpp
new file mode 100644
index 0000000..ddef072
--- /dev/null
+++ b/src/transport.hpp
@@ -0,0 +1,63 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +typedef struct gds_transport { + int (*create_qp)(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp); + int (*destroy_qp)(gds_qp_t *gqp); + int (*rollback_qp)(gds_qp_t *gqp, gds_send_request_t *request); + + void (*init_send_info)(gds_send_request_t *request); + int (*post_send_ops)(gds_peer *peer, gds_send_request_t *request, gds_op_list_t &ops); + int (*post_send_ops_on_cpu)(gds_send_request_t *request, int flags = 0); + int (*prepare_send)(gds_qp_t *gqp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request); + uint32_t (*get_num_send_request_entries)(gds_send_request_t *request); + + void (*init_wait_request)(gds_wait_request_t *request, uint32_t offset); + void (*dump_wait_request)(gds_wait_request_t *request, size_t idx); + int (*stream_post_wait_descriptor)(gds_peer *peer, gds_wait_request_t *request, gds_op_list_t ¶ms, int flags); + int (*post_wait_descriptor)(gds_wait_request_t *request, int flags); + int (*get_wait_descs)(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *request); + uint32_t (*get_num_wait_request_entries)(gds_wait_request_t *request); + + int (*prepare_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, in flags); + int (*append_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, in flags); + int (*abort_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request); +} gds_transport_t; + +extern gds_transport_t *gds_main_transport; + +int gds_transport_mlx5_exp_init(gds_transport_t *transport); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ From 8720c8666cbfe40d4ace8c15095d0b30c9c6be86 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Thu, 14 Oct 2021 23:09:45 -0400 Subject: [PATCH 34/50] Moved mlx5-exp.* into transports/mlx5-exp and implemented gds_transport_mlx5_exp_init --- src/transports/mlx5-exp/.mlx5-exp.cpp.swp | Bin 0 -> 40960 bytes src/transports/mlx5-exp/.mlx5-exp.hpp.swp | Bin 0 -> 16384 bytes 
 src/{ => transports/mlx5-exp}/mlx5-exp.cpp | 30 +++++++++++++++++++++
 src/{ => transports/mlx5-exp}/mlx5-exp.hpp |  0
 4 files changed, 30 insertions(+)
 create mode 100644 src/transports/mlx5-exp/.mlx5-exp.cpp.swp
 create mode 100644 src/transports/mlx5-exp/.mlx5-exp.hpp.swp
 rename src/{ => transports/mlx5-exp}/mlx5-exp.cpp (94%)
 rename src/{ => transports/mlx5-exp}/mlx5-exp.hpp (100%)

diff --git a/src/transports/mlx5-exp/.mlx5-exp.cpp.swp b/src/transports/mlx5-exp/.mlx5-exp.cpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..09e0b7542379f4809868a318355bb10c5039cc58
GIT binary patch
literal 40960
[base85-encoded binary payload of an accidentally committed Vim swap file omitted]
[remainder of the base85 payload for .mlx5-exp.cpp.swp omitted]
literal 0
HcmV?d00001

diff --git a/src/transports/mlx5-exp/.mlx5-exp.hpp.swp b/src/transports/mlx5-exp/.mlx5-exp.hpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..64c77c8d59fee822a476d37dfa63f8ef615c2d7b
GIT binary patch
literal 16384
[base85-encoded binary payload of an accidentally committed Vim swap file omitted]
literal 0
HcmV?d00001

diff --git a/src/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp
similarity index 94%
rename from src/mlx5-exp.cpp
rename to src/transports/mlx5-exp/mlx5-exp.cpp
+int gds_transport_mlx5_exp_init(gds_transport_t *transport)
+{
+        transport->create_qp = gds_mlx5_exp_create_qp;
+        transport->destroy_qp = gds_mlx5_exp_destroy_qp;
+        transport->rollback_qp = gds_mlx5_exp_rollback_qp;
+
+        transport->init_send_info = gds_mlx5_exp_init_send_info;
+        transport->post_send_ops = gds_mlx5_exp_post_send_ops;
+        transport->post_send_ops_on_cpu = gds_mlx5_exp_post_send_ops_on_cpu;
+        transport->prepare_send = gds_mlx5_exp_prepare_send;
+        transport->get_num_send_request_entries = gds_mlx5_exp_get_num_send_request_entries;
+
+        transport->init_wait_request = gds_mlx5_exp_init_wait_request;
+        transport->dump_wait_request = gds_mlx5_exp_dump_wait_request;
+        transport->stream_post_wait_descriptor = gds_mlx5_exp_stream_post_wait_descriptor;
+        transport->post_wait_descriptor = gds_mlx5_exp_post_wait_descriptor;
+        transport->get_wait_descs = gds_mlx5_exp_get_wait_descs;
+        transport->get_num_wait_request_entries = gds_mlx5_exp_get_num_wait_request_entries;
+
+        transport->prepare_wait_cq = gds_mlx5_exp_prepare_wait_cq;
+        transport->append_wait_cq = 
gds_mlx5_exp_append_wait_cq; + transport->abort_wait_cq = gds_mlx5_exp_abort_wait_cq; + + return 0; +} + diff --git a/src/mlx5-exp.hpp b/src/transports/mlx5-exp/mlx5-exp.hpp similarity index 100% rename from src/mlx5-exp.hpp rename to src/transports/mlx5-exp/mlx5-exp.hpp From de7d52daa2025667c9e0652e606a1ad5a9418b27 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 15 Oct 2021 03:57:32 -0400 Subject: [PATCH 35/50] Fixed bugs and modified the code to use the gds_transport_t interface --- .gitignore | 3 + Makefile.am | 4 +- src/apis.cpp | 121 ++- src/gdsync.cpp | 471 +----------- src/mlx5.cpp | 120 +-- src/transport.hpp | 30 +- src/transports/mlx5-exp/.mlx5-exp.cpp.swp | Bin 40960 -> 0 bytes src/transports/mlx5-exp/.mlx5-exp.hpp.swp | Bin 16384 -> 0 bytes src/transports/mlx5-exp/mlx5-exp.cpp | 886 ++++++++++++++++++---- src/transports/mlx5-exp/mlx5-exp.hpp | 40 - src/utils.hpp | 19 +- 11 files changed, 884 insertions(+), 810 deletions(-) delete mode 100644 src/transports/mlx5-exp/.mlx5-exp.cpp.swp delete mode 100644 src/transports/mlx5-exp/.mlx5-exp.hpp.swp diff --git a/.gitignore b/.gitignore index 9a8cb0a..beb6022 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,6 @@ libtool # Debug files *.dSYM/ *.su + +# Editor files +*.swp diff --git a/Makefile.am b/Makefile.am index 84a8085..61e9a2d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -29,8 +29,8 @@ src_libgdsync_la_LDFLAGS = -version-info @VERSION_INFO@ noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h if COMPILE_EXP_VERBS -src_libgdsync_la_SOURCES += src/mlx5-exp.cpp -noinst_HEADERS += src/mlx5-exp.hpp +src_libgdsync_la_SOURCES += src/transports/mlx5-exp/mlx5-exp.cpp +noinst_HEADERS += src/transports/mlx5-exp/mlx5-exp.hpp endif # if enabled at configure time diff --git a/src/apis.cpp b/src/apis.cpp index 69e491c..ed0a1d5 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -50,63 +50,36 @@ #include "utils.hpp" #include "archutils.h" #include "mlnxutils.h" -#include "mlx5-exp.hpp" - - -//----------------------------------------------------------------------------- - -void gds_init_ops(struct peer_op_wr *op, int count) -{ - int i = count; - while (--i) - op[i-1].next = &op[i]; - op[count-1].next = NULL; -} +#include "transport.hpp" //----------------------------------------------------------------------------- static void gds_init_send_info(gds_send_request_t *info) { - gds_mlx5_exp_send_request_t *gmexp_info; gds_dbg("send_request=%p\n", info); memset(info, 0, sizeof(*info)); - info->dtype = GDS_DRIVER_TYPE_MLX5_EXP; - - gmexp_info = to_gds_mexp_send_request(info); - - gds_mlx5_exp_init_send_info(gmexp_info); + gds_main_transport->init_send_info(info); } //----------------------------------------------------------------------------- static void gds_init_wait_request(gds_wait_request_t *request, uint32_t offset) { - gds_mlx5_exp_wait_request_t *gmexp_request; gds_dbg("wait_request=%p offset=%08x\n", request, offset); memset(request, 0, sizeof(*request)); - request->dtype = GDS_DRIVER_TYPE_MLX5_EXP; - - gmexp_request = to_gds_mexp_wait_request(request); - - gds_mlx5_exp_init_wait_request(gmexp_request, offset); + gds_main_transport->init_wait_request(request, offset); } //----------------------------------------------------------------------------- static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t *send_info) { - gds_mlx5_exp_qp_t *gmexpqp; - gds_mlx5_exp_send_request_t *gmexp_sreq; - assert(qp); assert(send_info); - gmexpqp = to_gds_mexp_qp(qp); - 
gmexp_sreq = to_gds_mexp_send_request(send_info);
-
-        return gds_mlx5_exp_rollback_qp(gmexpqp, gmexp_sreq);
+        return gds_main_transport->rollback_qp(qp, send_info);
 }
 
 //-----------------------------------------------------------------------------
@@ -162,20 +135,14 @@ int gds_prepare_send(struct gds_qp *gqp, gds_send_wr *p_ewr,
                 gds_send_request_t *request)
 {
         int ret = 0;
-        gds_mlx5_exp_qp_t *gmexpqp;
-        gds_mlx5_exp_send_request_t *sreq;
 
         gds_init_send_info(request);
 
         assert(gqp);
         assert(gqp->qp);
-        assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
-
-        gmexpqp = to_gds_mexp_qp(gqp);
-        sreq = to_gds_mexp_send_request(request);
 
-        ret = gds_mlx5_exp_prepare_send(gmexpqp, p_ewr, bad_ewr, sreq);
+        ret = gds_main_transport->prepare_send(gqp, p_ewr, bad_ewr, request);
         if (ret)
-                gds_err("Error %d in gds_mlx5_exp_prepare_send.\n", ret);
+                gds_err("Error %d in prepare_send.\n", ret);
 
         return ret;
 }
@@ -260,9 +227,6 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req
 
 int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags)
 {
-        gds_mlx5_exp_cq_t *gmexpcq;
-        gds_mlx5_exp_wait_request_t *gmexp_request;
-
         if (flags != 0) {
                 gds_err("invalid flags != 0\n");
                 return EINVAL;
@@ -270,19 +234,23 @@ int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flag
 
         gds_init_wait_request(request, cq->curr_offset++);
 
-        gmexpcq = to_gds_mexp_cq(cq);
-        gmexp_request = to_gds_mexp_wait_request(request);
-
-        return gds_mlx5_exp_prepare_wait_cq(gmexpcq, gmexp_request, flags);
+        return gds_main_transport->prepare_wait_cq(cq, request, flags);
 }
 
 //-----------------------------------------------------------------------------
 
 int gds_append_wait_cq(gds_wait_request_t *request, uint32_t *dw, uint32_t val)
 {
-        gds_mlx5_exp_wait_request_t *gmexp_request = to_gds_mexp_wait_request(request);
+        int ret = gds_transport_init();
+        if (ret) {
+                gds_err("error in gds_transport_init\n");
+                goto out;
+        }
 
-        return gds_mlx5_exp_append_wait_cq(gmexp_request, dw, val);
+        ret = gds_main_transport->append_wait_cq(request, dw, val);
+
+out:
+        return ret;
 }
 
 //-----------------------------------------------------------------------------
@@ -303,16 +271,10 @@ int gds_stream_post_wait_cq_all(CUstream stream, int count, gds_wait_request_t *
 
 static int gds_abort_wait_cq(struct gds_cq *cq, gds_wait_request_t *request)
 {
-        gds_mlx5_exp_cq_t *gmexpcq;
-        gds_mlx5_exp_wait_request_t *gmexp_request;
-
         assert(cq);
         assert(request);
 
-        gmexpcq = to_gds_mexp_cq(cq);
-        gmexp_request = to_gds_mexp_wait_request(request);
-
-        return gds_mlx5_exp_abort_wait_cq(gmexpcq, gmexp_request);
+        return gds_main_transport->abort_wait_cq(cq, request);
 }
 
 //-----------------------------------------------------------------------------
@@ -503,14 +465,21 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem
         int ret = 0;
         n_mem_ops = 0;
         size_t i;
-        for(i = 0; i < n_descs; ++i) {
+
+        ret = gds_transport_init();
+        if (ret) {
+                gds_err("error in gds_transport_init\n");
+                goto out;
+        }
+
+        for (i = 0; i < n_descs; ++i) {
                 gds_descriptor_t *desc = descs + i;
-                switch(desc->tag) {
+                switch (desc->tag) {
                 case GDS_TAG_SEND:
-                        n_mem_ops += gds_mlx5_exp_get_num_send_request_entries(to_gds_mexp_send_request(desc->send)) + 2; // extra space, ugly
+                        n_mem_ops += gds_main_transport->get_num_send_request_entries(desc->send) + 2; // extra space, ugly
                         break;
                 case GDS_TAG_WAIT:
-                        n_mem_ops += gds_mlx5_exp_get_num_wait_request_entries(to_gds_mexp_wait_request(desc->wait)) + 2; // ditto
+                        n_mem_ops += gds_main_transport->get_num_wait_request_entries(desc->wait) + 2; // ditto
                        break;
                 case GDS_TAG_WAIT_VALUE32:
                 case GDS_TAG_WRITE_VALUE32:
@@ -522,6 +491,8 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem
                         ret = EINVAL;
                 }
         }
+
+out:
         return ret;
 }
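/*
 * The conversion in the remaining hunks follows one recurring two-step
 * pattern, sketched here once (assumption: gds_transport_init() returns 0 on
 * success, as the transport.hpp hunk later in this patch shows; some_op is a
 * stand-in for any vtable member):
 *
 *         int ret = gds_transport_init();
 *         if (ret)
 *                 return ret;                       // no transport available
 *         return gds_main_transport->some_op(...);  // vtable dispatch
 */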
@@ -538,6 +509,11 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_
         gds_peer *peer = NULL;
         gds_op_list_t params;
 
+        ret = gds_transport_init();
+        if (ret) {
+                gds_err("error in gds_transport_init\n");
+                goto out;
+        }
 
         ret = calc_n_mem_ops(n_descs, descs, n_mem_ops);
         if (ret) {
@@ -565,12 +541,11 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_
                 return EINVAL;
         }
 
-        for(i = 0; i < n_descs; ++i) {
+        for (i = 0; i < n_descs; ++i) {
                 gds_descriptor_t *desc = descs + i;
-                switch(desc->tag) {
+                switch (desc->tag) {
                 case GDS_TAG_SEND: {
-                        gds_mlx5_exp_send_request_t *sreq = to_gds_mexp_send_request(desc->send);
-                        retcode = gds_mlx5_exp_post_send_ops(peer, sreq, params);
+                        retcode = gds_main_transport->post_send_ops(peer, desc->send, params);
                         if (retcode) {
                                 gds_err("error %d in gds_post_ops\n", retcode);
                                 ret = retcode;
@@ -579,15 +554,14 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_
                         break;
                 }
                 case GDS_TAG_WAIT: {
-                        gds_mlx5_exp_wait_request_t *wreq = to_gds_mexp_wait_request(desc->wait);
                         int flags = 0;
                         if (move_flush && i != last_wait) {
                                 gds_dbg("discarding FLUSH!\n");
                                 flags = GDS_POST_OPS_DISCARD_WAIT_FLUSH;
                         }
-                        retcode = gds_mlx5_exp_stream_post_wait_descriptor(peer, wreq, params, flags);
+                        retcode = gds_main_transport->stream_post_wait_descriptor(peer, desc->wait, params, flags);
                         if (retcode) {
-                                gds_err("error %d in gds_mlx5_exp_stream_post_wait_descriptor\n", retcode);
+                                gds_err("error %d in stream_post_wait_descriptor\n", retcode);
                                 ret = retcode;
                                 goto out;
                         }
@@ -642,13 +616,19 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags)
         size_t i;
         int ret = 0;
         int retcode = 0;
+
+        ret = gds_transport_init();
+        if (ret) {
+                gds_err("error in gds_transport_init\n");
+                goto out;
+        }
+
         for(i = 0; i < n_descs; ++i) {
                 gds_descriptor_t *desc = descs + i;
                 switch(desc->tag) {
                 case GDS_TAG_SEND: {
                         gds_dbg("desc[%zu] SEND\n", i);
-                        gds_mlx5_exp_send_request_t *sreq = to_gds_mexp_send_request(desc->send);
-                        retcode = gds_mlx5_exp_post_send_ops_on_cpu(sreq, flags);
+                        retcode = gds_main_transport->post_send_ops_on_cpu(desc->send, flags);
                         if (retcode) {
                                 gds_err("error %d in gds_post_ops_on_cpu\n", retcode);
                                 ret = retcode;
@@ -658,10 +638,9 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags)
                         }
                 case GDS_TAG_WAIT: {
                         gds_dbg("desc[%zu] WAIT\n", i);
-                        gds_mlx5_exp_wait_request_t *wreq = to_gds_mexp_wait_request(desc->wait);
-                        retcode = gds_mlx5_exp_post_wait_descriptor(wreq, flags);
+                        retcode = gds_main_transport->post_wait_descriptor(desc->wait, flags);
                         if (retcode) {
-                                gds_err("error %d in gds_mlx5_exp_post_wait_descriptor\n", retcode);
+                                gds_err("error %d in post_wait_descriptor\n", retcode);
                                 ret = retcode;
                                 goto out;
                         }
diff --git a/src/gdsync.cpp b/src/gdsync.cpp
index b01cd36..e9fafc8 100644
--- a/src/gdsync.cpp
+++ b/src/gdsync.cpp
@@ -43,7 +43,11 @@
 #include "archutils.h"
 #include "mlnxutils.h"
 #include "task_queue.hpp"
-#include "mlx5-exp.hpp"
+#include "transport.hpp"
+
+//-----------------------------------------------------------------------------
+
+gds_transport_t *gds_main_transport = NULL;
 
 //-----------------------------------------------------------------------------
 
@@ -93,10
+97,6 @@ int gds_flusher_enabled() #define CU_STREAM_BATCH_MEM_OP_RELAXED_ORDERING 0x1 #endif -// TODO: use correct value -// TODO: make it dependent upon the particular GPU -const size_t GDS_GPU_MAX_INLINE_SIZE = 256; - //----------------------------------------------------------------------------- // Note: these are default overrides, i.e. allow to disable/enable the features @@ -173,7 +173,7 @@ static bool gds_enable_inlcpy() } // simulate 64-bits writes with inlcpy -static bool gds_simulate_write64() +bool gds_simulate_write64() { static int gds_simulate_write64 = -1; if (-1 == gds_simulate_write64) { @@ -349,7 +349,7 @@ int gds_fill_membar(gds_peer *peer, gds_op_list_t &ops, int flags) //----------------------------------------------------------------------------- -static int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags) +int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags) { int retcode = 0; #if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_MEMORY @@ -410,7 +410,7 @@ int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, void *ptr, const void *d //----------------------------------------------------------------------------- -static void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) +void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) { #if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_MEMORY assert(param->operation == CU_STREAM_MEM_OP_WRITE_MEMORY); @@ -420,7 +420,7 @@ static void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param) //----------------------------------------------------------------------------- -static int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint32_t value, int flags) +int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint32_t value, int flags) { int retcode = 0; CUdeviceptr dev_ptr = addr; @@ -467,7 +467,7 @@ int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, uint32_t *ptr, uint32_t va //----------------------------------------------------------------------------- -static int gds_fill_poke64(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags) +int gds_fill_poke64(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags) { int retcode = 0; #if HAVE_DECL_CU_STREAM_MEM_OP_WRITE_VALUE_64 @@ -581,7 +581,7 @@ unsigned poll_checker::m_global_index = 0; //----------------------------------------------------------------------------- -static int gds_fill_poll(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags) +int gds_fill_poll(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags) { int retcode = 0; const char *cond_str = NULL; @@ -718,270 +718,6 @@ int gds_stream_batch_ops(gds_peer *peer, CUstream stream, gds_op_list_t &ops, in //----------------------------------------------------------------------------- -/* - A) plain+membar: - WR32 - MEMBAR - WR32 - WR32 - - B) plain: - WR32 - WR32+PREBARRIER - WR32 - - C) sim64+membar: - WR32 - MEMBAR - INLCPY 8B - - D) sim64: - INLCPY 4B + POSTBARRIER - INLCPY 8B - - E) inlcpy+membar: - WR32 - MEMBAR - INLCPY XB - - F) inlcpy: - INLCPY 4B + POSTBARRIER - INLCPY 128B -*/ - -int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t &ops, int post_flags) -{ - int retcode = 0; - size_t n = 0; - bool prev_was_fence = false; - bool use_inlcpy_for_dword = false; - //size_t 
n_ops = ops.size(); - CUstreamBatchMemOpParams param; - - gds_dbg("n_ops=%zu\n", n_ops); - - if (!peer->has_memops) { - gds_err("CUDA MemOps are required\n"); - return EINVAL; - } - - // divert the request to the same engine handling 64bits - // to avoid out-of-order execution - // caveat: can't use membar if inlcpy is used for 4B writes (to simulate 8B writes) - if (peer->has_inlcpy) { - if (!peer->has_membar) - use_inlcpy_for_dword = true; // F - } - if (gds_simulate_write64()) { - if (!peer->has_membar) { - gds_warn_once("enabling use_inlcpy_for_dword\n"); - use_inlcpy_for_dword = true; // D - } - } - - for (; op && n < n_ops; op = op->next, ++n) { - //int flags = 0; - gds_dbg("op[%zu] type:%08x\n", n, op->type); - switch(op->type) { - case GDS_PEER_OP_FENCE: { - gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - - if (fence_op == GDS_PEER_FENCE_OP_READ) { - gds_dbg("nothing to do for read fences\n"); - //retcode = EINVAL; - break; - } - else { - if (!peer->has_membar) { - if (use_inlcpy_for_dword) { - assert(ops.size() > 0); - gds_dbg("patching previous param\n"); - gds_enable_barrier_for_inlcpy(&ops.back()); - } - else { - gds_dbg("recording fence event\n"); - prev_was_fence = true; - } - //retcode = 0; - } - else { - if (fence_from != GDS_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from fence\n"); - retcode = EINVAL; - break; - } - int flags = 0; - if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { - gds_dbg("using light membar\n"); - flags = GDS_MEMBAR_DEFAULT | GDS_MEMBAR_MLX5; - } - else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { - gds_dbg("using heavy membar\n"); - flags = GDS_MEMBAR_SYS | GDS_MEMBAR_MLX5; - } - else { - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - retcode = gds_fill_membar(peer, ops, flags); - } - } - break; - } - case GDS_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - int flags = 0; - gds_dbg("OP_STORE_DWORD dev_ptr=%llx data=%" PRIx32 "\n", dev_ptr, data); - if (use_inlcpy_for_dword) { // F || D - // membar may be out of order WRT inlcpy - if (peer->has_membar) { - gds_err("invalid feature combination, inlcpy + membar\n"); - retcode = EINVAL; - break; - } - // tail flush is set when following fence is met - // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; - retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); - } - else { // A || B || C || E - // can't guarantee ordering of write32+inlcpy unless - // a membar is there - // TODO: fix driver when !weak - if (peer->has_inlcpy && !peer->has_membar) { - gds_err("invalid feature combination, inlcpy needs membar\n"); - retcode = EINVAL; - break; - } - if (prev_was_fence) { - gds_dbg("using PRE_BARRIER as fence\n"); - flags |= GDS_WRITE_PRE_BARRIER; - prev_was_fence = false; - } - retcode = gds_fill_poke(peer, ops, dev_ptr, data, flags); - } - break; - } - case GDS_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - uint64_t data = op->wr.qword_va.data; - int flags = 0; - gds_dbg("OP_STORE_QWORD dev_ptr=%llx data=%" PRIx64 "\n", dev_ptr, data); - // C || D - if 
(gds_simulate_write64()) {
-                        // simulate 64-bit poke by inline copy
-                        if (!peer->has_membar) {
-                                gds_err("invalid feature combination, inlcpy needs membar\n");
-                                retcode = EINVAL;
-                                break;
-                        }
-
-                        // tail flush is never useful here
-                        //flags |= GDS_IMMCOPY_POST_TAIL_FLUSH;
-                        retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags);
-                }
-                else if (peer->has_write64) {
-                        retcode = gds_fill_poke64(peer, ops, dev_ptr, data, flags);
-                }
-                else {
-                        uint32_t datalo = gds_qword_lo(op->wr.qword_va.data);
-                        uint32_t datahi = gds_qword_hi(op->wr.qword_va.data);
-
-                        if (prev_was_fence) {
-                                gds_dbg("enabling PRE_BARRIER\n");
-                                flags |= GDS_WRITE_PRE_BARRIER;
-                                prev_was_fence = false;
-                        }
-                        retcode = gds_fill_poke(peer, ops, dev_ptr, datalo, flags);
-
-                        // get rid of the barrier, if there
-                        flags &= ~GDS_WRITE_PRE_BARRIER;
-
-                        // advance to next DWORD
-                        dev_ptr += sizeof(uint32_t);
-                        retcode = gds_fill_poke(peer, ops, dev_ptr, datahi, flags);
-                }
-
-                break;
-        }
-        case GDS_PEER_OP_COPY_BLOCK: {
-                CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr +
-                        op->wr.copy_op.offset;
-                size_t len = op->wr.copy_op.len;
-                void *src = op->wr.copy_op.src;
-                int flags = 0;
-                gds_dbg("OP_COPY_BLOCK dev_ptr=%llx src=%p len=%zu\n", dev_ptr, src, len);
-                // catching any other size here
-                if (!peer->has_inlcpy) {
-                        gds_err("inline copy is not supported\n");
-                        retcode = EINVAL;
-                        break;
-                }
-                // IB Verbs bug
-                assert(len <= GDS_GPU_MAX_INLINE_SIZE);
-                //if (desc->need_flush) {
-                //        flags |= GDS_IMMCOPY_POST_TAIL_FLUSH;
-                //}
-                retcode = gds_fill_inlcpy(peer, ops, dev_ptr, src, len, flags);
-                break;
-        }
-        case GDS_PEER_OP_POLL_AND_DWORD:
-        case GDS_PEER_OP_POLL_GEQ_DWORD:
-        case GDS_PEER_OP_POLL_NOR_DWORD: {
-                int poll_cond;
-                CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr +
-                        op->wr.dword_va.offset;
-                uint32_t data = op->wr.dword_va.data;
-                // TODO: properly handle a following fence instead of blidly flushing
-                int flags = 0;
-                if (!(post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH))
-                        flags |= GDS_WAIT_POST_FLUSH_REMOTE;
-
-                gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%" PRIx32 " type=%" PRIx32 "\n", dev_ptr, data, (uint32_t)op->type);
-
-                switch(op->type) {
-                case GDS_PEER_OP_POLL_NOR_DWORD:
-                        poll_cond = GDS_WAIT_COND_NOR;
-                        break;
-                case GDS_PEER_OP_POLL_GEQ_DWORD:
-                        poll_cond = GDS_WAIT_COND_GEQ;
-                        break;
-                case GDS_PEER_OP_POLL_AND_DWORD:
-                        poll_cond = GDS_WAIT_COND_AND;
-                        break;
-                default:
-                        assert(!"cannot happen");
-                        retcode = EINVAL;
-                        goto out;
-                }
-                retcode = gds_fill_poll(peer, ops, dev_ptr, data, poll_cond, flags);
-                break;
-        }
-        default:
-                gds_err("undefined peer op type %d\n", op->type);
-                retcode = EINVAL;
-                break;
-        }
-        if (retcode) {
-                gds_err("error in fill func at entry n=%zu\n", n);
-                goto out;
-        }
-        }
-
-        assert(n_ops == n);
-
-out:
-        return retcode;
-}
-
-//-----------------------------------------------------------------------------
-
 int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_t *dw, uint32_t val)
 {
         int retcode = 0;
@@ -997,9 +733,8 @@
         }
 
         for (int j=0; j<count; j++) {
-                gds_mlx5_exp_send_request_t *sreq = to_gds_mexp_send_request(&info[j]);
-                retcode = gds_mlx5_exp_post_send_ops(peer, sreq, ops);
+                retcode = gds_main_transport->post_send_ops(peer, &info[j], ops);
                 if (retcode) {
                         goto out;
                 }
@@ -1026,124 +761,6 @@ int gds_post_pokes(CUstream stream, int count, gds_send_request_t *info, uint32_
 
 //-----------------------------------------------------------------------------
 
-int gds_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags)
-{
-        int retcode = 0;
-        size_t n = 0;
-        gds_dbg("n_ops=%zu op=%p 
post_flags=0x%x\n", n_ops, op, post_flags); - for (; op && n < n_ops; op = op->next, ++n) { - //int flags = 0; - gds_dbg("op[%zu]=%p\n", n, op); - //gds_dbg("op[%zu]=%p type:%08x\n", n, op, op->type); - switch(op->type) { - case GDS_PEER_OP_FENCE: { - gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - - if (fence_op == GDS_PEER_FENCE_OP_READ) { - gds_warnc(1, "nothing to do for read fences\n"); - //retcode = EINVAL; - break; - } - else { - if (fence_from != GDS_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); - retcode = EINVAL; - break; - } - if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { - gds_dbg("using light membar\n"); - wmb(); - } - else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { - gds_dbg("using heavy membar\n"); - wmb(); - } - else { - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - } - break; - } - case GDS_PEER_OP_STORE_DWORD: { - uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); - uint32_t data = op->wr.dword_va.data; - // A || B || C || E - gds_dbg("STORE_DWORD ptr=%p data=%08" PRIx32 "\n", ptr, data); - gds_atomic_set(ptr, data); - break; - } - case GDS_PEER_OP_STORE_QWORD: { - uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); - uint64_t data = op->wr.qword_va.data; - gds_dbg("STORE_QWORD ptr=%p data=%016" PRIx64 "\n", ptr, data); - gds_atomic_set(ptr, data); - break; - } - case GDS_PEER_OP_COPY_BLOCK: { - uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); - uint64_t *src = (uint64_t*)op->wr.copy_op.src; - size_t n_bytes = op->wr.copy_op.len; - gds_dbg("COPY_BLOCK ptr=%p src=%p len=%zu\n", ptr, src, n_bytes); - gds_bf_copy(ptr, src, n_bytes); - break; - } - case GDS_PEER_OP_POLL_AND_DWORD: - case GDS_PEER_OP_POLL_GEQ_DWORD: - case GDS_PEER_OP_POLL_NOR_DWORD: { - int poll_cond; - uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); - uint32_t value = op->wr.dword_va.data; - bool flush = true; - if (post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH) - flush = false; - gds_dbg("WAIT_32 dev_ptr=%p data=%" PRIx32 " type=%" PRIx32 "\n", ptr, value, (uint32_t)op->type); - bool done = false; - do { - uint32_t data = gds_atomic_get(ptr); - switch(op->type) { - case GDS_PEER_OP_POLL_NOR_DWORD: - done = (0 != ~(data | value)); - break; - case GDS_PEER_OP_POLL_GEQ_DWORD: - done = ((int32_t)data - (int32_t)value >= 0); - break; - case GDS_PEER_OP_POLL_AND_DWORD: - done = (0 != (data & value)); - break; - default: - gds_err("invalid op type %02x\n", op->type); - retcode = EINVAL; - goto out; - } - if (done) - break; - // TODO: more aggressive CPU relaxing needed here to avoid starving I/O fabric - arch_cpu_relax(); - } while(true); - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - if (retcode) { - gds_err("error %d at entry n=%zu\n", retcode, n); - goto out; - } - } - -out: - return retcode; -} - -//----------------------------------------------------------------------------- - int gds_post_pokes_on_cpu(int 
count, gds_send_request_t *info, uint32_t *dw, uint32_t val)
 {
         int retcode = 0;
@@ -1152,9 +769,8 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin
         assert(info);
 
         for (int j=0; j<count; j++) {
-                gds_mlx5_exp_send_request_t *sreq = to_gds_mexp_send_request(&info[j]);
-                retcode = gds_mlx5_exp_post_send_ops_on_cpu(sreq, 0);
+                retcode = gds_main_transport->post_send_ops_on_cpu(&info[j], 0);
                 if (retcode) {
                         goto out;
                 }
@@ -1174,12 +790,10 @@ int gds_post_pokes_on_cpu(int count, gds_send_request_t *info, uint32_t *dw, uin
 void gds_dump_wait_request(gds_wait_request_t *request, size_t count)
 {
         for (size_t j = 0; j < count; ++j) {
-                gds_mlx5_exp_wait_request_t *gmexp_request;
                 if (count == 0)
                         return;
 
-                gmexp_request = to_gds_mexp_wait_request(&request[j]);
-                gds_mlx5_exp_dump_wait_request(gmexp_request, j);
+                gds_main_transport->dump_wait_request(&request[j], j);
         }
 }
 
@@ -1628,7 +1242,7 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context,
                 gds_qp_init_attr_t *qp_attr, int gpu_id, int flags)
 {
         int ret = 0;
-        gds_mlx5_exp_qp_t *gmexpqp = NULL;
+        gds_qp_t *gqp = NULL;
         gds_peer *peer = NULL;
         gds_peer_attr *peer_attr = NULL;
         gds_driver_type dtype;
@@ -1645,6 +1259,12 @@
                 return NULL;
         }
 
+        ret = gds_transport_init();
+        if (ret) {
+                gds_err("error in gds_transport_init\n");
+                goto err;
+        }
+
         // peer registration
         gds_dbg("before gds_register_peer_ex\n");
         ret = gds_register_peer_by_ordinal(gpu_id, &peer, &peer_attr);
@@ -1653,64 +1273,33 @@
                 goto err;
         }
 
-        dtype = gds_get_driver_type(context->device);
-        if (dtype != GDS_DRIVER_TYPE_MLX5_EXP) {
-                gds_err("Unsupported IB device\n");
-                goto err;
-        }
-
-        gmexpqp = gds_mlx5_exp_create_qp(pd, context, qp_attr, peer, peer_attr, flags);
-        if (!gmexpqp) {
-                gds_err("Error in gds_mlx5_exp_create_qp.\n");
+        ret = gds_main_transport->create_qp(pd, context, qp_attr, peer, peer_attr, flags, &gqp);
+        if (ret) {
+                gds_err("Error in create_qp.\n");
                 goto err;
         }
 
-        gds_dbg("created gds_qp=%p\n", gmexpqp->gqp);
+        gds_dbg("created gds_qp=%p\n", gqp);
 
-        return &gmexpqp->gqp;
+        return gqp;
 
 err:
         return NULL;
 }
 
-//-----------------------------------------------------------------------------
-
-int gds_destroy_cq(struct gds_cq *gcq)
-{
-        int retcode = 0;
-        int ret;
-
-        if (!gcq)
-                return retcode;
-
-        // Currently, we support only exp-verbs.
-        assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP);
-
-        gds_mlx5_exp_cq_t *gmexpcq = to_gds_mexp_cq(gcq);
-
-        retcode = gds_mlx5_exp_destroy_cq(gmexpcq);
-
-        return retcode;
-}
 
 //-----------------------------------------------------------------------------
 
 int gds_destroy_qp(struct gds_qp *gqp)
 {
-        int retcode = 0;
-        int ret;
+        int ret = 0;
 
         if (!gqp)
-                return retcode;
-
-        // Currently, we support only exp-verbs.
- assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP); + return ret; - gds_mlx5_exp_qp_t *gmexpqp = to_gds_mexp_qp(gqp); + ret = gds_main_transport->destroy_qp(gqp); - retcode = gds_mlx5_exp_destroy_qp(gmexpqp); - - return retcode; + return ret; } //----------------------------------------------------------------------------- diff --git a/src/mlx5.cpp b/src/mlx5.cpp index c67a2f6..94872cc 100644 --- a/src/mlx5.cpp +++ b/src/mlx5.cpp @@ -40,7 +40,7 @@ //#include "mem.hpp" #include "objs.hpp" #include "utils.hpp" -#include "mlx5-exp.hpp" +#include "transport.hpp" #if 0 union { uint64_t qw; uint32_t dw[2]; } db_val; @@ -50,123 +50,21 @@ mlx5_i->db_value = db_val.qw; #endif -//----------------------------------------------------------------------------- - -int gds_mlx5_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *_request) -{ - int retcode = 0; - const gds_mlx5_exp_send_request_t *request = to_gds_mexp_send_request(_request); - size_t n_ops = request->commit.entries; - peer_op_wr *op = request->commit.storage; - size_t n = 0; - - memset(mlx5_i, 0, sizeof(*mlx5_i)); - - for (; op && n < n_ops; op = op->next, ++n) { - switch(op->type) { - case GDS_PEER_OP_FENCE: { - gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); - uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); - uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); - uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); - if (fence_op == GDS_PEER_FENCE_OP_READ) { - gds_dbg("nothing to do for read fences\n"); - break; - } - if (fence_from != GDS_PEER_FENCE_FROM_HCA) { - gds_err("unexpected from fence\n"); - retcode = EINVAL; - break; - } - if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { - gds_dbg("using light membar\n"); - mlx5_i->membar = 1; - } - else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { - gds_dbg("using heavy membar\n"); - mlx5_i->membar_full = 1; - } - else { - gds_err("unsupported fence combination\n"); - retcode = EINVAL; - break; - } - break; - } - case GDS_PEER_OP_STORE_DWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + - op->wr.dword_va.offset; - uint32_t data = op->wr.dword_va.data; - gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", (uint64_t)dev_ptr, data); - if (n != 0) { - gds_err("store DWORD is not 1st op\n"); - retcode = EINVAL; - break; - } - mlx5_i->dbrec_ptr = (uint32_t*)dev_ptr; - mlx5_i->dbrec_value = data; - break; - } - case GDS_PEER_OP_STORE_QWORD: { - CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + - op->wr.qword_va.offset; - uint64_t data = op->wr.qword_va.data; - gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); - if (n != 2) { - gds_err("store QWORD is not 3rd op\n"); - retcode = EINVAL; - break; - } - mlx5_i->db_ptr = (uint64_t*)dev_ptr; - mlx5_i->db_value = data; - break; - } - case GDS_PEER_OP_COPY_BLOCK: { - CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + - op->wr.copy_op.offset; - size_t len = op->wr.copy_op.len; - void *src = op->wr.copy_op.src; - gds_dbg("send inline detected\n"); - if (len < 8 || len > 64) { - gds_err("unexpected len %zu\n", len); - retcode = EINVAL; - break; - } - mlx5_i->db_ptr = (uint64_t*)dev_ptr; - mlx5_i->db_value = *(uint64_t*)src; - break; - } - case GDS_PEER_OP_POLL_AND_DWORD: - case GDS_PEER_OP_POLL_GEQ_DWORD: - case GDS_PEER_OP_POLL_NOR_DWORD: { - 
gds_err("unexpected polling op in send request\n"); - retcode = EINVAL; - break; - } - default: - gds_err("undefined peer op type %d\n", op->type); - retcode = EINVAL; - break; - } - - if (retcode) { - gds_err("error in fill func at entry n=%zu\n", n); - break; - } - } - return retcode; -} //----------------------------------------------------------------------------- int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_mlx5_send_info_t *mlx5_infos) { - int retcode = 0; + int retcode = gds_transport_init(); + if (retcode) { + gds_err("error in gds_transport_init\n"); + goto out; + } for (int j=0; jget_send_descs(mlx5_i, request); if (retcode) { gds_err("error %d while retrieving descriptors for %dth request\n", retcode, j); break; @@ -175,6 +73,7 @@ int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_ml mlx5_i->dbrec_ptr, mlx5_i->dbrec_value, mlx5_i->db_ptr, mlx5_i->db_value); } +out: return retcode; } @@ -182,8 +81,7 @@ int gds_mlx5_get_send_info(int count, const gds_send_request_t *requests, gds_ml int gds_mlx5_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *request) { - const gds_mlx5_exp_wait_request_t *gmexp_request = to_gds_mexp_wait_request(request); - return gds_mlx5_exp_get_wait_descs(mlx5_i, gmexp_request); + return gds_main_transport->get_wait_descs(mlx5_i, request); } //----------------------------------------------------------------------------- diff --git a/src/transport.hpp b/src/transport.hpp index ddef072..d14d9e3 100644 --- a/src/transport.hpp +++ b/src/transport.hpp @@ -27,6 +27,10 @@ #pragma once +#include +#include +#include + typedef struct gds_transport { int (*create_qp)(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp); int (*destroy_qp)(gds_qp_t *gqp); @@ -34,8 +38,9 @@ typedef struct gds_transport { void (*init_send_info)(gds_send_request_t *request); int (*post_send_ops)(gds_peer *peer, gds_send_request_t *request, gds_op_list_t &ops); - int (*post_send_ops_on_cpu)(gds_send_request_t *request, int flags = 0); + int (*post_send_ops_on_cpu)(gds_send_request_t *request, int flags); int (*prepare_send)(gds_qp_t *gqp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request); + int (*get_send_descs)(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *_request); uint32_t (*get_num_send_request_entries)(gds_send_request_t *request); void (*init_wait_request)(gds_wait_request_t *request, uint32_t offset); @@ -45,14 +50,31 @@ typedef struct gds_transport { int (*get_wait_descs)(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *request); uint32_t (*get_num_wait_request_entries)(gds_wait_request_t *request); - int (*prepare_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, in flags); - int (*append_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, in flags); + int (*prepare_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, int flags); + int (*append_wait_cq)(gds_wait_request_t *request, uint32_t *dw, uint32_t val); int (*abort_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request); } gds_transport_t; extern gds_transport_t *gds_main_transport; -int gds_transport_mlx5_exp_init(gds_transport_t *transport); +int gds_transport_mlx5_exp_init(gds_transport_t **transport); + +static inline int gds_transport_init() +{ + int status = 0; + if (!gds_main_transport) { + gds_transport_t *t = NULL; + status = gds_transport_mlx5_exp_init(&t); + if (status) { + gds_err("error in 
gds_transport_mlx5_exp_init\n"); + goto out; + } + assert(t); + gds_main_transport = t; + } +out: + return status; +} /* * Local variables: diff --git a/src/transports/mlx5-exp/.mlx5-exp.cpp.swp b/src/transports/mlx5-exp/.mlx5-exp.cpp.swp deleted file mode 100644 index 09e0b7542379f4809868a318355bb10c5039cc58..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40960 zcmeI53v^{wd4R8#hoihKJ}cHkV8&#~WM<@LGh`r40z_t#OeVvCikF*vPm;?d_ukxl zGs!SKl-kuUDX2?%)S^;R)CDdpw4zAWiqfhT-;XL#T3^*lwUr`>O22=f{W|BKo5wga zW!amxzMSNoefHVs-_QTw|K4Y8-FX9Bf|qx#%kXnlCUfo2CUZBQ_^D_A(U&rr%5<)} zvpzXHCf64DnJP~dOY0X1pf6o+BV0RNEnghw>$SC0#j%M(ZFecZrm$;mt(sq3ujWd% zO1a9l>8bgX*M##G`$t#4Qd#^ltq?5c_Pm8yx-n+fzfePPQb_ zl0ZuWEeW(F(2_t)0xb!&B+!z;@1_Lmm1ktG;MIOlzuuwx__W0FApL)+kIzV4H~$y) z`KiS96^Zj-Ph3A(|G!xudvyX#Iu6n2C-w2_#C4PIv-I`bkNuVWxmIQt$C6Fs*GHZ^^WI~Ynf1^kLg#Yh@ z+u&As3tSEoZWSC4hrvTfWHR@_N8uxICG^22`1RqL%qQUEkcB>Y2K?qQuED>-L+~%~ zMz{v@a56mptW4%@uoH@~5ncp`z`^j7XJ#_rfRDjlFb&(`cz6~(4Knae3=e+;cfdUK z!oOpzxE?NpQ{XW80x$kfxEB5#reOvA2*bmhVIE2_4X41#@M3ro`~iFp1IB0J9dHgj z3s%4*7*0M5?}ZT7z!7jbJdRP~A@~RQ5L^e>!a1-Gp2SA+1GpdF1Mh};n1gk2B>a{)up!1U}752ikZrqjqXy%hKz}!I3Bd8 zg|GoqE(Se;JXO3lvc#RuSHoOA%)0p7^W{=KoUiko#&{-XDqV$eS5}81GW}Q6!1djg zkYH!`ZOitJ44l_Lk{!KZxIepP^Z6%d`_CVCFLRlFn!H{$tj|_U!Ql4In_UpOAc+sF z)%Lc0u2d@5gWS|qInNse)8TZvx*Mq_1azzxLPxMCBN51tK1I6PW5SrJWb47|j=YMr zeSK$8D_#p^jBm&$0KRxB0k>XCuuynZ}4?pfD)nY%A0k^GEEq&lBf zBFoPZTJoebSY1z^b1`^No~Z<@A-c~cy33PzrpsGW8i-=uhRmLfk%+_~5)W3( zZ)%GEKYBlPQvKN(NCe|kxrthbz1d4(r9cb?a@tAD6zkRTUq5Lw9 z7JpCDc7#*>QJ$`3cj4kzrUb1@RK`WBR}(R5@%t?{sdLS%s{t=ePUdae zs&d9fnaC6Gg4dj?w@}Srp;$$ls^Zm1O}kgCq{&H!p{-4AHf!}SNkGSs~#n5o~({8Relg%Jxz1g-Mwk67uj;FQQUdLh zPSW*-HaB8Dg2^IV|7W~zPO{sHIprjoyUIoSDZdT#(x;xQ+cW~{#?9Uv3#o>71d7IZQRgV_W4sfKvN$fU zs9xk}4oD~FMBKFSOU6qI=W^9jd)ua5>80d=N~E;5<=JXJ45Gnw%9zl_@AoRE47sa* zW>XPN0NqHqaIUjiyfRE_Kx>NpB%LT`q)E#BQ4Y(Q`)nd;7B61_saG-0zBS z@^052dEJKVDY816Ekk|%jG=9DV4PeF>+Q*Ob_VP0a5wQJzX3H8@PfEerFSx_dG?|4 zc1yCZ>2N|WO1?5WXm2Z)cIBoh`ntR}jO|V9Kxeck6ZIN4lU!&2;B@}zNqkIuW$rxi=%$Yv~8Dy-cVR>ydy`AXO1#sxye zuTpW*3gw*W|F1#Uyajz!^#60UmE#lW_E$q524FS30G>pz|68~Urr?#Z1-jt7==Pt6 z+u?oiCb$N!h8{Q#zJqT6&u}YT2ZOK$PJ*N1F?9Q1z~|v=C_q0P4iBTxzZWK92RsLk zh9lsY==G1mFW|j!3tSA7upS;m$NxLH8}`6y@Kf~r+hIG5!r^c)y8KS~LwErF{k3of zoCZ%Iqr2f_@O}`TUdc_%taWHfpe2F*A_3RR>((8OJtOV8{Mbfy&BojqD~IT~u|_D1 zLacR;U39&le|x18mI_Vpk6lfTpi&JhxoX%fhQ#IccbEw4+0yKE6h}3jnGI|8Y*?bn z4n^~cy^;xAlVm5trY2BsP~s_swR~!H?rLfTl`^TZVXDOn8g4RGX<2g0~VySgD$ zo~?Jgb~S2Uy)ko|8P1ccHt>jj-`pzJHez<#EdP)%cC7Dm%S|`POu$S9tC@+~i%$d? 
diff --git a/src/transports/mlx5-exp/.mlx5-exp.hpp.swp b/src/transports/mlx5-exp/.mlx5-exp.hpp.swp
deleted file mode 100644
index 64c77c8d59fee822a476d37dfa63f8ef615c2d7b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
diff --git a/src/transports/mlx5-exp/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp
--- a/src/transports/mlx5-exp/mlx5-exp.cpp
+++ b/src/transports/mlx5-exp/mlx5-exp.cpp
+static int gds_mlx5_exp_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t &ops, int post_flags)
+{
+        int retcode = 0;
+        size_t n = 0;
+        bool prev_was_fence = false;
+        bool use_inlcpy_for_dword = false;
+
+        if (!peer->has_memops) {
+                gds_err("CUDA MemOps are required\n");
+                return EINVAL;
+        }
+
+        // divert the request to the same engine handling 64bits
+        // to avoid out-of-order execution
+        // caveat: can't use membar if inlcpy is used for 4B writes (to simulate 8B writes)
+        if (peer->has_inlcpy) {
+                if (!peer->has_membar)
+                        use_inlcpy_for_dword = true; // F
+        }
+        if (gds_simulate_write64()) {
+                if (!peer->has_membar) {
+                        gds_warn_once("enabling use_inlcpy_for_dword\n");
+                        use_inlcpy_for_dword = true; // D
+                }
+        }
+
+        for (; op && n < n_ops; op = op->next, ++n) {
+                //int flags = 0;
+                gds_dbg("op[%zu] type:%08x\n", n, op->type);
+                switch(op->type) {
+                case GDS_PEER_OP_FENCE: {
+                        gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags);
+                        uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE));
+                        uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA));
+                        uint32_t fence_mem = (op->wr.fence.fence_flags & 
(GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + + if (fence_op == GDS_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + //retcode = EINVAL; + break; + } + else { + if (!peer->has_membar) { + if (use_inlcpy_for_dword) { + assert(ops.size() > 0); + gds_dbg("patching previous param\n"); + gds_enable_barrier_for_inlcpy(&ops.back()); + } + else { + gds_dbg("recording fence event\n"); + prev_was_fence = true; + } + //retcode = 0; + } + else { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + int flags = 0; + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + flags = GDS_MEMBAR_DEFAULT | GDS_MEMBAR_MLX5; + } + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + flags = GDS_MEMBAR_SYS | GDS_MEMBAR_MLX5; + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + retcode = gds_fill_membar(peer, ops, flags); + } + } + break; + } + case GDS_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + int flags = 0; + gds_dbg("OP_STORE_DWORD dev_ptr=%llx data=%" PRIx32 "\n", dev_ptr, data); + if (use_inlcpy_for_dword) { // F || D + // membar may be out of order WRT inlcpy + if (peer->has_membar) { + gds_err("invalid feature combination, inlcpy + membar\n"); + retcode = EINVAL; + break; + } + // tail flush is set when following fence is met + // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); + } + else { // A || B || C || E + // can't guarantee ordering of write32+inlcpy unless + // a membar is there + // TODO: fix driver when !weak + if (peer->has_inlcpy && !peer->has_membar) { + gds_err("invalid feature combination, inlcpy needs membar\n"); + retcode = EINVAL; + break; + } + if (prev_was_fence) { + gds_dbg("using PRE_BARRIER as fence\n"); + flags |= GDS_WRITE_PRE_BARRIER; + prev_was_fence = false; + } + retcode = gds_fill_poke(peer, ops, dev_ptr, data, flags); + } + break; + } + case GDS_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + int flags = 0; + gds_dbg("OP_STORE_QWORD dev_ptr=%llx data=%" PRIx64 "\n", dev_ptr, data); + // C || D + if (gds_simulate_write64()) { + // simulate 64-bit poke by inline copy + if (!peer->has_membar) { + gds_err("invalid feature combination, inlcpy needs membar\n"); + retcode = EINVAL; + break; + } + + // tail flush is never useful here + //flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + retcode = gds_fill_inlcpy(peer, ops, dev_ptr, &data, sizeof(data), flags); + } + else if (peer->has_write64) { + retcode = gds_fill_poke64(peer, ops, dev_ptr, data, flags); + } + else { + uint32_t datalo = gds_qword_lo(op->wr.qword_va.data); + uint32_t datahi = gds_qword_hi(op->wr.qword_va.data); + + if (prev_was_fence) { + gds_dbg("enabling PRE_BARRIER\n"); + flags |= GDS_WRITE_PRE_BARRIER; + prev_was_fence = false; + } + retcode = gds_fill_poke(peer, ops, dev_ptr, datalo, flags); + + // get rid of the barrier, if there + flags &= ~GDS_WRITE_PRE_BARRIER; + + // advance to next DWORD + dev_ptr += sizeof(uint32_t); + retcode = gds_fill_poke(peer, ops, dev_ptr, datahi, flags); + } + + break; + } + case GDS_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + 
size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + int flags = 0; + gds_dbg("OP_COPY_BLOCK dev_ptr=%llx src=%p len=%zu\n", dev_ptr, src, len); + // catching any other size here + if (!peer->has_inlcpy) { + gds_err("inline copy is not supported\n"); + retcode = EINVAL; + break; + } + // IB Verbs bug + assert(len <= GDS_GPU_MAX_INLINE_SIZE); + //if (desc->need_flush) { + // flags |= GDS_IMMCOPY_POST_TAIL_FLUSH; + //} + retcode = gds_fill_inlcpy(peer, ops, dev_ptr, src, len, flags); + break; + } + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { + int poll_cond; + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + // TODO: properly handle a following fence instead of blidly flushing + int flags = 0; + if (!(post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH)) + flags |= GDS_WAIT_POST_FLUSH_REMOTE; + + gds_dbg("OP_WAIT_DWORD dev_ptr=%llx data=%" PRIx32 " type=%" PRIx32 "\n", dev_ptr, data, (uint32_t)op->type); + + switch(op->type) { + case GDS_PEER_OP_POLL_NOR_DWORD: + poll_cond = GDS_WAIT_COND_NOR; + break; + case GDS_PEER_OP_POLL_GEQ_DWORD: + poll_cond = GDS_WAIT_COND_GEQ; + break; + case GDS_PEER_OP_POLL_AND_DWORD: + poll_cond = GDS_WAIT_COND_AND; + break; + default: + assert(!"cannot happen"); + retcode = EINVAL; + goto out; + } + retcode = gds_fill_poll(peer, ops, dev_ptr, data, poll_cond, flags); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + goto out; + } + } + + assert(n_ops == n); + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +static int gds_mlx5_exp_post_ops_on_cpu(size_t n_ops, struct peer_op_wr *op, int post_flags) +{ + int retcode = 0; + size_t n = 0; + gds_dbg("n_ops=%zu op=%p post_flags=0x%x\n", n_ops, op, post_flags); + for (; op && n < n_ops; op = op->next, ++n) { + //int flags = 0; + gds_dbg("op[%zu]=%p\n", n, op); + //gds_dbg("op[%zu]=%p type:%08x\n", n, op, op->type); + switch(op->type) { + case GDS_PEER_OP_FENCE: { + gds_dbg("FENCE flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + + if (fence_op == GDS_PEER_FENCE_OP_READ) { + gds_warnc(1, "nothing to do for read fences\n"); + //retcode = EINVAL; + break; + } + else { + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from %08x fence, expected FROM_HCA\n", fence_from); + retcode = EINVAL; + break; + } + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + wmb(); + } + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + wmb(); + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + } + break; + } + case GDS_PEER_OP_STORE_DWORD: { + uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); + uint32_t data = op->wr.dword_va.data; + // A || B || C || E + gds_dbg("STORE_DWORD ptr=%p data=%08" PRIx32 "\n", ptr, data); + gds_atomic_set(ptr, data); + break; + } + case GDS_PEER_OP_STORE_QWORD: { + uint64_t *ptr = 
(uint64_t*)((ptrdiff_t)range_from_id(op->wr.qword_va.target_id)->va + op->wr.qword_va.offset); + uint64_t data = op->wr.qword_va.data; + gds_dbg("STORE_QWORD ptr=%p data=%016" PRIx64 "\n", ptr, data); + gds_atomic_set(ptr, data); + break; + } + case GDS_PEER_OP_COPY_BLOCK: { + uint64_t *ptr = (uint64_t*)((ptrdiff_t)range_from_id(op->wr.copy_op.target_id)->va + op->wr.copy_op.offset); + uint64_t *src = (uint64_t*)op->wr.copy_op.src; + size_t n_bytes = op->wr.copy_op.len; + gds_dbg("COPY_BLOCK ptr=%p src=%p len=%zu\n", ptr, src, n_bytes); + gds_bf_copy(ptr, src, n_bytes); + break; + } + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { + int poll_cond; + uint32_t *ptr = (uint32_t*)((ptrdiff_t)range_from_id(op->wr.dword_va.target_id)->va + op->wr.dword_va.offset); + uint32_t value = op->wr.dword_va.data; + bool flush = true; + if (post_flags & GDS_POST_OPS_DISCARD_WAIT_FLUSH) + flush = false; + gds_dbg("WAIT_32 dev_ptr=%p data=%" PRIx32 " type=%" PRIx32 "\n", ptr, value, (uint32_t)op->type); + bool done = false; + do { + uint32_t data = gds_atomic_get(ptr); + switch(op->type) { + case GDS_PEER_OP_POLL_NOR_DWORD: + done = (0 != ~(data | value)); + break; + case GDS_PEER_OP_POLL_GEQ_DWORD: + done = ((int32_t)data - (int32_t)value >= 0); + break; + case GDS_PEER_OP_POLL_AND_DWORD: + done = (0 != (data & value)); + break; + default: + gds_err("invalid op type %02x\n", op->type); + retcode = EINVAL; + goto out; + } + if (done) + break; + // TODO: more aggressive CPU relaxing needed here to avoid starving I/O fabric + arch_cpu_relax(); + } while(true); + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + if (retcode) { + gds_err("error %d at entry n=%zu\n", retcode, n); + goto out; + } + } + +out: + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_get_send_descs(gds_mlx5_send_info_t *mlx5_i, const gds_send_request_t *_request) +{ + int retcode = 0; + const gds_mlx5_exp_send_request_t *request = to_gds_mexp_send_request(_request); + size_t n_ops = request->commit.entries; + peer_op_wr *op = request->commit.storage; + size_t n = 0; + + memset(mlx5_i, 0, sizeof(*mlx5_i)); + + for (; op && n < n_ops; op = op->next, ++n) { + switch(op->type) { + case GDS_PEER_OP_FENCE: { + gds_dbg("OP_FENCE: fence_flags=%" PRIu64 "\n", op->wr.fence.fence_flags); + uint32_t fence_op = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_OP_READ|GDS_PEER_FENCE_OP_WRITE)); + uint32_t fence_from = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_FROM_CPU|GDS_PEER_FENCE_FROM_HCA)); + uint32_t fence_mem = (op->wr.fence.fence_flags & (GDS_PEER_FENCE_MEM_SYS|GDS_PEER_FENCE_MEM_PEER)); + if (fence_op == GDS_PEER_FENCE_OP_READ) { + gds_dbg("nothing to do for read fences\n"); + break; + } + if (fence_from != GDS_PEER_FENCE_FROM_HCA) { + gds_err("unexpected from fence\n"); + retcode = EINVAL; + break; + } + if (fence_mem == GDS_PEER_FENCE_MEM_PEER) { + gds_dbg("using light membar\n"); + mlx5_i->membar = 1; + } + else if (fence_mem == GDS_PEER_FENCE_MEM_SYS) { + gds_dbg("using heavy membar\n"); + mlx5_i->membar_full = 1; + } + else { + gds_err("unsupported fence combination\n"); + retcode = EINVAL; + break; + } + break; + } + case GDS_PEER_OP_STORE_DWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.dword_va.target_id)->dptr + + op->wr.dword_va.offset; + uint32_t data = op->wr.dword_va.data; + gds_dbg("OP_STORE_DWORD dev_ptr=%" PRIx64 " data=%08x\n", 
(uint64_t)dev_ptr, data); + if (n != 0) { + gds_err("store DWORD is not 1st op\n"); + retcode = EINVAL; + break; + } + mlx5_i->dbrec_ptr = (uint32_t*)dev_ptr; + mlx5_i->dbrec_value = data; + break; + } + case GDS_PEER_OP_STORE_QWORD: { + CUdeviceptr dev_ptr = range_from_id(op->wr.qword_va.target_id)->dptr + + op->wr.qword_va.offset; + uint64_t data = op->wr.qword_va.data; + gds_dbg("OP_STORE_QWORD dev_ptr=%" PRIx64 " data=%" PRIx64 "\n", (uint64_t)dev_ptr, (uint64_t)data); + if (n != 2) { + gds_err("store QWORD is not 3rd op\n"); + retcode = EINVAL; + break; + } + mlx5_i->db_ptr = (uint64_t*)dev_ptr; + mlx5_i->db_value = data; + break; + } + case GDS_PEER_OP_COPY_BLOCK: { + CUdeviceptr dev_ptr = range_from_id(op->wr.copy_op.target_id)->dptr + + op->wr.copy_op.offset; + size_t len = op->wr.copy_op.len; + void *src = op->wr.copy_op.src; + gds_dbg("send inline detected\n"); + if (len < 8 || len > 64) { + gds_err("unexpected len %zu\n", len); + retcode = EINVAL; + break; + } + mlx5_i->db_ptr = (uint64_t*)dev_ptr; + mlx5_i->db_value = *(uint64_t*)src; + break; + } + case GDS_PEER_OP_POLL_AND_DWORD: + case GDS_PEER_OP_POLL_GEQ_DWORD: + case GDS_PEER_OP_POLL_NOR_DWORD: { + gds_err("unexpected polling op in send request\n"); + retcode = EINVAL; + break; + } + default: + gds_err("undefined peer op type %d\n", op->type); + retcode = EINVAL; + break; + } + + if (retcode) { + gds_err("error in fill func at entry n=%zu\n", n); + break; + } + } + return retcode; +} + +//----------------------------------------------------------------------------- + static ibv_exp_res_domain *gds_mlx5_exp_create_res_domain(struct ibv_context *context) { if (!context) { @@ -77,9 +580,89 @@ gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( //----------------------------------------------------------------------------- -gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( +int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq) +{ + int retcode = 0; + int ret; + + if (!gmexpcq) + return retcode; + + assert(gmexpcq->gcq.dtype == GDS_DRIVER_TYPE_MLX5_EXP); + + if (gmexpcq->gcq.cq) { + ret = ibv_destroy_cq(gmexpcq->gcq.cq); + if (ret) { + gds_err("error %d in destroy_cq\n", ret); + retcode = ret; + } + } + + // res_domain will be destroyed in gds_mlx5_exp_destroy_qp. 
+ + free(gmexpcq); + + return retcode; +} + + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_destroy_qp(gds_qp_t *gqp) +{ + int retcode = 0; + int ret; + + if (!gqp) + return retcode; + + gds_mlx5_exp_qp_t *gmexpqp = to_gds_mexp_qp(gqp); + + assert(gmexpqp->gqp.dtype == GDS_DRIVER_TYPE_MLX5_EXP); + + if (gmexpqp->gqp.qp) { + ret = ibv_destroy_qp(gmexpqp->gqp.qp); + if (ret) { + gds_err("error %d in destroy_qp\n", ret); + retcode = ret; + } + } + + if (gmexpqp->gqp.send_cq) { + ret = gds_mlx5_exp_destroy_cq(to_gds_mexp_cq(gmexpqp->gqp.send_cq)); + if (ret) { + gds_err("error %d in destroy_cq send_cq\n", ret); + retcode = ret; + } + } + + if (gmexpqp->gqp.recv_cq) { + ret = gds_mlx5_exp_destroy_cq(to_gds_mexp_cq(gmexpqp->gqp.recv_cq)); + if (ret) { + gds_err("error %d in destroy_cq recv_cq\n", ret); + retcode = ret; + } + } + + if (gmexpqp->res_domain) { + struct ibv_exp_destroy_res_domain_attr attr = {0,}; //IBV_EXP_DESTROY_RES_DOMAIN_RESERVED + ret = ibv_exp_destroy_res_domain(gmexpqp->gqp.dev_context, gmexpqp->res_domain, &attr); + if (ret) { + gds_err("ibv_exp_destroy_res_domain error %d: %s\n", ret, strerror(ret)); + retcode = ret; + } + } + + free(gmexpqp); + + return retcode; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_create_qp( struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, - gds_peer *peer, gds_peer_attr *peer_attr, int flags) + gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp) { int ret = 0; gds_mlx5_exp_qp_t *gmexpqp = NULL; @@ -96,8 +679,9 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( gmexpqp = (gds_mlx5_exp_qp_t *)calloc(1, sizeof(gds_mlx5_exp_qp_t)); if (!gmexpqp) { - gds_err("cannot allocate memory\n"); - return NULL; + ret = ENOMEM; + gds_err("cannot allocate memory\n"); + goto err; } gmexpqp->gqp.dtype = GDS_DRIVER_TYPE_MLX5_EXP; @@ -171,99 +755,37 @@ gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( gds_dbg("created gds_mlx5_exp_qp=%p\n", gmexpqp); - return gmexpqp; - -err: - gds_dbg("destroying QP\n"); - gds_mlx5_exp_destroy_qp(gmexpqp); - - return NULL; -} - -//----------------------------------------------------------------------------- - -int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp) -{ - int retcode = 0; - int ret; - - if (!gmexpqp) - return retcode; - - assert(gmexpqp->gqp.dtype == GDS_DRIVER_TYPE_MLX5_EXP); - - if (gmexpqp->gqp.qp) { - ret = ibv_destroy_qp(gmexpqp->gqp.qp); - if (ret) { - gds_err("error %d in destroy_qp\n", ret); - retcode = ret; - } - } - - if (gmexpqp->gqp.send_cq) { - ret = gds_destroy_cq(gmexpqp->gqp.send_cq); - if (ret) { - gds_err("error %d in destroy_cq send_cq\n", ret); - retcode = ret; - } - } + *gqp = &gmexpqp->gqp; - if (gmexpqp->gqp.recv_cq) { - ret = gds_destroy_cq(gmexpqp->gqp.recv_cq); - if (ret) { - gds_err("error %d in destroy_cq recv_cq\n", ret); - retcode = ret; - } - } + return 0; - if (gmexpqp->res_domain) { - struct ibv_exp_destroy_res_domain_attr attr = {0,}; //IBV_EXP_DESTROY_RES_DOMAIN_RESERVED - ret = ibv_exp_destroy_res_domain(gmexpqp->gqp.dev_context, gmexpqp->res_domain, &attr); - if (ret) { - gds_err("ibv_exp_destroy_res_domain error %d: %s\n", ret, strerror(ret)); - retcode = ret; - } +err: + if (gmexpqp) { + gds_dbg("destroying QP\n"); + gds_mlx5_exp_destroy_qp(&gmexpqp->gqp); } - free(gmexpqp); - - return retcode; + return ret; } + //----------------------------------------------------------------------------- -int 
gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq) +int gds_mlx5_exp_prepare_send(gds_qp_t *gqp, gds_send_wr *p_ewr, + gds_send_wr **bad_ewr, + gds_send_request_t *_request) { - int retcode = 0; - int ret; - - if (!gmexpcq) - return retcode; - - assert(gmexpcq->gcq.dtype == GDS_DRIVER_TYPE_MLX5_EXP); - - if (gmexpcq->gcq.cq) { - ret = ibv_destroy_cq(gmexpcq->gcq.cq); - if (ret) { - gds_err("error %d in destroy_cq\n", ret); - retcode = ret; - } - } - - // res_domain will be destroyed in gds_mlx5_exp_destroy_qp. + int ret = 0; - free(gmexpcq); + gds_mlx5_exp_qp_t *gmexpqp; + gds_mlx5_exp_send_request_t *request; - return retcode; -} + assert(gqp); + assert(_request); -//----------------------------------------------------------------------------- + gmexpqp = to_gds_mexp_qp(gqp); + request = to_gds_mexp_send_request(_request); -int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, - gds_send_wr **bad_ewr, - gds_mlx5_exp_send_request_t *request) -{ - int ret = 0; ret = ibv_post_send(gmexpqp->gqp.qp, p_ewr, bad_ewr); if (ret) { @@ -287,39 +809,60 @@ int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr, //----------------------------------------------------------------------------- -void gds_mlx5_exp_init_send_info(gds_mlx5_exp_send_request_t *info) +void gds_mlx5_exp_init_send_info(gds_send_request_t *_info) { + gds_mlx5_exp_send_request_t *info; + + assert(_info); + info = to_gds_mexp_send_request(_info); + gds_dbg("send_request=%p\n", info); info->commit.storage = info->wr; info->commit.entries = sizeof(info->wr)/sizeof(info->wr[0]); - gds_init_ops(info->commit.storage, info->commit.entries); + gds_mlx5_exp_init_ops(info->commit.storage, info->commit.entries); } //----------------------------------------------------------------------------- -int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_mlx5_exp_send_request_t *info, gds_op_list_t &ops) +int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_send_request_t *_info, gds_op_list_t &ops) { - return gds_post_ops(peer, info->commit.entries, info->commit.storage, ops, 0); + gds_mlx5_exp_send_request_t *info; + + assert(peer); + assert(_info); + + info = to_gds_mexp_send_request(_info); + return gds_mlx5_exp_post_ops(peer, info->commit.entries, info->commit.storage, ops, 0); } //----------------------------------------------------------------------------- -int gds_mlx5_exp_post_send_ops_on_cpu(gds_mlx5_exp_send_request_t *info, int flags) +int gds_mlx5_exp_post_send_ops_on_cpu(gds_send_request_t *_info, int flags) { - return gds_post_ops_on_cpu(info->commit.entries, info->commit.storage, flags); + gds_mlx5_exp_send_request_t *info; + + assert(_info); + + info = to_gds_mexp_send_request(_info); + return gds_mlx5_exp_post_ops_on_cpu(info->commit.entries, info->commit.storage, flags); } //----------------------------------------------------------------------------- -void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset) +void gds_mlx5_exp_init_wait_request(gds_wait_request_t *_request, uint32_t offset) { + gds_mlx5_exp_wait_request_t *request; + + assert(_request); + request = to_gds_mexp_wait_request(_request); + gds_dbg("wait_request=%p offset=%08x\n", request, offset); request->peek.storage = request->wr; request->peek.entries = sizeof(request->wr)/sizeof(request->wr[0]); request->peek.whence = IBV_EXP_PEER_PEEK_ABSOLUTE; request->peek.offset = offset; - gds_init_ops(request->peek.storage, request->peek.entries); + gds_mlx5_exp_init_ops(request->peek.storage, 
request->peek.entries); } //----------------------------------------------------------------------------- @@ -382,9 +925,14 @@ static void gds_mlx5_exp_dump_ops(struct peer_op_wr *op, size_t count) //----------------------------------------------------------------------------- -void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t idx) +void gds_mlx5_exp_dump_wait_request(gds_wait_request_t *_request, size_t idx) { - struct ibv_exp_peer_peek *peek = &request->peek; + gds_mlx5_exp_wait_request_t *request; + struct ibv_exp_peer_peek *peek; + + assert(_request); + request = to_gds_mexp_wait_request(_request); + peek = &request->peek; gds_dbg("req[%zu] entries:%u whence:%u offset:%u peek_id:%" PRIx64 " comp_mask:%08x\n", idx, peek->entries, peek->whence, peek->offset, peek->peek_id, peek->comp_mask); @@ -393,9 +941,17 @@ void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t //----------------------------------------------------------------------------- -int gds_mlx5_exp_prepare_wait_cq(gds_mlx5_exp_cq_t *mexpcq, gds_mlx5_exp_wait_request_t *request, int flags) +int gds_mlx5_exp_prepare_wait_cq(gds_cq_t *gcq, gds_wait_request_t *_request, int flags) { int retcode = 0; + gds_mlx5_exp_cq_t *mexpcq; + gds_mlx5_exp_wait_request_t *request; + + assert(gcq); + assert(_request); + + mexpcq = to_gds_mexp_cq(gcq); + request = to_gds_mexp_wait_request(_request); retcode = ibv_exp_peer_peek_cq(mexpcq->gcq.cq, &request->peek); if (retcode == ENOSPC) { @@ -413,12 +969,20 @@ int gds_mlx5_exp_prepare_wait_cq(gds_mlx5_exp_cq_t *mexpcq, gds_mlx5_exp_wait_re //----------------------------------------------------------------------------- -int gds_mlx5_exp_append_wait_cq(gds_mlx5_exp_wait_request_t *request, uint32_t *dw, uint32_t val) +int gds_mlx5_exp_append_wait_cq(gds_wait_request_t *_request, uint32_t *dw, uint32_t val) { int ret = 0; - unsigned MAX_NUM_ENTRIES = sizeof(request->wr) / sizeof(request->wr[0]); - unsigned n = request->peek.entries; - struct peer_op_wr *wr = request->peek.storage; + unsigned MAX_NUM_ENTRIES; + unsigned n; + struct peer_op_wr *wr; + gds_mlx5_exp_wait_request_t *request; + + assert(_request); + + request = to_gds_mexp_wait_request(_request); + MAX_NUM_ENTRIES = sizeof(request->wr) / sizeof(request->wr[0]); + n = request->peek.entries; + wr = request->peek.storage; if (n + 1 > MAX_NUM_ENTRIES) { gds_err("no space left to stuff a poke\n"); @@ -448,9 +1012,18 @@ int gds_mlx5_exp_append_wait_cq(gds_mlx5_exp_wait_request_t *request, uint32_t * //----------------------------------------------------------------------------- -int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_request_t *request) +int gds_mlx5_exp_abort_wait_cq(gds_cq_t *gcq, gds_wait_request_t *_request) { struct ibv_exp_peer_abort_peek abort_ctx; + gds_mlx5_exp_cq_t *gmexpcq; + gds_mlx5_exp_wait_request_t *request; + + assert(gcq); + assert(_request); + + gmexpcq = to_gds_mexp_cq(gcq); + request = to_gds_mexp_wait_request(_request); + abort_ctx.peek_id = request->peek.peek_id; abort_ctx.comp_mask = 0; return ibv_exp_peer_abort_peek_cq(gmexpcq->gcq.cq, &abort_ctx); @@ -458,35 +1031,46 @@ int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_req //----------------------------------------------------------------------------- -int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, gds_mlx5_exp_wait_request_t *request, gds_op_list_t ¶ms, int flags) +int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, 
gds_wait_request_t *_request, gds_op_list_t ¶ms, int flags) { int ret = 0; + gds_mlx5_exp_wait_request_t *request; - ret = gds_post_ops(peer, request->peek.entries, request->peek.storage, params, flags); + assert(peer); + assert(_request); + + request = to_gds_mexp_wait_request(_request); + + ret = gds_mlx5_exp_post_ops(peer, request->peek.entries, request->peek.storage, params, flags); if (ret) - gds_err("error %d in gds_post_ops\n", ret); + gds_err("error %d in gds_mlx5_exp_post_ops\n", ret); return ret; } //----------------------------------------------------------------------------- -int gds_mlx5_exp_post_wait_descriptor(gds_mlx5_exp_wait_request_t *request, int flags) +int gds_mlx5_exp_post_wait_descriptor(gds_wait_request_t *_request, int flags) { int ret = 0; + gds_mlx5_exp_wait_request_t *request; - ret = gds_post_ops_on_cpu(request->peek.entries, request->peek.storage, flags); + assert(_request); + request = to_gds_mexp_wait_request(_request); + + ret = gds_mlx5_exp_post_ops_on_cpu(request->peek.entries, request->peek.storage, flags); if (ret) - gds_err("error %d in gds_post_ops_on_cpu\n", ret); + gds_err("error %d in gds_mlx5_exp_post_ops_on_cpu\n", ret); return ret; } //----------------------------------------------------------------------------- -int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp_wait_request_t *request) +int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_wait_request_t *_request) { int retcode = 0; + const gds_mlx5_exp_wait_request_t *request = to_gds_mexp_wait_request(_request); size_t n_ops = request->peek.entries; peer_op_wr *op = request->peek.storage; size_t n = 0; @@ -591,15 +1175,21 @@ int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp //----------------------------------------------------------------------------- -int gds_mlx5_exp_rollback_qp(gds_mlx5_exp_qp_t *gmexpqp, gds_mlx5_exp_send_request_t *send_info) +int gds_mlx5_exp_rollback_qp(gds_qp_t *gqp, gds_send_request_t *request) { struct ibv_exp_rollback_ctx rollback; int ret = 0; enum ibv_exp_rollback_flags flag = IBV_EXP_ROLLBACK_ABORT_LATE; + gds_mlx5_exp_send_request_t *send_info; + + gds_mlx5_exp_qp_t *gmexpqp; + + assert(gqp); + assert(gqp->qp); + assert(request); - assert(gmexpqp); - assert(gmexpqp->gqp.qp); - assert(send_info); + gmexpqp = to_gds_mexp_qp(gqp); + send_info = to_gds_mexp_send_request(request); /* from ibv_exp_peer_commit call */ rollback.rollback_id = send_info->commit.rollback_id; @@ -618,29 +1208,59 @@ int gds_mlx5_exp_rollback_qp(gds_mlx5_exp_qp_t *gmexpqp, gds_mlx5_exp_send_reque //----------------------------------------------------------------------------- -int gds_transport_mlx5_exp_init(gds_transport_t *transport) +uint32_t gds_mlx5_exp_get_num_wait_request_entries(gds_wait_request_t *request) { + gds_mlx5_exp_wait_request_t *gmexp_request; + assert(request); + gmexp_request = to_gds_mexp_wait_request(request); + return gmexp_request->peek.entries; +} + +//----------------------------------------------------------------------------- + +uint32_t gds_mlx5_exp_get_num_send_request_entries(gds_send_request_t *request) { + gds_mlx5_exp_send_request_t *gmexp_request; + assert(request); + gmexp_request = to_gds_mexp_send_request(request); + return gmexp_request->commit.entries; +} + +//----------------------------------------------------------------------------- + +int gds_transport_mlx5_exp_init(gds_transport_t **transport) { - transport->create_qp = gds_mlx5_exp_create_qp; - 
transport->destroy_qp = gds_mlx5_exp_destroy_qp; - transport->rollback_qp = gds_mlx5_exp_rollback_qp; - - transport->init_send_info = gds_mlx5_exp_init_send_info; - transport->post_send_ops = gds_mlx5_exp_post_send_ops; - transport->post_send_ops_on_cpu = gds_mlx5_exp_post_send_ops_on_cpu; - transport->prepare_send = gds_mlx5_exp_prepare_send; - transport->get_num_send_request_entries = gds_mlx5_exp_get_num_send_request_entries; - - transport->init_wait_request = gds_mlx5_exp_init_wait_request; - transport->dump_wait_request = gds_mlx5_exp_dump_wait_request; - transport->stream_post_wait_descriptor = gds_mlx5_exp_stream_post_wait_descriptor; - transport->post_wait_descriptor = gds_mlx5_exp_post_wait_descriptor; - transport->get_wait_descs = gds_mlx5_exp_get_wait_descs; - transport->get_num_wait_request_entries = gds_mlx5_exp_get_num_send_request_entries; - - transport->prepare_wait_cq = gds_mlx5_exp_prepare_wait_cq; - transport->append_wait_cq = gds_mlx5_exp_append_wait_cq; - transport->abort_wait_cq = gds_mlx5_exp_abort_wait_cq; + int status = 0; - return 0; + gds_transport_t *t = (gds_transport_t *)calloc(1, sizeof(gds_transport_t)); + if (!t) { + status = ENOMEM; + goto out; + } + + t->create_qp = gds_mlx5_exp_create_qp; + t->destroy_qp = gds_mlx5_exp_destroy_qp; + t->rollback_qp = gds_mlx5_exp_rollback_qp; + + t->init_send_info = gds_mlx5_exp_init_send_info; + t->post_send_ops = gds_mlx5_exp_post_send_ops; + t->post_send_ops_on_cpu = gds_mlx5_exp_post_send_ops_on_cpu; + t->prepare_send = gds_mlx5_exp_prepare_send; + t->get_send_descs = gds_mlx5_exp_get_send_descs; + t->get_num_send_request_entries = gds_mlx5_exp_get_num_send_request_entries; + + t->init_wait_request = gds_mlx5_exp_init_wait_request; + t->dump_wait_request = gds_mlx5_exp_dump_wait_request; + t->stream_post_wait_descriptor = gds_mlx5_exp_stream_post_wait_descriptor; + t->post_wait_descriptor = gds_mlx5_exp_post_wait_descriptor; + t->get_wait_descs = gds_mlx5_exp_get_wait_descs; + t->get_num_wait_request_entries = gds_mlx5_exp_get_num_wait_request_entries; + + t->prepare_wait_cq = gds_mlx5_exp_prepare_wait_cq; + t->append_wait_cq = gds_mlx5_exp_append_wait_cq; + t->abort_wait_cq = gds_mlx5_exp_abort_wait_cq; + + *transport = t; + +out: + return status; } diff --git a/src/transports/mlx5-exp/mlx5-exp.hpp b/src/transports/mlx5-exp/mlx5-exp.hpp index 1d67137..e4f792d 100644 --- a/src/transports/mlx5-exp/mlx5-exp.hpp +++ b/src/transports/mlx5-exp/mlx5-exp.hpp @@ -72,43 +72,3 @@ static inline const gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(const return (const gds_mlx5_exp_wait_request_t *)to_gds_mexp_wait_request((const gds_wait_request_t *)gwreq); } -static inline uint32_t gds_mlx5_exp_get_num_wait_request_entries(gds_mlx5_exp_wait_request_t *gmexp_request) { - return gmexp_request->peek.entries; -} - -static inline uint32_t gds_mlx5_exp_get_num_send_request_entries(gds_mlx5_exp_send_request_t *gmexp_request) { - return gmexp_request->commit.entries; -} - -gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( - struct ibv_context *context, int cqe, - void *cq_context, struct ibv_comp_channel *channel, - int comp_vector, gds_peer *peer, gds_peer_attr *peer_attr, gds_alloc_cq_flags_t flags, - struct ibv_exp_res_domain *res_domain); - -gds_mlx5_exp_qp_t *gds_mlx5_exp_create_qp( - struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, - gds_peer *peer, gds_peer_attr *peer_attr, int flags); - -int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq); -int gds_mlx5_exp_destroy_qp(gds_mlx5_exp_qp_t *gmexpqp); - 
-int gds_mlx5_exp_prepare_send(gds_mlx5_exp_qp_t *gmexpqp, gds_send_wr *p_ewr,
-                gds_send_wr **bad_ewr,
-                gds_mlx5_exp_send_request_t *request);
-
-
-void gds_mlx5_exp_init_send_info(gds_mlx5_exp_send_request_t *info);
-int gds_mlx5_exp_post_send_ops(gds_peer *peer, gds_mlx5_exp_send_request_t *info, gds_op_list_t &ops);
-int gds_mlx5_exp_post_send_ops_on_cpu(gds_mlx5_exp_send_request_t *info, int flags = 0);
-
-void gds_mlx5_exp_init_wait_request(gds_mlx5_exp_wait_request_t *request, uint32_t offset);
-void gds_mlx5_exp_dump_wait_request(gds_mlx5_exp_wait_request_t *request, size_t idx);
-int gds_mlx5_exp_prepare_wait_cq(gds_mlx5_exp_cq_t *mexpcq, gds_mlx5_exp_wait_request_t *request, int flags);
-int gds_mlx5_exp_append_wait_cq(gds_mlx5_exp_wait_request_t *request, uint32_t *dw, uint32_t val);
-int gds_mlx5_exp_abort_wait_cq(gds_mlx5_exp_cq_t *gmexpcq, gds_mlx5_exp_wait_request_t *request);
-int gds_mlx5_exp_stream_post_wait_descriptor(gds_peer *peer, gds_mlx5_exp_wait_request_t *request, gds_op_list_t &params, int flags);
-int gds_mlx5_exp_post_wait_descriptor(gds_mlx5_exp_wait_request_t *request, int flags);
-int gds_mlx5_exp_get_wait_descs(gds_mlx5_wait_info_t *mlx5_i, const gds_mlx5_exp_wait_request_t *request);
-
-int gds_mlx5_exp_rollback_qp(gds_mlx5_exp_qp_t *gmexpqp, gds_mlx5_exp_send_request_t *send_info);
diff --git a/src/utils.hpp b/src/utils.hpp
index c5d0774..a146f60 100644
--- a/src/utils.hpp
+++ b/src/utils.hpp
@@ -191,6 +191,10 @@ typedef enum gds_alloc_qp_flags {
 
 #include <vector>
 
+// TODO: use correct value
+// TODO: make it dependent upon the particular GPU
+const size_t GDS_GPU_MAX_INLINE_SIZE = 256;
+
 typedef std::vector<CUstreamBatchMemOpParams> gds_op_list_t;
 
 struct gds_cq *gds_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector, int gpu_id, gds_alloc_cq_flags_t flags);
@@ -205,9 +209,13 @@ struct gds_peer;
 
 int gds_fill_membar(gds_peer *peer, gds_op_list_t &param, int flags);
 int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &param, void *ptr, const void *data, size_t n_bytes, int flags);
+int gds_fill_inlcpy(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, const void *data, size_t n_bytes, int flags);
 int gds_fill_poke(gds_peer *peer, gds_op_list_t &param, uint32_t *ptr, uint32_t value, int flags);
+int gds_fill_poke(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint32_t value, int flags);
 int gds_fill_poke64(gds_peer *peer, gds_op_list_t &param, uint64_t *ptr, uint64_t value, int flags);
+int gds_fill_poke64(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags);
 int gds_fill_poll(gds_peer *peer, gds_op_list_t &param, uint32_t *ptr, uint32_t magic, int cond_flag, int flags);
+int gds_fill_poll(gds_peer *peer, gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags);
 
 int gds_stream_batch_ops(gds_peer *peer, CUstream stream, gds_op_list_t &params, int flags);
 
@@ -216,10 +224,11 @@ enum gds_post_ops_flags {
 };
 
 struct gds_peer;
-int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_list_t &params, int post_flags = 0);
-int gds_post_ops_on_cpu(size_t n_descs, struct peer_op_wr *op, int post_flags = 0);
 gds_peer *peer_from_stream(CUstream stream);
 
+bool gds_simulate_write64();
+void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param);
+
 //-----------------------------------------------------------------------------
 
 /* \brief: Get the underlying driver associated with the ibdev. 
@@ -238,12 +247,6 @@ static inline gds_driver_type gds_get_driver_type(struct ibv_device *ibdev) //----------------------------------------------------------------------------- -int gds_destroy_cq(struct gds_cq *gcq); - -void gds_init_ops(struct peer_op_wr *op, int count); - -//----------------------------------------------------------------------------- - /* * Local variables: * c-indent-level: 8 From c914e89f99543c87f2ebd4d50400ed1f2738ee48 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 15 Oct 2021 04:37:53 -0400 Subject: [PATCH 36/50] Removed driver_type and fixed bugs --- include/gdsync/core.h | 17 ++--------------- src/apis.cpp | 8 ++++---- src/gdsync.cpp | 1 - src/transports/mlx5-exp/mlx5-exp.cpp | 7 ------- src/transports/mlx5-exp/mlx5-exp.hpp | 14 ++------------ src/utils.hpp | 16 ---------------- 6 files changed, 8 insertions(+), 55 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index fa213d3..e500c93 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -57,17 +57,9 @@ enum gds_create_qp_flags { typedef struct ibv_qp_init_attr gds_qp_init_attr_t; typedef struct ibv_send_wr gds_send_wr; -typedef enum gds_driver_type { - GDS_DRIVER_TYPE_UNSUPPORTED = 0, - GDS_DRIVER_TYPE_MLX5_EXP, - GDS_DRIVER_TYPE_MLX5_DV, - GDS_DRIVER_TYPE_MLX5_DEVX -} gds_driver_type_t; - typedef struct gds_cq { struct ibv_cq *cq; uint32_t curr_offset; - gds_driver_type_t dtype; } gds_cq_t; typedef struct gds_qp { @@ -75,7 +67,6 @@ typedef struct gds_qp { struct gds_cq *send_cq; struct gds_cq *recv_cq; struct ibv_context *dev_context; - gds_driver_type_t dtype; } gds_qp_t; /* \brief: Create a peer-enabled QP attached to the specified GPU id. @@ -161,11 +152,9 @@ enum { */ typedef struct gds_send_request { - gds_driver_type_t dtype; - uint8_t pad0[4]; uint8_t reserved0[32]; uint8_t reserved1[56 * GDS_SEND_INFO_MAX_OPS]; - uint8_t pad1[24]; + uint8_t pad0[32]; } gds_send_request_t; static_assert(sizeof(gds_send_request_t) % 64 == 0, "gds_send_request_t must be 64-byte aligned."); @@ -179,11 +168,9 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req */ typedef struct gds_wait_request { - gds_driver_type_t dtype; - uint8_t pad0[4]; uint8_t reserved0[40]; uint8_t reserved1[56 * GDS_WAIT_INFO_MAX_OPS]; - uint8_t pad1[16]; + uint8_t pad0[24]; } gds_wait_request_t; static_assert(sizeof(gds_wait_request_t) % 64 == 0, "gds_wait_request_t must be 64-byte aligned."); diff --git a/src/apis.cpp b/src/apis.cpp index ed0a1d5..cf9008c 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -242,7 +242,7 @@ int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flag int gds_append_wait_cq(gds_wait_request_t *request, uint32_t *dw, uint32_t val) { int ret = gds_transport_init(); - if (!ret) { + if (ret) { gds_err("error in gds_transport_init\n"); goto out; } @@ -467,7 +467,7 @@ static int calc_n_mem_ops(size_t n_descs, gds_descriptor_t *descs, size_t &n_mem size_t i; ret = gds_transport_init(); - if (!ret) { + if (ret) { gds_err("error in gds_transport_init\n"); goto out; } @@ -510,7 +510,7 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ gds_op_list_t params; ret = gds_transport_init(); - if (!ret) { + if (ret) { gds_err("error in gds_transport_init\n"); goto out; } @@ -618,7 +618,7 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) int retcode = 0; ret = gds_transport_init(); - if (!ret) { + if (ret) { gds_err("error in gds_transport_init\n"); goto out; } diff --git 
a/src/gdsync.cpp b/src/gdsync.cpp index e9fafc8..84a9b05 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1245,7 +1245,6 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_t *gqp = NULL; gds_peer *peer = NULL; gds_peer_attr *peer_attr = NULL; - gds_driver_type dtype; int old_errno = errno; gds_dbg("pd=%p context=%p gpu_id=%d flags=%08x current errno=%d\n", pd, context, gpu_id, flags, errno); diff --git a/src/transports/mlx5-exp/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp index 23abb7a..85f97e2 100644 --- a/src/transports/mlx5-exp/mlx5-exp.cpp +++ b/src/transports/mlx5-exp/mlx5-exp.cpp @@ -573,8 +573,6 @@ gds_mlx5_exp_cq_t *gds_mlx5_exp_create_cq( return NULL; } - gmexpcq->gcq.dtype = GDS_DRIVER_TYPE_MLX5_EXP; - return gmexpcq; } @@ -588,8 +586,6 @@ int gds_mlx5_exp_destroy_cq(gds_mlx5_exp_cq_t *gmexpcq) if (!gmexpcq) return retcode; - assert(gmexpcq->gcq.dtype == GDS_DRIVER_TYPE_MLX5_EXP); - if (gmexpcq->gcq.cq) { ret = ibv_destroy_cq(gmexpcq->gcq.cq); if (ret) { @@ -618,8 +614,6 @@ int gds_mlx5_exp_destroy_qp(gds_qp_t *gqp) gds_mlx5_exp_qp_t *gmexpqp = to_gds_mexp_qp(gqp); - assert(gmexpqp->gqp.dtype == GDS_DRIVER_TYPE_MLX5_EXP); - if (gmexpqp->gqp.qp) { ret = ibv_destroy_qp(gmexpqp->gqp.qp); if (ret) { @@ -683,7 +677,6 @@ int gds_mlx5_exp_create_qp( gds_err("cannot allocate memory\n"); goto err; } - gmexpqp->gqp.dtype = GDS_DRIVER_TYPE_MLX5_EXP; gmexpqp->gqp.dev_context = context; diff --git a/src/transports/mlx5-exp/mlx5-exp.hpp b/src/transports/mlx5-exp/mlx5-exp.hpp index e4f792d..861c12e 100644 --- a/src/transports/mlx5-exp/mlx5-exp.hpp +++ b/src/transports/mlx5-exp/mlx5-exp.hpp @@ -23,39 +23,30 @@ typedef struct gds_mlx5_exp_qp { } gds_mlx5_exp_qp_t; typedef struct gds_mlx5_exp_send_request { - gds_driver_type_t dtype; - uint8_t pad0[4]; struct ibv_exp_peer_commit commit; struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS]; - uint8_t pad1[24]; + uint8_t pad1[32]; } gds_mlx5_exp_send_request_t; static_assert(sizeof(gds_mlx5_exp_send_request_t) % 64 == 0, "gds_mlx5_exp_send_request_t must be 64-byte aligned."); static_assert(sizeof(gds_mlx5_exp_send_request_t) <= sizeof(gds_send_request_t), "The size of gds_mlx5_exp_send_request_t must be less than or equal to that of gds_send_request_t."); -static_assert(offsetof(gds_mlx5_exp_send_request_t, dtype) == offsetof(gds_send_request_t, dtype), "dtype of gds_mlx5_exp_send_request_t and gds_send_request_t must be at the same offset."); typedef struct gds_mlx5_exp_wait_request { - gds_driver_type_t dtype; - uint8_t pad0[4]; struct ibv_exp_peer_peek peek; struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS]; - uint8_t pad1[16]; + uint8_t pad1[24]; } gds_mlx5_exp_wait_request_t; static_assert(sizeof(gds_mlx5_exp_wait_request_t) % 64 == 0, "gds_mlx5_exp_wait_request_t must be 64-byte aligned."); static_assert(sizeof(gds_mlx5_exp_wait_request_t) <= sizeof(gds_wait_request_t), "The size of gds_mlx5_exp_wait_request_t must be less than or equal to that of gds_wait_request_t."); -static_assert(offsetof(gds_mlx5_exp_wait_request_t, dtype) == offsetof(gds_wait_request_t, dtype), "dtype of gds_mlx5_exp_wait_request_t and gds_wait_request_t must be at the same offset."); static inline gds_mlx5_exp_cq_t *to_gds_mexp_cq(gds_cq_t *gcq) { - assert(gcq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); return container_of(gcq, gds_mlx5_exp_cq_t, gcq); } static inline gds_mlx5_exp_qp_t *to_gds_mexp_qp(gds_qp_t *gqp) { - assert(gqp->dtype == GDS_DRIVER_TYPE_MLX5_EXP); return container_of(gqp, gds_mlx5_exp_qp_t, gqp); } static inline 
gds_mlx5_exp_send_request_t *to_gds_mexp_send_request(gds_send_request_t *gsreq) { - assert(gsreq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); return (gds_mlx5_exp_send_request_t *)(gsreq); } @@ -64,7 +55,6 @@ static inline const gds_mlx5_exp_send_request_t *to_gds_mexp_send_request(const } static inline gds_mlx5_exp_wait_request_t *to_gds_mexp_wait_request(gds_wait_request_t *gwreq) { - assert(gwreq->dtype == GDS_DRIVER_TYPE_MLX5_EXP); return (gds_mlx5_exp_wait_request_t *)(gwreq); } diff --git a/src/utils.hpp b/src/utils.hpp index a146f60..cd2102e 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -231,22 +231,6 @@ void gds_enable_barrier_for_inlcpy(CUstreamBatchMemOpParams *param); //----------------------------------------------------------------------------- -/* \brief: Get the underlying driver associated with the ibdev. - * - */ -static inline gds_driver_type gds_get_driver_type(struct ibv_device *ibdev) -{ - const char *dev_name = ibv_get_device_name(ibdev); - - // Heuristically guess the driver by the device name. - // Until we find a better way to do so... - if (strstr(dev_name, "mlx5") != NULL) - return GDS_DRIVER_TYPE_MLX5_EXP; - return GDS_DRIVER_TYPE_UNSUPPORTED; -} - -//----------------------------------------------------------------------------- - /* * Local variables: * c-indent-level: 8 From ed703ccf3482c895998601d28c07387d00c34351 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 15 Oct 2021 04:42:31 -0400 Subject: [PATCH 37/50] Added exp-verbs support checking in transport.hpp --- src/transport.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transport.hpp b/src/transport.hpp index d14d9e3..00078cb 100644 --- a/src/transport.hpp +++ b/src/transport.hpp @@ -27,6 +27,7 @@ #pragma once +#include #include #include #include @@ -57,19 +58,28 @@ typedef struct gds_transport { extern gds_transport_t *gds_main_transport; +#if HAVE_EXP_VERBS int gds_transport_mlx5_exp_init(gds_transport_t **transport); +#else +#warning "This library requires exp-verbs." 
+#endif static inline int gds_transport_init() { int status = 0; if (!gds_main_transport) { gds_transport_t *t = NULL; + #if HAVE_EXP_VERBS status = gds_transport_mlx5_exp_init(&t); if (status) { gds_err("error in gds_transport_mlx5_exp_init\n"); goto out; } assert(t); + #else + status = ENOTSUPP; + goto out; + #endif gds_main_transport = t; } out: From 73404832fdd2b2bc6205b5b00470e7b55fe7c8be Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Sun, 17 Oct 2021 19:07:12 -0700 Subject: [PATCH 38/50] Fixed typo in configure.ac --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 34bc650..6f4449c 100644 --- a/configure.ac +++ b/configure.ac @@ -201,7 +201,7 @@ if test "x$have_exp_verbs" != "x" && test "x$have_peer_ops" != "x"; then AC_DEFINE([HAVE_EXP_VERBS], [1], [Define if exp-verbs exists.]) enable_exp_verbs=1 else - AC_MSG_WARN([This version of libgdsync cannot be used with out exp-verbs.]) + AC_MSG_WARN([This version of libgdsync cannot be used without exp-verbs.]) fi AM_CONDITIONAL([COMPILE_EXP_VERBS], [test "x$enable_exp_verbs" != "x"]) From 9986794f49cffc52864e33fcfd43669bb41493ac Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Sun, 17 Oct 2021 19:07:38 -0700 Subject: [PATCH 39/50] Fixed compile errors when compiling on x86 --- src/transport.hpp | 3 ++- src/utils.hpp | 1 + tests/gpu.h | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/transport.hpp b/src/transport.hpp index 00078cb..079f9ce 100644 --- a/src/transport.hpp +++ b/src/transport.hpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -77,7 +78,7 @@ static inline int gds_transport_init() } assert(t); #else - status = ENOTSUPP; + status = ENOTSUP; goto out; #endif gds_main_transport = t; diff --git a/src/utils.hpp b/src/utils.hpp index cd2102e..dccb125 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -31,6 +31,7 @@ #warning "__STDC_FORMAT_MACROS should be defined to pull definition of PRIx64, etc" #endif #include // to pull PRIx64 +#include // internal assert function diff --git a/tests/gpu.h b/tests/gpu.h index 0fde885..77a6db4 100644 --- a/tests/gpu.h +++ b/tests/gpu.h @@ -27,8 +27,18 @@ #pragma once +#include #include +#undef BEGIN_C_DECLS +#undef END_C_DECLS +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif #ifdef USE_PROFILE #include From 1ea8492fb0e5e85a0be36b39f91193d22d4623ca Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Mon, 18 Oct 2021 18:29:52 -0700 Subject: [PATCH 40/50] Added initial implementation of DEVX and DirectVerbs QP/CQ create/destroy --- Makefile.am | 15 + src/mem.cpp | 8 +- src/mem.hpp | 4 +- src/objs.cpp | 50 +- src/objs.hpp | 3 +- src/transport.hpp | 11 +- src/transports/mlx5-dv/mlx5-dv.cpp | 1122 ++++++ src/transports/mlx5-dv/mlx5-dv.hpp | 206 ++ src/transports/mlx5-dv/mlx5_ifc.h | 5499 ++++++++++++++++++++++++++++ src/utils.hpp | 50 +- 10 files changed, 6933 insertions(+), 35 deletions(-) create mode 100644 src/transports/mlx5-dv/mlx5-dv.cpp create mode 100644 src/transports/mlx5-dv/mlx5-dv.hpp create mode 100644 src/transports/mlx5-dv/mlx5_ifc.h diff --git a/Makefile.am b/Makefile.am index 61e9a2d..c0f16cc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -31,6 +31,9 @@ noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/ut if COMPILE_EXP_VERBS src_libgdsync_la_SOURCES += src/transports/mlx5-exp/mlx5-exp.cpp noinst_HEADERS += src/transports/mlx5-exp/mlx5-exp.hpp +else 
+src_libgdsync_la_SOURCES += src/transports/mlx5-dv/mlx5-dv.cpp +noinst_HEADERS += src/transports/mlx5-dv/mlx5-dv.hpp endif # if enabled at configure time @@ -40,8 +43,13 @@ if TEST_ENABLE bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loopback_latency tests/gds_sanity noinst_PROGRAMS = tests/rstest tests/wqtest +if !COMPILE_EXP_VERBS +LDADD=$(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lcuda -lcudart -lmlx5 +endif + tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la $(MPILDFLAGS) $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) +tests_gds_kernel_latency_CFLAGS = -D_GNU_SOURCE $(AM_CFLAGS) tests_rstest_SOURCES = tests/rstest.cpp tests_rstest_LDADD = @@ -57,7 +65,14 @@ tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrap tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) +tests_gds_kernel_loopback_latency_CFLAGS = -D_GNU_SOURCE $(AM_CFLAGS) +if !COMPILE_EXP_VERBS +tests_gds_kernel_latency_LDADD += -lmlx5 +tests_gds_poll_lat_LDADD += -lmlx5 +tests_gds_sanity_LDADD += -lmlx5 +tests_gds_kernel_loopback_latency_LDADD += -lmlx5 +endif SUFFIXES= .cu diff --git a/src/mem.cpp b/src/mem.cpp index 7cf3602..5388694 100644 --- a/src/mem.cpp +++ b/src/mem.cpp @@ -372,7 +372,7 @@ int gds_free_mapped_memory(gds_mem_desc_t *desc) //#define ROUND_TO_GDR_GPU_PAGE(V) ROUND_TO(V, GDR_GPU_PAGE_SIZE) // allocate GPU memory with a GDR mapping (CPU can dereference it) -int gds_peer_malloc_ex(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, void **phandle, bool has_cpu_mapping) +int gds_peer_malloc_ex(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, gds_memory_type_t mem_type, bool has_cpu_mapping, void **phandle) { int ret = 0; // assume GPUs are the only peers!!! 
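The reordered gds_peer_malloc_ex() signature above, and the gds_peer_malloc() wrapper in the next hunk, now take an explicit gds_memory_type_t instead of hard-coding GPU memory. A minimal sketch of a caller under those assumptions (GDS_MEMORY_HOST/GDS_MEMORY_GPU as the selector values used elsewhere in this series, gds_peer_mfree() from src/mem.hpp as the matching release); this is an illustration only, not part of the patch:

/* Hypothetical caller of the extended allocator (illustration only).
 * Assumes gds_memory_type_t and the mem.hpp prototypes from this series.
 */
#include <cuda.h>
#include "mem.hpp"

static int example_alloc_peer_buf(int gpu_id, size_t len)
{
        void *host_addr = NULL;
        CUdeviceptr peer_addr = 0;
        void *handle = NULL;
        /* GDS_MEMORY_HOST places the buffer in system memory;
         * GDS_MEMORY_GPU would request a GDR-mapped GPU buffer instead. */
        int ret = gds_peer_malloc(gpu_id, 0, &host_addr, &peer_addr,
                                  len, GDS_MEMORY_HOST, &handle);
        if (ret)
                return ret;
        /* ... hand host_addr/peer_addr to the HCA ... */
        return gds_peer_mfree(gpu_id, 0, host_addr, handle);
}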
@@ -425,7 +425,7 @@ int gds_peer_malloc_ex(int peer_id, uint64_t peer_data, void **host_addr, CUdevi goto out; } - ret = gds_alloc_mapped_memory(desc, size, GDS_MEMORY_GPU); + ret = gds_alloc_mapped_memory(desc, size, mem_type); if (ret) { gds_err("error %d while allocating mapped GPU buffers\n", ret); goto out; @@ -447,9 +447,9 @@ int gds_peer_malloc_ex(int peer_id, uint64_t peer_data, void **host_addr, CUdevi //----------------------------------------------------------------------------- -int gds_peer_malloc(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, void **phandle) +int gds_peer_malloc(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, gds_memory_type_t mem_type, void **phandle) { - return gds_peer_malloc_ex(peer_id, peer_data, host_addr, peer_addr, req_size, phandle, true); + return gds_peer_malloc_ex(peer_id, peer_data, host_addr, peer_addr, req_size, mem_type, true, phandle); } //----------------------------------------------------------------------------- diff --git a/src/mem.hpp b/src/mem.hpp index 639f9a1..5ffc73d 100644 --- a/src/mem.hpp +++ b/src/mem.hpp @@ -1,8 +1,8 @@ #pragma once int gds_peer_mfree(int peer_id, uint64_t peer_data, void *host_addr, void *handle); -int gds_peer_malloc(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, void **phandle); -int gds_peer_malloc_ex(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, void **phandle, bool has_cpu_mapping); +int gds_peer_malloc(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, gds_memory_type_t mem_type, void **phandle); +int gds_peer_malloc_ex(int peer_id, uint64_t peer_data, void **host_addr, CUdeviceptr *peer_addr, size_t req_size, gds_memory_type_t mem_type, bool has_cpu_mapping, void **phandle); diff --git a/src/objs.cpp b/src/objs.cpp index 475c9d3..3c04ca4 100644 --- a/src/objs.cpp +++ b/src/objs.cpp @@ -50,14 +50,14 @@ using namespace std; //----------------------------------------------------------------------------- -gds_buf *gds_peer::alloc(size_t sz, uint32_t alignment) +gds_buf *gds_peer::alloc(size_t sz, uint32_t alignment, gds_memory_type_t mem_type) { // TODO: support alignment // TODO: handle exception here gds_buf *buf = new gds_buf(this, sz); if (!buf) return buf; - int ret = gds_peer_malloc(gpu_id, 0, &buf->addr, &buf->peer_addr, buf->length, &buf->handle); + int ret = gds_peer_malloc(gpu_id, 0, &buf->addr, &buf->peer_addr, buf->length, mem_type, &buf->handle); if (ret) { delete buf; buf = NULL; @@ -71,23 +71,29 @@ gds_buf *gds_peer::buf_alloc_cq(size_t length, uint32_t dir, uint32_t alignment, gds_buf *buf = NULL; switch (dir) { case (GDS_PEER_DIRECTION_FROM_HCA|GDS_PEER_DIRECTION_TO_PEER|GDS_PEER_DIRECTION_TO_CPU): - // CQ buf - if (GDS_ALLOC_CQ_ON_GPU == (flags & GDS_ALLOC_CQ_MASK)) { - gds_dbg("allocating CQ on GPU mem\n"); - buf = alloc(length, alignment); - } else { - gds_dbg("allocating CQ on Host mem\n"); - } - break; + // CQ dbrec + if (GDS_ALLOC_CQ_DBREC_ON_GPU == (flags & GDS_ALLOC_CQ_DBREC_MASK)) { + gds_dbg("allocating CQ DBREC on GPU mem\n"); + buf = alloc(length, alignment, GDS_MEMORY_GPU); + } + + // CQ buf + if (GDS_ALLOC_CQ_ON_GPU == (flags & GDS_ALLOC_CQ_MASK)) { + gds_dbg("allocating CQ buf on GPU mem\n"); + buf = alloc(length, alignment, GDS_MEMORY_GPU); + } + break; case (GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_CPU): // CQ peer buf, helper buffer // on SYSMEM 
for the near future
                // GPU does a store to the 'busy' field as part of the peek_cq task
                // CPU polls on that field
                gds_dbg("allocating CQ peer buf on Host mem\n");
+                buf = alloc(length, alignment, GDS_MEMORY_HOST);
                break;
        case (GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_HCA):
                gds_dbg("allocating CQ dbrec on Host mem\n");
+                buf = alloc(length, alignment, GDS_MEMORY_HOST);
                break;
        default:
                gds_err("unexpected dir 0x%x\n", dir);
@@ -101,14 +107,22 @@
        gds_buf *buf = NULL;
        switch (dir) {
        case GDS_PEER_DIRECTION_FROM_PEER|GDS_PEER_DIRECTION_TO_HCA:
-                // dbrec
-                if (GDS_ALLOC_DBREC_ON_GPU == (flags & GDS_ALLOC_DBREC_MASK)) {
-                        gds_dbg("allocating DBREC on GPU mem\n");
-                        buf = alloc(length, alignment);
-                } else {
-                        gds_dbg("allocating DBREC on Host mem\n");
-                }
-                break;
+                // dbrec
+                if (GDS_ALLOC_WQ_DBREC_ON_GPU == (flags & GDS_ALLOC_WQ_DBREC_MASK)) {
+                        gds_dbg("allocating WQ DBREC on GPU mem\n");
+                        buf = alloc(length, alignment, GDS_MEMORY_GPU);
+                } else {
+                        gds_dbg("allocating WQ DBREC on Host mem\n");
+                }
+
+                // WQ
+                if (GDS_ALLOC_WQ_ON_GPU == (flags & GDS_ALLOC_WQ_MASK)) {
+                        gds_dbg("allocating WQ buf on GPU mem\n");
+                        buf = alloc(length, alignment, GDS_MEMORY_GPU);
+                } else {
+                        gds_dbg("allocating WQ buf on Host mem\n");
+                }
+                break;
        default:
                gds_err("unexpected dir=%08x\n", dir);
                break;
diff --git a/src/objs.hpp b/src/objs.hpp
index 71f4ea4..48e408e 100644
--- a/src/objs.hpp
+++ b/src/objs.hpp
@@ -242,6 +242,7 @@ struct gds_peer {
        unsigned max_batch_size;
        gds_peer_attr attr;
        task_queue *tq;
+        void *opaque;
        enum obj_type { NONE, CQ, WQ, N_IBV_OBJS } alloc_type;
        // This field works as a ugly run-time parameters passing
@@ -259,7 +260,7 @@ struct gds_peer {
        // unregister all kinds of memory
        void unregister(gds_range *range);
-        gds_buf *alloc(size_t length, uint32_t alignment);
+        gds_buf *alloc(size_t length, uint32_t alignment, gds_memory_type_t mem_type);
        gds_buf *buf_alloc_cq(size_t length, uint32_t dir, uint32_t alignment, int flags);
        gds_buf *buf_alloc_wq(size_t length, uint32_t dir, uint32_t alignment, int flags);
        gds_buf *buf_alloc(obj_type type, size_t length, uint32_t dir, uint32_t alignment, int flags);
diff --git a/src/transport.hpp b/src/transport.hpp
index 079f9ce..98a62f6 100644
--- a/src/transport.hpp
+++ b/src/transport.hpp
@@ -62,7 +62,7 @@ extern gds_transport_t *gds_main_transport;
 #if HAVE_EXP_VERBS
 int gds_transport_mlx5_exp_init(gds_transport_t **transport);
 #else
-#warning "This library requires exp-verbs."
+int gds_transport_mlx5_dv_init(gds_transport_t **transport);
 #endif
 
 static inline int gds_transport_init()
@@ -72,15 +72,14 @@ static inline gds_transport_init()
                gds_transport_t *t = NULL;
                #if HAVE_EXP_VERBS
                status = gds_transport_mlx5_exp_init(&t);
+                #else
+                status = gds_transport_mlx5_dv_init(&t);
+                #endif
                if (status) {
-                        gds_err("error in gds_transport_mlx5_exp_init\n");
+                        gds_err("error in gds_transport_init\n");
                        goto out;
                }
                assert(t);
-                #else
-                status = ENOTSUP;
-                goto out;
-                #endif
                gds_main_transport = t;
        }
 out:
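Both backends now sit behind the same gds_transport_t vtable: gds_transport_init() installs exactly one of them at first use, and the API layer dispatches through gds_main_transport (compare the gds_transport_init() checks added to src/apis.cpp earlier in this series). A minimal sketch of that calling pattern, assuming the create_qp member wired up in gds_transport_mlx5_dv_init() further down; illustration only, not part of the patch:

/* Illustration only: dispatch through the lazily-installed transport. */
#include "transport.hpp"

static int example_create_qp(struct ibv_pd *pd, struct ibv_context *ctx,
                             gds_qp_init_attr_t *attr, gds_peer *peer,
                             gds_peer_attr *peer_attr, int flags,
                             gds_qp_t **out_qp)
{
        int ret = gds_transport_init(); /* mlx5-exp or mlx5-dv, chosen at build time */
        if (ret)
                return ret;
        return gds_main_transport->create_qp(pd, ctx, attr, peer, peer_attr,
                                             flags, out_qp);
}

diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp
new file mode 100644
index 0000000..9a39613
--- /dev/null
+++ b/src/transports/mlx5-dv/mlx5-dv.cpp
@@ -0,0 +1,1122 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 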
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gdsync.h" +#include "gdsync/mlx5.h" +#include "gdsync/tools.h" + +#include "archutils.h" +#include "mem.hpp" +#include "memmgr.hpp" +#include "mlx5-dv.hpp" +#include "mlnxutils.h" +#include "objs.hpp" +#include "utils.hpp" +#include "transport.hpp" +#include "mlx5_ifc.h" + +//----------------------------------------------------------------------------- + +static int gds_mlx5_dv_create_cq( + struct ibv_context *context, int cqe, + void *cq_context, struct ibv_comp_channel *channel, + int comp_vector, struct ibv_pd *pd, + gds_peer_attr *peer_attr, int alloc_flags, + gds_mlx5_dv_cq_t **out_mcq +) +{ + int ret = 0; + + bool register_peer_buf = false; + bool register_peer_dbr = false; + + struct ibv_cq_ex *ibcq_ex = NULL; + struct ibv_cq *ibcq = NULL; + gds_mlx5_dv_cq_t *mcq = NULL; + gds_cq_t *gcq; + + gds_mlx5_dv_cq_peer_t *mcq_peer = NULL; + + mlx5dv_obj dv_obj; + + gds_peer *peer = NULL; + + struct ibv_cq_init_attr_ex cq_attr = { + .cqe = (uint32_t)cqe, + .cq_context = cq_context, + .channel = channel, + .comp_vector = (uint32_t)comp_vector, + .wc_flags = IBV_WC_STANDARD_FLAGS, + .comp_mask = (uint32_t)IBV_CQ_INIT_ATTR_MASK_PD, + .flags = 0, + .parent_domain = pd + }; + + assert(peer_attr); + + mcq = (gds_mlx5_dv_cq_t *)calloc(1, sizeof(gds_mlx5_dv_cq_t)); + if (!mcq) { + gds_err("cannot allocate memory\n"); + ret = ENOMEM; + goto err; + } + + mcq_peer = (gds_mlx5_dv_cq_peer_t *)calloc(1, sizeof(gds_mlx5_dv_cq_peer_t)); + if (!mcq_peer) { + gds_err("cannot allocate memory\n"); + ret = ENOMEM; + goto err; + } + mcq_peer->peer_attr = peer_attr; + + mcq->cq_peer = mcq_peer; + + peer = peer_from_id(peer_attr->peer_id); + + // Setup peer allocation + peer->alloc_type = gds_peer::CQ; + peer->alloc_flags = alloc_flags; + // mcq_peer will be filled if we do allocation on device. + // pd_mem_alloc is responsible for the registration. 
+        peer->opaque = mcq_peer;
+
+        ibcq_ex = mlx5dv_create_cq(context, &cq_attr, NULL);
+        if (!ibcq_ex) {
+                gds_err("error in mlx5dv_create_cq\n");
+                ret = EINVAL;
+                goto err;
+        }
+        ibcq = ibv_cq_ex_to_cq(ibcq_ex);
+
+        dv_obj.cq.in = ibcq;
+        dv_obj.cq.out = &mcq->dvcq;
+        ret = mlx5dv_init_obj(&dv_obj, MLX5DV_OBJ_CQ);
+        if (ret) {
+                gds_err("error %d in mlx5dv_init_obj MLX5DV_OBJ_CQ\n", ret);
+                ret = EINVAL;
+                goto err;
+        }
+
+        // If va_id is not set, pd_mem_alloc did not allocate the buffer on device.
+        // Hence, we register the buffer to the device here.
+        if (!mcq_peer->buf.va_id) {
+                if (alloc_flags & GDS_ALLOC_CQ_ON_GPU)
+                        gds_err("Cannot alloc CQ buf on GPU. Falling back to host.\n");
+                mcq_peer->buf.va_id = peer_attr->register_va(
+                        mcq->dvcq.buf,
+                        (uint64_t)mcq->dvcq.cqe_cnt * (uint64_t)mcq->dvcq.cqe_size,
+                        peer_attr->peer_id,
+                        NULL
+                );
+                if (!mcq_peer->buf.va_id) {
+                        gds_err("error in peer_attr->register_va\n");
+                        ret = EINVAL;
+                        goto err;
+                }
+                register_peer_buf = true;
+        }
+
+        if (!mcq_peer->dbr.va_id) {
+                if (alloc_flags & GDS_ALLOC_CQ_DBREC_ON_GPU)
+                        gds_err("Cannot alloc CQ DBREC on GPU. Falling back to host.\n");
+                mcq_peer->dbr.va_id = peer_attr->register_va(
+                        mcq->dvcq.dbrec,
+                        GDS_MLX5_DV_DBR_BUF_SIZE,
+                        peer_attr->peer_id,
+                        NULL
+                );
+                if (!mcq_peer->dbr.va_id) {
+                        gds_err("error in dbr register_va\n");
+                        ret = EINVAL;
+                        goto err;
+                }
+                register_peer_dbr = true;
+        }
+
+        mcq_peer->pdata.peek_table = (struct gds_mlx5_dv_peek_entry **)malloc(sizeof(struct gds_mlx5_dv_peek_entry *) * mcq->dvcq.cqe_cnt);
+        if (!mcq_peer->pdata.peek_table) {
+                gds_err("error %d in malloc peer_peek_table\n", errno);
+                ret = ENOMEM;
+                goto err;
+        }
+        memset(mcq_peer->pdata.peek_table, 0, sizeof(struct gds_mlx5_dv_peek_entry *) * mcq->dvcq.cqe_cnt);
+        mcq_peer->pdata.dir = GDS_PEER_DIRECTION_FROM_PEER | GDS_PEER_DIRECTION_TO_CPU;
+
+        mcq_peer->pdata.gbuf = peer->buf_alloc(
+                peer->alloc_type,
+                sizeof(struct gds_mlx5_dv_peek_entry) * mcq->dvcq.cqe_cnt,
+                mcq_peer->pdata.dir,
+                (uint32_t)sysconf(_SC_PAGESIZE),
+                peer->alloc_flags
+        );
+        if (!mcq_peer->pdata.gbuf) {
+                gds_err("error %d in buf_alloc\n", errno);
+                ret = ENOMEM;
+                goto err;
+        }
+
+        mcq_peer->pdata.va_id = peer_attr->register_va(
+                mcq_peer->pdata.gbuf->addr,
+                mcq_peer->pdata.gbuf->length,
+                peer_attr->peer_id,
+                mcq_peer->pdata.gbuf
+        );
+        if (!mcq_peer->pdata.va_id) {
+                gds_err("error %d in register_va\n", errno);
+                ret = EINVAL;
+                goto err;
+        }
+
+        memset(mcq_peer->pdata.gbuf->addr, 0, mcq_peer->pdata.gbuf->length);
+
+        mcq_peer->pdata.peek_free = (struct gds_mlx5_dv_peek_entry *)mcq_peer->pdata.gbuf->addr;
+        for (int i = 0; i < mcq->dvcq.cqe_cnt - 1; ++i)
+                mcq_peer->pdata.peek_free[i].next = i + 1;
+        mcq_peer->pdata.peek_free[mcq->dvcq.cqe_cnt - 1].next = GDS_MLX5_DV_LAST_PEEK_ENTRY;
+
+        mcq->gcq.cq = ibcq;
+        *out_mcq = mcq;
+
+        return 0;
+
+err:
+        if (mcq_peer) {
+                if (mcq_peer->pdata.va_id)
+                        peer_attr->unregister_va(mcq_peer->pdata.va_id, peer_attr->peer_id);
+
+                if (mcq_peer->pdata.gbuf)
+                        peer_attr->buf_release(mcq_peer->pdata.gbuf);
+
+                if (mcq_peer->pdata.peek_table)
+                        free(mcq_peer->pdata.peek_table);
+
+                if (register_peer_buf)
+                        peer_attr->unregister_va(mcq_peer->buf.va_id, peer_attr->peer_id);
+
+                if (register_peer_dbr)
+                        peer_attr->unregister_va(mcq_peer->dbr.va_id, peer_attr->peer_id);
+        }
+
+        if (ibcq)
+                ibv_destroy_cq(ibcq);
+
+        if (mcq_peer)
+                free(mcq_peer);
+
+        if (mcq)
+                free(mcq);
+
+        return ret;
+}
+
+//-----------------------------------------------------------------------------
+
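The peek entries initialized above form an index-linked free list in peer-visible memory: next holds the index of the following free entry, GDS_MLX5_DV_LAST_PEEK_ENTRY terminates the list, and busy carries the GPU/CPU handshake described in objs.cpp (the GPU stores to it as part of the peek_cq task while the CPU polls it). A hypothetical pop helper, shown for orientation only; it is not part of this patch and the busy convention is assumed from that comment:

/* Hypothetical helper (illustration only): pop one entry from the
 * index-linked free list built in gds_mlx5_dv_create_cq(). */
static gds_mlx5_dv_peek_entry_t *
peek_entry_pop(gds_mlx5_dv_cq_peer_t *cq_peer)
{
        gds_mlx5_dv_peek_entry_t *base =
                (gds_mlx5_dv_peek_entry_t *)cq_peer->pdata.gbuf->addr;
        gds_mlx5_dv_peek_entry_t *e = cq_peer->pdata.peek_free;
        if (!e)
                return NULL;
        cq_peer->pdata.peek_free =
                (e->next == GDS_MLX5_DV_LAST_PEEK_ENTRY) ? NULL : base + e->next;
        e->busy = 1;    /* assumed handshake: peer stores here when the CQE is tracked */
        return e;
}

+static void 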
gds_mlx5_dv_destroy_cq(gds_mlx5_dv_cq_t *mcq) +{ + int status = 0; + gds_mlx5_dv_cq_peer_t *mcq_peer = mcq->cq_peer; + + if (mcq->wq) + mcq->wq = NULL; + + if (mcq_peer && mcq_peer->pdata.peek_table) { + free(mcq_peer->pdata.peek_table); + mcq_peer->pdata.peek_table = NULL; + } + + if (mcq_peer && mcq_peer->peer_attr) { + gds_peer_attr *peer_attr = mcq_peer->peer_attr; + gds_peer *peer = peer_from_id(peer_attr->peer_id); + + // This may be used by ibv_destroy_cq, which eventually calls pd_mem_free. + peer->alloc_type = gds_peer::CQ; + peer->opaque = mcq_peer; + + // gbuf has value iff pd_mem_alloc handled the allocation. + // In that case, leave the deallocation to pd_mem_free. + if (mcq_peer->buf.va_id && mcq_peer->buf.gbuf == NULL) { + peer_attr->unregister_va(mcq_peer->buf.va_id, peer_attr->peer_id); + mcq_peer->buf.va_id = 0; + } + if (mcq_peer->dbr.va_id && mcq_peer->dbr.gbuf == NULL) { + peer_attr->unregister_va(mcq_peer->dbr.va_id, peer_attr->peer_id); + mcq_peer->dbr.va_id = 0; + } + if (mcq_peer->pdata.va_id) { + peer_attr->unregister_va(mcq_peer->pdata.va_id, peer_attr->peer_id); + mcq_peer->pdata.va_id = 0; + } + if (mcq_peer->pdata.gbuf) { + peer_attr->buf_release(mcq_peer->pdata.gbuf); + mcq_peer->pdata.gbuf = NULL; + } + } + + if (mcq->gcq.cq) { + status = ibv_destroy_cq(mcq->gcq.cq); + if (status) { + gds_err("error %d in ibv_destroy\n", status); + return; + } + mcq->gcq.cq = NULL; + } + + if (mcq_peer) { + free(mcq_peer); + mcq->cq_peer = NULL; + } + + free(mcq); +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_dv_create_qp( + struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, + gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp +) +{ + int status = 0; + + gds_mlx5_dv_qp_t *mdqp = NULL; + struct ibv_qp *ibqp = NULL; + + gds_mlx5_dv_cq_t *tx_mcq = NULL; + gds_mlx5_dv_cq_t *rx_mcq = NULL; + + uint32_t alignment; + + struct mlx5dv_devx_uar *uar = NULL; + uint64_t uar_range_id = 0; + uint8_t log_bf_reg_size; + size_t bf_reg_size; + + int max_tx; + int max_rx; + + size_t wqe_size; + struct mlx5dv_devx_umem *wq_umem = NULL; + size_t wq_buf_size; + gds_buf *wq_buf = NULL; + uint64_t wq_buf_range_id = 0; + + struct mlx5dv_devx_umem *dbr_umem = NULL; + size_t dbr_buf_size; + gds_buf *dbr_buf = NULL; + uint64_t dbr_buf_range_id = 0; + + mlx5dv_obj dv_obj; + struct mlx5dv_pd dvpd = {0,}; + uint64_t dv_obj_type = 0; + + uint8_t cmd_in[DEVX_ST_SZ_BYTES(create_qp_in)] = {0,}; + uint8_t cmd_out[DEVX_ST_SZ_BYTES(create_qp_out)] = {0,}; + + uint8_t cmd_cap_in[DEVX_ST_SZ_BYTES(query_hca_cap_in)] = {0,}; + uint8_t cmd_cap_out[DEVX_ST_SZ_BYTES(query_hca_cap_out)] = {0,}; + + void *qpc; + + gds_mlx5_dv_qp_peer_t *mqp_peer = NULL; + struct ibv_srq *srq = NULL; + bool is_internal_srq = false; + + struct mlx5dv_devx_obj *devx_obj = NULL; + + uint32_t qpn; + + gds_mlx5_dv_qp_type_t gmlx_qpt = GDS_MLX5_DV_QP_TYPE_UNKNOWN; + + assert(pd); + assert(context); + assert(qp_attr); + assert(peer); + assert(peer_attr); + + srq = qp_attr->srq; + + if (qp_attr->qp_type == IBV_QPT_RC) + gmlx_qpt = GDS_MLX5_DV_QP_TYPE_RC; + #if 0 + else if (qp_attr->qp_type == IBV_QPT_DRIVER) + gmlx_qpt = (qp_attr->dc_init_attr.dc_type == MLX5DV_DCTYPE_DCT) ? 
GDS_MLX5_QP_TYPE_DCT : GDS_MLX5_QP_TYPE_DCI; + #endif + + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_UNKNOWN) { + gds_err("The requested QP type is not supported.\n"); + status = EINVAL; + goto out; + } + + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_DCT) { + gds_err("DCT QP type is not supported.\n"); + status = EINVAL; + goto out; + } + + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) { + if (qp_attr->cap.max_send_sge != 1 || qp_attr->cap.max_recv_sge != 1) { + gds_err("Both cap.max_send_sge and cap.max_recv_sge must be 1.\n"); + status = EINVAL; + goto out; + } + } + else if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_DCI) { + if (qp_attr->cap.max_send_sge != 1) { + gds_err("cap.max_send_sge must be 1.\n"); + status = EINVAL; + goto out; + } + } + + mdqp = (gds_mlx5_dv_qp_t *)calloc(1, sizeof(gds_mlx5_dv_qp_t)); + if (!mdqp) { + gds_err("cannot allocate mdqp\n"); + status = ENOMEM; + goto out; + } + + ibqp = (struct ibv_qp *)calloc(1, sizeof(struct ibv_qp)); + if (!ibqp) { + gds_err("cannot allocate ibqp\n"); + status = ENOMEM; + goto out; + } + + status = gds_mlx5_dv_create_cq( + context, qp_attr->cap.max_send_wr, NULL, NULL, 0, pd, peer_attr, + (gds_alloc_cq_flags_t)((flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? (GDS_ALLOC_CQ_ON_GPU | GDS_ALLOC_CQ_DBREC_ON_GPU) : (GDS_ALLOC_CQ_DEFAULT | GDS_ALLOC_CQ_DBREC_DEFAULT)), + &tx_mcq + ); + if (status) { + gds_err("Error in creating tx cq\n"); + goto out; + } + + status = gds_mlx5_dv_create_cq( + context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, pd, peer_attr, + (gds_alloc_cq_flags_t)((flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? (GDS_ALLOC_CQ_ON_GPU | GDS_ALLOC_CQ_DBREC_ON_GPU) : (GDS_ALLOC_CQ_DEFAULT | GDS_ALLOC_CQ_DBREC_DEFAULT)), + &rx_mcq + ); + if (status) { + gds_err("Error in creating rx cq\n"); + goto out; + } + + DEVX_SET(query_hca_cap_in, cmd_cap_in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, cmd_cap_in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR + ); + + status = mlx5dv_devx_general_cmd(context, cmd_cap_in, sizeof(cmd_cap_in), cmd_cap_out, sizeof(cmd_cap_out)); + if (status) { + gds_err("Error in mlx5dv_devx_general_cmd for HCA CAP.\n"); + goto out; + } + + srq = qp_attr->srq; + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_DCI && !srq) { + struct ibv_srq_init_attr srq_init_attr = {0,}; + srq_init_attr.attr.max_wr = qp_attr->cap.max_recv_wr; + srq_init_attr.attr.max_sge = qp_attr->cap.max_recv_sge; + + mqp_peer = (gds_mlx5_dv_qp_peer_t *)calloc(1, sizeof(gds_mlx5_dv_qp_peer_t)); + if (!mqp_peer) { + gds_err("Cannot allocate memory for mqp_peer.\n"); + status = ENOMEM; + goto out; + } + mqp_peer->peer_attr = peer_attr; + + peer->alloc_type = gds_peer::WQ; + peer->alloc_flags = flags; + // mqp_peer will be filled if we do allocation on device. + // pd_mem_alloc is responsible for the registration. + peer->opaque = mqp_peer; + + srq = ibv_create_srq(pd, &srq_init_attr); + if (!srq) { + status = errno; + gds_err("Error in ibv_create_srq with errno=%d.\n", errno); + goto out; + } + qp_attr->srq = srq; + is_internal_srq = true; + } + + log_bf_reg_size = DEVX_GET(query_hca_cap_out, cmd_cap_out, capability.cmd_hca_cap.log_bf_reg_size); + + // The size of 1st + 2nd half (as when we use alternating DB) + bf_reg_size = 1LLU << log_bf_reg_size; + + // Allocate UAR. This will be used as a DB/BF register). 
+        uar = mlx5dv_devx_alloc_uar(context, GDS_MLX5_DV_UAR_ALLOC_TYPE_BF);
+        if (!uar) {
+                gds_err("Error in mlx5dv_devx_alloc_uar\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        uar_range_id = peer_attr->register_va(
+                uar->reg_addr,
+                bf_reg_size,
+                peer_attr->peer_id,
+                GDS_PEER_IOMEMORY
+        );
+        if (!uar_range_id) {
+                gds_err("Error in peer_attr->register_va for BF\n");
+                status = EINVAL;
+                goto out;
+        }
+
+        // In GPUVerbs, a WQE uses at most four 16-byte segments, so it fits in one basic block.
+        wqe_size = MLX5_SEND_WQE_BB; // 64 bytes
+        max_tx = GDS_ROUND_UP_POW2_OR_0(qp_attr->cap.max_send_wr);
+        max_rx = (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) ? GDS_ROUND_UP_POW2_OR_0(qp_attr->cap.max_recv_wr) : 0;
+        wq_buf_size = (max_tx + max_rx) * wqe_size;
+
+        // Allocate WQ buffer.
+        alignment = (uint32_t)((flags & GDS_ALLOC_WQ_ON_GPU) ? GDS_GPU_PAGE_SIZE : sysconf(_SC_PAGESIZE));
+        wq_buf = peer->alloc(wq_buf_size, alignment, (flags & GDS_ALLOC_WQ_ON_GPU) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST);
+        if (!wq_buf) {
+                gds_err("Error in peer->alloc of wq_buf.\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        wq_umem = mlx5dv_devx_umem_reg(context, wq_buf->addr, wq_buf_size, 0);
+        if (!wq_umem) {
+                gds_err("Error in mlx5dv_devx_umem_reg for WQ\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        wq_buf_range_id = peer_attr->register_va(wq_buf->addr, wq_buf_size, peer_attr->peer_id, wq_buf);
+        if (!wq_buf_range_id) {
+                gds_err("Error in peer_attr->register_va for WQ\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        // Allocate DBR buffer.
+        alignment = (uint32_t)((flags & GDS_ALLOC_WQ_DBREC_ON_GPU) ? GDS_GPU_PAGE_SIZE : sysconf(_SC_PAGESIZE));
+        dbr_buf_size = GDS_MLX5_DV_DBR_BUF_SIZE;
+
+        dbr_buf = peer->alloc(dbr_buf_size, alignment, (flags & GDS_ALLOC_WQ_DBREC_ON_GPU) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST);
+        if (!dbr_buf) {
+                gds_err("Error in peer->alloc of dbr_buf.\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        dbr_umem = mlx5dv_devx_umem_reg(context, dbr_buf->addr, dbr_buf_size, 0);
+        if (!dbr_umem) {
+                gds_err("Error in mlx5dv_devx_umem_reg for DBR\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        dbr_buf_range_id = peer_attr->register_va(dbr_buf->addr, dbr_buf_size, peer_attr->peer_id, dbr_buf);
+        if (!dbr_buf_range_id) {
+                gds_err("Error in peer_attr->register_va for DBR\n");
+                status = ENOMEM;
+                goto out;
+        }
+
+        // Query more PD info with Direct-Verbs.
+        dv_obj.pd.in = pd;
+        dv_obj.pd.out = &dvpd;
+        dv_obj_type = MLX5DV_OBJ_PD;
+        if (srq) {
+                dv_obj.srq.in = srq;
+                dv_obj.srq.out = &mdqp->dvsrq;
+                mdqp->dvsrq.comp_mask = MLX5DV_SRQ_MASK_SRQN;
+                dv_obj_type |= MLX5DV_OBJ_SRQ;
+        }
+        status = mlx5dv_init_obj(&dv_obj, dv_obj_type);
+        if (status) {
+                gds_err("Error in mlx5dv_init_obj\n");
+                goto out;
+        }
+
+        DEVX_SET(create_qp_in, cmd_in, opcode, MLX5_CMD_OP_CREATE_QP);
+        DEVX_SET(create_qp_in, cmd_in, wq_umem_id, wq_umem->umem_id); // WQ buffer
+
+        qpc = DEVX_ADDR_OF(create_qp_in, cmd_in, qpc);
+        DEVX_SET(qpc, qpc, st, (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) ? 
GDS_MLX5_DV_QPC_ST_RC : GDS_MLX5_DV_QPC_ST_DCI); + DEVX_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + DEVX_SET(qpc, qpc, pd, dvpd.pdn); + DEVX_SET(qpc, qpc, uar_page, uar->page_id); // BF register + if (srq) { + if (!(mdqp->dvsrq.comp_mask & MLX5DV_SRQ_MASK_SRQN)) { + status = EIO; + gds_err("mlx5dv_init_obj does not return SRQ number!\n"); + goto out; + } + DEVX_SET(qpc, qpc, rq_type, GDS_MLX5_DV_QPC_RQ_TYPE_SRQ); + DEVX_SET(qpc, qpc, srqn_rmpn_xrqn, mdqp->dvsrq.srqn); + } + else { + DEVX_SET(qpc, qpc, rq_type, GDS_MLX5_DV_QPC_RQ_TYPE_REGULAR); + DEVX_SET(qpc, qpc, srqn_rmpn_xrqn, 0); + } + DEVX_SET(qpc, qpc, cqn_snd, tx_mcq->dvcq.cqn); + DEVX_SET(qpc, qpc, cqn_rcv, rx_mcq->dvcq.cqn); + DEVX_SET(qpc, qpc, log_sq_size, GDS_ILOG2_OR0(max_tx)); + DEVX_SET(qpc, qpc, cs_req, 0); // Disable CS Request + DEVX_SET(qpc, qpc, cs_res, 0); // Disable CS Respond + DEVX_SET(qpc, qpc, dbr_umem_valid, 0x1); // Enable dbr_umem_id + DEVX_SET64(qpc, qpc, dbr_addr, 0); // Offset 0 of dbr_umem_id (behavior changed because of dbr_umem_valid) + DEVX_SET(qpc, qpc, dbr_umem_id, dbr_umem->umem_id); // DBR buffer + DEVX_SET(qpc, qpc, user_index, 0); + DEVX_SET(qpc, qpc, page_offset, 0); + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) + DEVX_SET(qpc, qpc, log_rq_size, GDS_ILOG2_OR0(max_rx)); + + devx_obj = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out)); + if (!devx_obj) { + gds_err("Error in mlx5dv_devx_obj_create for qp\n"); + status = EIO; + goto out; + } + + qpn = DEVX_GET(create_qp_out, cmd_out, qpn); + + mdqp->devx_qp = devx_obj; + mdqp->qp_type = gmlx_qpt; + + mdqp->is_internal_srq = is_internal_srq; + mdqp->qp_peer = mqp_peer; + + mdqp->wq_buf = wq_buf; + mdqp->wq_umem = wq_umem; + mdqp->wq_va_id = wq_buf_range_id; + + mdqp->dbr_buf = dbr_buf; + mdqp->dbr_umem = dbr_umem; + mdqp->dbr_va_id = dbr_buf_range_id; + + mdqp->bf_uar = uar; + mdqp->bf_size = bf_reg_size / 2; + mdqp->bf_va_id = uar_range_id; + + mdqp->sq_cnt = max_tx; + mdqp->rq_cnt = max_rx; + + mdqp->rq_buf_offset = 0; + // Assume 1 recv sge if RC and no wq_sig + mdqp->sq_buf_offset = (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) ? MAX(max_rx * sizeof(struct mlx5_wqe_data_seg), GDS_MLX5_DV_SEND_WQE_BB) : 0; + + mdqp->peer_attr = peer_attr; + + mdqp->gqp.send_cq = &tx_mcq->gcq; + mdqp->gqp.recv_cq = &rx_mcq->gcq; + + ibqp->context = context; + ibqp->pd = pd; + ibqp->send_cq = tx_mcq->gcq.cq; + ibqp->recv_cq = rx_mcq->gcq.cq; + if (srq) + ibqp->srq = srq; + ibqp->qp_num = qpn; + ibqp->state = IBV_QPS_RESET; + ibqp->qp_type = qp_attr->qp_type; + + mdqp->gqp.qp = ibqp; + + *gqp = &mdqp->gqp; + +out: + // Failed. Cleaning up. 
+ if (status) { + if (devx_obj) + mlx5dv_devx_obj_destroy(devx_obj); + + if (dbr_buf_range_id) + peer_attr->unregister_va(dbr_buf_range_id, peer_attr->peer_id); + + if (dbr_umem) + mlx5dv_devx_umem_dereg(dbr_umem); + + if (dbr_buf) + peer->free(dbr_buf); + + if (wq_buf_range_id) + peer_attr->unregister_va(wq_buf_range_id, peer_attr->peer_id); + + if (wq_umem) + mlx5dv_devx_umem_dereg(wq_umem); + + if (wq_buf) + peer->free(wq_buf); + + if (uar_range_id) + peer_attr->unregister_va(uar_range_id, peer_attr->peer_id); + + if (uar) + mlx5dv_devx_free_uar(uar); + + if (is_internal_srq && srq) + ibv_destroy_srq(srq); + + if (mqp_peer) + free(mqp_peer); + + if (rx_mcq) + gds_mlx5_dv_destroy_cq(rx_mcq); + + if (tx_mcq) + gds_mlx5_dv_destroy_cq(tx_mcq); + + if (ibqp) + free(ibqp); + + if (mdqp) + free(mdqp); + } + return status; +} + +//----------------------------------------------------------------------------- + +static int gds_mlx5_dv_modify_qp_rst2init(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_attr *attr, int attr_mask) +{ + int status = 0; + + uint8_t cmd_in[DEVX_ST_SZ_BYTES(rst2init_qp_in)] = {0,}; + uint8_t cmd_out[DEVX_ST_SZ_BYTES(rst2init_qp_out)] = {0,}; + + void *qpc; + + assert(attr->qp_state == IBV_QPS_INIT); + if (mdqp->gqp.qp->state != IBV_QPS_RESET) { + gds_err("Incorrect current QP state.\n"); + status = EINVAL; + goto out; + } + + if (!(attr_mask & IBV_QP_PORT)) { + gds_err("IBV_QP_PORT is required.\n"); + status = EINVAL; + goto out; + } + + if (!(attr_mask & IBV_QP_PKEY_INDEX)) { + gds_err("IBV_QP_PKEY_INDEX is required.\n"); + status = EINVAL; + goto out; + } + + status = ibv_query_port(mdqp->gqp.qp->context, attr->port_num, &mdqp->port_attr); + if (status) { + gds_err("Error in ibv_query_port port_num=%d\n", attr->port_num); + goto out; + } + + DEVX_SET(rst2init_qp_in, cmd_in, opcode, MLX5_CMD_OP_RST2INIT_QP); + DEVX_SET(rst2init_qp_in, cmd_in, qpn, mdqp->gqp.qp->qp_num); + + qpc = DEVX_ADDR_OF(rst2init_qp_in, cmd_in, qpc); + DEVX_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + DEVX_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num); + DEVX_SET(qpc, qpc, primary_address_path.pkey_index, attr->pkey_index); + + if (attr_mask & IBV_QP_ACCESS_FLAGS) { + DEVX_SET(qpc, qpc, rwe, !!(attr->qp_access_flags & IBV_ACCESS_REMOTE_WRITE)); + DEVX_SET(qpc, qpc, rre, !!(attr->qp_access_flags & IBV_ACCESS_REMOTE_READ)); + DEVX_SET(qpc, qpc, rae, !!(attr->qp_access_flags & IBV_ACCESS_REMOTE_ATOMIC)); + if (attr->qp_access_flags & IBV_ACCESS_REMOTE_ATOMIC) { + DEVX_SET(qpc, qpc, atomic_mode, GDS_MLX5_DV_ATOMIC_MODE); + DEVX_SET(qpc, qpc, atomic_like_write_en, GDS_MLX5_DV_ATOMIC_LIKE_WRITE_EN); + } + } + + DEVX_SET(qpc, qpc, wq_signature, GDS_MLX5_DV_WQ_SIGNATURE); + DEVX_SET(qpc, qpc, counter_set_id, GDS_MLX5_DV_COUNTER_SET_ID); + DEVX_SET(qpc, qpc, lag_tx_port_affinity, GDS_MLX5_DV_LAG_TX_PORT_AFFINITY); + + status = mlx5dv_devx_obj_modify(mdqp->devx_qp, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out)); + if (status) { + gds_err("Error in mlx5dv_devx_obj_modify for RST2INIT_QP with syndrome %x\n", DEVX_GET(rst2init_qp_out, cmd_out, syndrome)); + goto out; + } + + mdqp->gqp.qp->state = IBV_QPS_INIT; + mdqp->port_num = attr->port_num; + +out: + return status; +} + +//----------------------------------------------------------------------------- + +static int gds_mlx5_dv_modify_qp_init2rtr(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_attr *attr, int attr_mask) +{ + int status = 0; + + uint8_t cmd_in[DEVX_ST_SZ_BYTES(init2rtr_qp_in)] = {0,}; + uint8_t 
cmd_out[DEVX_ST_SZ_BYTES(init2rtr_qp_out)] = {0,};
+
+        void *qpc;
+
+        assert(attr->qp_state == IBV_QPS_RTR);
+        if (mdqp->gqp.qp->state != IBV_QPS_INIT) {
+                gds_err("Incorrect current QP state.\n");
+                status = EINVAL;
+                goto out;
+        }
+
+        if (mdqp->qp_type == GDS_MLX5_DV_QP_TYPE_RC) {
+                if (!(attr_mask & IBV_QP_DEST_QPN)) {
+                        gds_err("IBV_QP_DEST_QPN is required.\n");
+                        status = EINVAL;
+                        goto out;
+                }
+        }
+
+        if (!(attr_mask & IBV_QP_PATH_MTU)) {
+                gds_err("IBV_QP_PATH_MTU is required.\n");
+                status = EINVAL;
+                goto out;
+        }
+
+        if (!(attr_mask & IBV_QP_AV)) {
+                gds_err("IBV_QP_AV is required.\n");
+                status = EINVAL;
+                goto out;
+        }
+
+        if (mdqp->port_attr.link_layer != IBV_LINK_LAYER_INFINIBAND) {
+                gds_err("Only the InfiniBand link layer is supported.\n");
+                status = ENOTSUP;
+                goto out;
+        }
+
+        if (attr->ah_attr.is_global && attr->ah_attr.grh.flow_label != 0) {
+                gds_err("Flow label is not supported.\n");
+                status = ENOTSUP;
+                goto out;
+        }
+
+        DEVX_SET(init2rtr_qp_in, cmd_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
+        DEVX_SET(init2rtr_qp_in, cmd_in, op_mod, 0x0); // Request INIT2RTR
+        DEVX_SET(init2rtr_qp_in, cmd_in, opt_param_mask, 0x0); // Don't pass optional params
+        DEVX_SET(init2rtr_qp_in, cmd_in, qpn, mdqp->gqp.qp->qp_num);
+
+        qpc = DEVX_ADDR_OF(init2rtr_qp_in, cmd_in, qpc);
+        DEVX_SET(qpc, qpc, mtu, attr->path_mtu);
+        DEVX_SET(qpc, qpc, log_msg_max, GDS_MLX5_DV_LOG_MAX_MSG_SIZE);
+
+        if (attr_mask & IBV_QP_DEST_QPN)
+                DEVX_SET(qpc, qpc, remote_qpn, attr->dest_qp_num);
+
+        DEVX_SET(qpc, qpc, primary_address_path.grh, attr->ah_attr.is_global);
+        DEVX_SET(qpc, qpc, primary_address_path.rlid, attr->ah_attr.dlid);
+        DEVX_SET(qpc, qpc, primary_address_path.mlid, attr->ah_attr.src_path_bits & 0x7f);
+        DEVX_SET(qpc, qpc, primary_address_path.sl, attr->ah_attr.sl);
+
+        if (attr->ah_attr.is_global) {
+                DEVX_SET(qpc, qpc, primary_address_path.hop_limit, attr->ah_attr.grh.hop_limit);
+                memcpy(
+                        DEVX_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
+                        &attr->ah_attr.grh.dgid,
+                        DEVX_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)
+                );
+                DEVX_SET(qpc, qpc, primary_address_path.tclass, attr->ah_attr.grh.traffic_class);
+        }
+
+        if (attr_mask & IBV_QP_MAX_DEST_RD_ATOMIC)
+                DEVX_SET(qpc, qpc, log_rra_max, GDS_ILOG2_OR0(attr->max_dest_rd_atomic));
+
+        if (attr_mask & IBV_QP_MIN_RNR_TIMER)
+                DEVX_SET(qpc, qpc, min_rnr_nak, attr->min_rnr_timer);
+
+        status = mlx5dv_devx_obj_modify(mdqp->devx_qp, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
+        if (status) {
+                gds_err("Error in mlx5dv_devx_obj_modify for INIT2RTR_QP with syndrome %x\n", DEVX_GET(init2rtr_qp_out, cmd_out, syndrome));
+                goto out;
+        }
+
+        mdqp->sl = attr->ah_attr.sl;
+
+        mdqp->gqp.qp->state = IBV_QPS_RTR;
+
+out:
+        return status;
+}
+
+//-----------------------------------------------------------------------------
+
+static int gds_mlx5_dv_modify_qp_rtr2rts(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_attr *attr, int attr_mask)
+{
+        int status = 0;
+
+        uint8_t cmd_in[DEVX_ST_SZ_BYTES(rtr2rts_qp_in)] = {0,};
+        uint8_t cmd_out[DEVX_ST_SZ_BYTES(rtr2rts_qp_out)] = {0,};
+
+        void *qpc;
+
+        assert(attr->qp_state == IBV_QPS_RTS);
+        if (mdqp->gqp.qp->state != IBV_QPS_RTR) {
+                gds_err("Incorrect current QP state.\n");
+                status = EINVAL;
+                goto out;
+        }
+
+        if (!(attr_mask & IBV_QP_MAX_QP_RD_ATOMIC)) {
+                gds_err("IBV_QP_MAX_QP_RD_ATOMIC is required.\n");
+                status = EINVAL;
+                goto out;
+        }
+
+        if (!(attr_mask & IBV_QP_RETRY_CNT)) {
+                gds_err("IBV_QP_RETRY_CNT is required.\n");
+ 
status = EINVAL; + goto out; + } + + if (!(attr_mask & IBV_QP_RNR_RETRY)) { + gds_err("IBV_QP_RNR_RETRY is required.\n"); + status = EINVAL; + goto out; + } + + if (!(attr_mask & IBV_QP_TIMEOUT)) { + gds_err("IBV_QP_TIMEOUT is required.\n"); + status = EINVAL; + goto out; + } + + if (!(attr_mask & IBV_QP_SQ_PSN)) { + gds_err("IBV_QP_SQ_PSN is required.\n"); + status = EINVAL; + goto out; + } + + DEVX_SET(rtr2rts_qp_in, cmd_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + DEVX_SET(rtr2rts_qp_in, cmd_in, opt_param_mask, 0x0); // Don't pass optional params + DEVX_SET(rtr2rts_qp_in, cmd_in, qpn, mdqp->gqp.qp->qp_num); + + qpc = DEVX_ADDR_OF(rtr2rts_qp_in, cmd_in, qpc); + DEVX_SET(qpc, qpc, log_sra_max, GDS_ILOG2_OR0(attr->max_rd_atomic)); + DEVX_SET(qpc, qpc, retry_count, attr->retry_cnt); + DEVX_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + DEVX_SET(qpc, qpc, next_send_psn, attr->sq_psn); + DEVX_SET(qpc, qpc, log_ack_req_freq, GDS_MLX5_DV_LOG_ACK_REQ_FREQ); + DEVX_SET(qpc, qpc, primary_address_path.ack_timeout, attr->timeout); + + status = mlx5dv_devx_obj_modify(mdqp->devx_qp, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out)); + if (status) { + gds_err("Error in mlx5dv_devx_obj_modify for RTR2RTS_QP with syndrome %x\n", DEVX_GET(rtr2rts_qp_out, cmd_out, syndrome)); + goto out; + } + + mdqp->gqp.qp->state = IBV_QPS_RTS; + +out: + return status; +} + +//----------------------------------------------------------------------------- + +int gds_mlx5_dv_modify_qp(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_attr *attr, int attr_mask) +{ + int status = 0; + + assert(mdqp); + assert(attr); + + if (!(attr_mask & IBV_QP_STATE)) { + gds_err("IBV_QP_STATE is required.\n"); + status = EINVAL; + goto out; + } + + switch (attr->qp_state) { + case IBV_QPS_INIT: + status = gds_mlx5_dv_modify_qp_rst2init(mdqp, attr, attr_mask); + break; + case IBV_QPS_RTR: + status = gds_mlx5_dv_modify_qp_init2rtr(mdqp, attr, attr_mask); + break; + case IBV_QPS_RTS: + status = gds_mlx5_dv_modify_qp_rtr2rts(mdqp, attr, attr_mask); + break; + default: + gds_err("Encountered unsupported qp_state.\n"); + status = EINVAL; + } + +out: + return status; +} + +//----------------------------------------------------------------------------- + +void gds_mlx5_dv_destroy_qp(gds_mlx5_dv_qp_t *mdqp) +{ + int status = 0; + + gds_peer *peer = NULL; + gds_mlx5_dv_qp_peer_t *mqp_peer = mdqp->qp_peer; + + assert(mdqp); + assert(mdqp->devx_qp); + + if (mqp_peer) { + gds_peer_attr *peer_attr = mqp_peer->peer_attr; + gds_peer *peer = peer_from_id(peer_attr->peer_id); + + // This may be used by ibv_destroy_qp, which eventually calls pd_mem_free. 
+ peer->alloc_type = gds_peer::WQ; + peer->opaque = mqp_peer; + } + + status = mlx5dv_devx_obj_destroy(mdqp->devx_qp); + if (status) + gds_err("Error in mlx5dv_devx_obj_destroy for QP.\n"); + + if (mdqp->is_internal_srq && mdqp->gqp.qp->srq) + ibv_destroy_srq(mdqp->gqp.qp->srq); + + if (mdqp->gqp.send_cq) { + gds_mlx5_dv_destroy_cq(to_gds_mdv_cq(mdqp->gqp.send_cq)); + mdqp->gqp.send_cq = NULL; + } + + if (mdqp->gqp.recv_cq) { + gds_mlx5_dv_destroy_cq(to_gds_mdv_cq(mdqp->gqp.recv_cq)); + mdqp->gqp.recv_cq = NULL; + } + + if (mdqp->dbr_umem) { + status = mlx5dv_devx_umem_dereg(mdqp->dbr_umem); + if (status) + gds_err("Error in mlx5dv_devx_umem_dereg of dbr_umem.\n"); + } + + if (mdqp->wq_umem) { + status = mlx5dv_devx_umem_dereg(mdqp->wq_umem); + if (status) + gds_err("Error in mlx5dv_devx_umem_dereg of wq_umem.\n"); + } + + if (mdqp->dbr_buf || mdqp->wq_buf || mdqp->bf_va_id) { + assert(mdqp->peer_attr); + + peer = peer_from_id(mdqp->peer_attr->peer_id); + + if (mdqp->bf_va_id) + mdqp->peer_attr->unregister_va(mdqp->bf_va_id, mdqp->peer_attr->peer_id); + + if (mdqp->dbr_buf) { + mdqp->peer_attr->unregister_va(mdqp->dbr_va_id, mdqp->peer_attr->peer_id); + peer->free(mdqp->dbr_buf); + } + + if (mdqp->wq_buf) { + mdqp->peer_attr->unregister_va(mdqp->wq_va_id, mdqp->peer_attr->peer_id); + peer->free(mdqp->wq_buf); + } + } + + if (mdqp->bf_uar) + mlx5dv_devx_free_uar(mdqp->bf_uar); + + if (mqp_peer) + free(mqp_peer); + + if (mdqp->gqp.qp) + free(mdqp->gqp.qp); + + free(mdqp); +} + +//----------------------------------------------------------------------------- + +int gds_transport_mlx5_dv_init(gds_transport_t **transport) +{ + int status = 0; + + gds_transport_t *t = (gds_transport_t *)calloc(1, sizeof(gds_transport_t)); + if (!t) { + status = ENOMEM; + goto out; + } + + t->create_qp = gds_mlx5_dv_create_qp; + #if 0 + t->destroy_qp = gds_mlx5_exp_destroy_qp; + t->rollback_qp = gds_mlx5_exp_rollback_qp; + + t->init_send_info = gds_mlx5_exp_init_send_info; + t->post_send_ops = gds_mlx5_exp_post_send_ops; + t->post_send_ops_on_cpu = gds_mlx5_exp_post_send_ops_on_cpu; + t->prepare_send = gds_mlx5_exp_prepare_send; + t->get_send_descs = gds_mlx5_exp_get_send_descs; + t->get_num_send_request_entries = gds_mlx5_exp_get_num_send_request_entries; + + t->init_wait_request = gds_mlx5_exp_init_wait_request; + t->dump_wait_request = gds_mlx5_exp_dump_wait_request; + t->stream_post_wait_descriptor = gds_mlx5_exp_stream_post_wait_descriptor; + t->post_wait_descriptor = gds_mlx5_exp_post_wait_descriptor; + t->get_wait_descs = gds_mlx5_exp_get_wait_descs; + t->get_num_wait_request_entries = gds_mlx5_exp_get_num_wait_request_entries; + + t->prepare_wait_cq = gds_mlx5_exp_prepare_wait_cq; + t->append_wait_cq = gds_mlx5_exp_append_wait_cq; + t->abort_wait_cq = gds_mlx5_exp_abort_wait_cq; + #endif + + *transport = t; + +out: + return status; +} + +//----------------------------------------------------------------------------- + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * indent-tabs-mode: nil + * End: + */ diff --git a/src/transports/mlx5-dv/mlx5-dv.hpp b/src/transports/mlx5-dv/mlx5-dv.hpp new file mode 100644 index 0000000..6b63a23 --- /dev/null +++ b/src/transports/mlx5-dv/mlx5-dv.hpp @@ -0,0 +1,206 @@ +/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include + +#include "objs.hpp" +#include "utils.hpp" + +#define GDS_MLX5_DV_DBR_BUF_SIZE 8 +#define GDS_MLX5_DV_LOG_MAX_MSG_SIZE 30 +#define GDS_MLX5_DV_ATOMIC_MODE 0x3 // Up to 8 bytes with Remote Micro Application atomics enabled +#define GDS_MLX5_DV_ATOMIC_LIKE_WRITE_EN 0x1 // Enable atomic with RDMA WRITE +#define GDS_MLX5_DV_WQ_SIGNATURE 0x0 // Disable wq signature +#define GDS_MLX5_DV_COUNTER_SET_ID 0x0 // Do not connect to any counter set +#define GDS_MLX5_DV_LAG_TX_PORT_AFFINITY 0x0 // Let the device decide +#define GDS_MLX5_DV_LOG_ACK_REQ_FREQ 0x0 // ACK every packet +#define GDS_MLX5_DV_UAR_ALLOC_TYPE_BF 0x0 // Allocate a BF buffer + +#define GDS_MLX5_DV_ROLLBACK_ID_PARITY_MASK (1ULL << 63) +#define GDS_MLX5_DV_LAST_PEEK_ENTRY (-1U) +#define GDS_MLX5_DV_PEEK_ENTRY(mcq, n) \ + (n == GDS_MLX5_DV_LAST_PEEK_ENTRY ? NULL : \ + ((struct gds_mlx5_dv_peek_entry *)mcq->cq_peer->pdata.gbuf->addr) + n) +#define GDS_MLX5_DV_PEEK_ENTRY_N(mcq, pe) \ + (pe == NULL ? 
GDS_MLX5_DV_LAST_PEEK_ENTRY : \ + ((pe - (struct gds_mlx5_dv_peek_entry *)mcq->cq_peer->pdata.gbuf->addr))) + +enum { + GDS_MLX5_DV_QPC_ST_RC = 0x0, + GDS_MLX5_DV_QPC_ST_DCI = 0x5 +}; + +enum { + GDS_MLX5_DV_QPC_RQ_TYPE_REGULAR = 0x0, + GDS_MLX5_DV_QPC_RQ_TYPE_SRQ = 0x1 +}; + +enum { + GDS_MLX5_DV_SEND_WQE_BB = 64, + GDS_MLX5_DV_SEND_WQE_SHIFT = 6, + GDS_MLX5_DV_RECV_WQE_BB = 64, + GDS_MLX5_DV_RECV_WQE_SHIFT = 6, +}; + +typedef struct gds_mlx5_dv_peek_entry { + uint32_t busy; + uint32_t next; +} gds_mlx5_dv_peek_entry_t; + +typedef struct gds_mlx5_dv_cq_peer { + gds_peer_attr *peer_attr; + + struct { + uint64_t va_id; + size_t size; + gds_buf *gbuf; + } buf; + + struct { + uint64_t va_id; + size_t size; + gds_buf *gbuf; + } dbr; + + struct { + uint64_t va_id; + uint32_t dir; + gds_buf *gbuf; + gds_mlx5_dv_peek_entry_t **peek_table; + gds_mlx5_dv_peek_entry_t *peek_free; + } pdata; +} gds_mlx5_dv_cq_peer_t; + +typedef struct gds_mlx5_dv_wq { + uint64_t *wrid; + uint64_t *wqe_head; + unsigned int wqe_cnt; + uint64_t head; + uint64_t tail; +} gds_mlx5_dv_wq_t; + +typedef struct gds_mlx5_dv_cq { + gds_cq_t gcq; + uint32_t cons_index; + struct mlx5dv_cq dvcq; + gds_mlx5_dv_wq_t *wq; + gds_mlx5_dv_cq_peer_t *cq_peer; +} gds_mlx5_dv_cq_t; + +typedef struct gds_mlx5_dv_qp_peer { + gds_peer_attr *peer_attr; + uint32_t scur_post; + + struct { + uint64_t va_id; + size_t size; + gds_buf *gbuf; + } wq; + + struct { + uint64_t va_id; + size_t size; + gds_buf *gbuf; + } dbr; + + struct { + uint64_t va_id; + } bf; +} gds_mlx5_dv_qp_peer_t; + +typedef enum gds_mlx5_dv_qp_type { + GDS_MLX5_DV_QP_TYPE_UNKNOWN = 0, + GDS_MLX5_DV_QP_TYPE_RC, + GDS_MLX5_DV_QP_TYPE_DCT, + GDS_MLX5_DV_QP_TYPE_DCI +} gds_mlx5_dv_qp_type_t; + +typedef struct gds_mlx5_dv_qp { + gds_qp_t gqp; + gds_mlx5_dv_qp_type_t qp_type; + + struct mlx5dv_devx_obj *devx_qp; + + bool is_internal_srq; + struct mlx5dv_srq dvsrq; + gds_mlx5_dv_qp_peer_t *qp_peer; + + uint8_t sl; + + gds_buf *wq_buf; + struct mlx5dv_devx_umem *wq_umem; + uint64_t wq_va_id; + + size_t sq_cnt; + size_t rq_cnt; + + off_t sq_buf_offset; + off_t rq_buf_offset; + + gds_buf *dbr_buf; + struct mlx5dv_devx_umem *dbr_umem; + uint64_t dbr_va_id; + + struct mlx5dv_devx_uar *bf_uar; + size_t bf_size; // Half of UAR reg size + uint64_t bf_va_id; + + uint8_t port_num; + struct ibv_port_attr port_attr; + + gds_peer_attr *peer_attr; +} gds_mlx5_dv_qp_t; + +//----------------------------------------------------------------------------- + +static inline gds_mlx5_dv_cq_t *to_gds_mdv_cq(gds_cq_t *gcq) { + return container_of(gcq, gds_mlx5_dv_cq_t, gcq); +} + +static inline gds_mlx5_dv_qp_t *to_gds_mdv_qp(gds_qp_t *gqp) { + return container_of(gqp, gds_mlx5_dv_qp_t, gqp); +} + +//----------------------------------------------------------------------------- + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * indent-tabs-mode: nil + * End: + */ diff --git a/src/transports/mlx5-dv/mlx5_ifc.h b/src/transports/mlx5-dv/mlx5_ifc.h new file mode 100644 index 0000000..51b1814 --- /dev/null +++ b/src/transports/mlx5-dv/mlx5_ifc.h @@ -0,0 +1,5499 @@ +/* + * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IFC_H +#define MLX5_IFC_H + +#define u8 uint8_t + +enum mlx5_cap_mode { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + +enum { + MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, + MLX5_CMD_OP_INIT_HCA = 0x102, + MLX5_CMD_OP_TEARDOWN_HCA = 0x103, + MLX5_CMD_OP_ENABLE_HCA = 0x104, + MLX5_CMD_OP_QUERY_PAGES = 0x107, + MLX5_CMD_OP_MANAGE_PAGES = 0x108, + MLX5_CMD_OP_SET_HCA_CAP = 0x109, + MLX5_CMD_OP_QUERY_ISSI = 0x10a, + MLX5_CMD_OP_SET_ISSI = 0x10b, + MLX5_CMD_OP_CREATE_MKEY = 0x200, + MLX5_CMD_OP_DESTROY_MKEY = 0x202, + MLX5_CMD_OP_CREATE_EQ = 0x301, + MLX5_CMD_OP_DESTROY_EQ = 0x302, + MLX5_CMD_OP_CREATE_CQ = 0x400, + MLX5_CMD_OP_DESTROY_CQ = 0x401, + MLX5_CMD_OP_CREATE_QP = 0x500, + MLX5_CMD_OP_DESTROY_QP = 0x501, + MLX5_CMD_OP_RST2INIT_QP = 0x502, + MLX5_CMD_OP_INIT2RTR_QP = 0x503, + MLX5_CMD_OP_RTR2RTS_QP = 0x504, + MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_QUERY_QP = 0x50b, + MLX5_CMD_OP_INIT2INIT_QP = 0x50e, + MLX5_CMD_OP_CREATE_PSV = 0x600, + MLX5_CMD_OP_DESTROY_PSV = 0x601, + MLX5_CMD_OP_CREATE_SRQ = 0x700, + MLX5_CMD_OP_DESTROY_SRQ = 0x701, + MLX5_CMD_OP_CREATE_XRC_SRQ = 0x705, + MLX5_CMD_OP_DESTROY_XRC_SRQ = 0x706, + MLX5_CMD_OP_CREATE_DCT = 0x710, + MLX5_CMD_OP_DESTROY_DCT = 0x711, + MLX5_CMD_OP_QUERY_DCT = 0x713, + MLX5_CMD_OP_CREATE_XRQ = 0x717, + MLX5_CMD_OP_DESTROY_XRQ = 0x718, + MLX5_CMD_OP_QUERY_ESW_FUNCTIONS = 0x740, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755, + MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760, + MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, + MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, + MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783, + MLX5_CMD_OP_ALLOC_PD = 0x800, + MLX5_CMD_OP_DEALLOC_PD = 0x801, + MLX5_CMD_OP_ALLOC_UAR = 0x802, + MLX5_CMD_OP_DEALLOC_UAR = 0x803, + MLX5_CMD_OP_ACCESS_REG = 0x805, + MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, + MLX5_CMD_OP_ALLOC_XRCD = 0x80e, + MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN = 0x817, + MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT = 0x827, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT = 0x828, + MLX5_CMD_OP_SET_L2_TABLE_ENTRY = 0x829, + 
MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b, + MLX5_CMD_OP_QUERY_LAG = 0x842, + MLX5_CMD_OP_CREATE_TIR = 0x900, + MLX5_CMD_OP_DESTROY_TIR = 0x902, + MLX5_CMD_OP_CREATE_SQ = 0x904, + MLX5_CMD_OP_MODIFY_SQ = 0x905, + MLX5_CMD_OP_DESTROY_SQ = 0x906, + MLX5_CMD_OP_CREATE_RQ = 0x908, + MLX5_CMD_OP_DESTROY_RQ = 0x90a, + MLX5_CMD_OP_CREATE_RMP = 0x90c, + MLX5_CMD_OP_DESTROY_RMP = 0x90e, + MLX5_CMD_OP_CREATE_TIS = 0x912, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_DESTROY_TIS = 0x914, + MLX5_CMD_OP_QUERY_TIS = 0x915, + MLX5_CMD_OP_CREATE_RQT = 0x916, + MLX5_CMD_OP_DESTROY_RQT = 0x918, + MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, + MLX5_CMD_OP_DESTROY_FLOW_TABLE = 0x931, + MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932, + MLX5_CMD_OP_CREATE_FLOW_GROUP = 0x933, + MLX5_CMD_OP_DESTROY_FLOW_GROUP = 0x934, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936, + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938, + MLX5_CMD_OP_CREATE_FLOW_COUNTER = 0x939, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER = 0x93a, + MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT = 0x93d, + MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT = 0x93e, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00, + MLX5_CMD_OP_MODIFY_GENERAL_OBJECT = 0xa01, + MLX5_CMD_OP_QUERY_GENERAL_OBJECT = 0xa02, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT = 0xa03, + MLX5_CMD_OP_CREATE_UMEM = 0xa08, + MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, + MLX5_CMD_OP_SYNC_STEERING = 0xb00, +}; + +enum { + MLX5_CMD_STAT_OK = 0x0, + MLX5_CMD_STAT_INT_ERR = 0x1, + MLX5_CMD_STAT_BAD_OP_ERR = 0x2, + MLX5_CMD_STAT_BAD_PARAM_ERR = 0x3, + MLX5_CMD_STAT_BAD_SYS_STATE_ERR = 0x4, + MLX5_CMD_STAT_BAD_RES_ERR = 0x5, + MLX5_CMD_STAT_RES_BUSY = 0x6, + MLX5_CMD_STAT_LIM_ERR = 0x8, + MLX5_CMD_STAT_BAD_RES_STATE_ERR = 0x9, + MLX5_CMD_STAT_IX_ERR = 0xa, + MLX5_CMD_STAT_NO_RES_ERR = 0xf, + MLX5_CMD_STAT_BAD_INP_LEN_ERR = 0x50, + MLX5_CMD_STAT_BAD_OUTP_LEN_ERR = 0x51, + MLX5_CMD_STAT_BAD_QP_STATE_ERR = 0x10, + MLX5_CMD_STAT_BAD_PKT_ERR = 0x30, + MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40, +}; + +enum { + MLX5_PAGES_CANT_GIVE = 0, + MLX5_PAGES_GIVE = 1, + MLX5_PAGES_TAKE = 2, +}; + +enum { + MLX5_REG_HOST_ENDIANNESS = 0x7004, +}; + +enum { + MLX5_CAP_PORT_TYPE_IB = 0x0, + MLX5_CAP_PORT_TYPE_ETH = 0x1, +}; + +enum mlx5_event { + MLX5_EVENT_TYPE_CMD = 0x0a, + MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, +}; + +enum { + MLX5_EQ_DOORBEL_OFFSET = 0x40, +}; + +struct mlx5_ifc_atomic_caps_bits { + u8 reserved_at_0[0x40]; + + u8 atomic_req_8B_endianness_mode[0x2]; + u8 reserved_at_42[0x4]; + u8 supported_atomic_req_8B_endianness_mode_1[0x1]; + + u8 reserved_at_47[0x19]; + + u8 reserved_at_60[0x20]; + + u8 reserved_at_80[0x10]; + u8 atomic_operations[0x10]; + + u8 reserved_at_a0[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_at_c0[0x10]; + u8 atomic_size_dc[0x10]; + + u8 reserved_at_e0[0x1a0]; + + u8 fetch_add_pci_atomic[0x10]; + u8 swap_pci_atomic[0x10]; + u8 compare_swap_pci_atomic[0x10]; + + u8 reserved_at_2b0[0x550]; +}; + +struct mlx5_ifc_roce_cap_bits { + u8 reserved_0[0x4]; + u8 sw_r_roce_src_udp_port[0x1]; + u8 fl_rc_qp_when_roce_disabled[0x1]; + u8 fl_rc_qp_when_roce_enabled[0x1]; + u8 reserved_at_7[0x17]; + u8 qp_ts_format[0x2]; + + u8 reserved_at_20[0x7e0]; +}; + +enum { + MLX5_MULTI_PATH_FT_MAX_LEVEL = 64, +}; + +struct mlx5_ifc_flow_table_context_bits { + u8 reformat_en[0x1]; + u8 decap_en[0x1]; + u8 sw_owner[0x1]; + u8 termination_table[0x1]; + u8 table_miss_action[0x4]; + u8 level[0x8]; + u8 reserved_at_10[0x8]; + u8 log_size[0x8]; + + u8 
reserved_at_20[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_at_40[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_60[0x60]; + + u8 sw_owner_icm_root_1[0x40]; + + u8 sw_owner_icm_root_0[0x40]; +}; + +struct mlx5_ifc_create_flow_table_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_flow_table_context_bits flow_table_context; +}; + +struct mlx5_ifc_create_flow_table_out_bits { + u8 status[0x8]; + u8 icm_address_63_40[0x18]; + + u8 syndrome[0x20]; + + u8 icm_address_39_32[0x8]; + u8 table_id[0x18]; + + u8 icm_address_31_0[0x20]; +}; + +struct mlx5_ifc_destroy_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_query_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_query_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x80]; + + struct mlx5_ifc_flow_table_context_bits flow_table_context; +}; + +struct mlx5_ifc_sync_steering_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; +}; + +struct mlx5_ifc_sync_steering_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_device_mem_cap_bits { + u8 memic[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0xb]; + u8 log_min_memic_alloc_size[0x5]; + u8 reserved_at_30[0x8]; + u8 log_max_memic_addr_alignment[0x8]; + + u8 memic_bar_start_addr[0x40]; + + u8 memic_bar_size[0x20]; + + u8 max_memic_size[0x20]; + + u8 steering_sw_icm_start_address[0x40]; + + u8 reserved_at_100[0x8]; + u8 log_header_modify_sw_icm_size[0x8]; + u8 reserved_at_110[0x2]; + u8 log_sw_icm_alloc_granularity[0x6]; + u8 log_steering_sw_icm_size[0x8]; + + u8 reserved_at_120[0x20]; + + u8 header_modify_sw_icm_start_address[0x40]; +}; + +struct mlx5_ifc_flow_table_fields_supported_bits { + u8 outer_dmac[0x1]; + u8 outer_smac[0x1]; + u8 outer_ether_type[0x1]; + u8 outer_ip_version[0x1]; + u8 outer_first_prio[0x1]; + u8 outer_first_cfi[0x1]; + u8 outer_first_vid[0x1]; + u8 outer_ipv4_ttl[0x1]; + u8 outer_second_prio[0x1]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0x1]; + u8 outer_ipv6_flow_label[0x1]; + u8 outer_sip[0x1]; + u8 outer_dip[0x1]; + u8 outer_frag[0x1]; + u8 outer_ip_protocol[0x1]; + u8 outer_ip_ecn[0x1]; + u8 outer_ip_dscp[0x1]; + u8 outer_udp_sport[0x1]; + u8 outer_udp_dport[0x1]; + u8 outer_tcp_sport[0x1]; + u8 outer_tcp_dport[0x1]; + u8 outer_tcp_flags[0x1]; + u8 outer_gre_protocol[0x1]; + u8 outer_gre_key[0x1]; + u8 outer_vxlan_vni[0x1]; + u8 outer_geneve_vni[0x1]; + u8 outer_geneve_oam[0x1]; + u8 outer_geneve_protocol_type[0x1]; + u8 outer_geneve_opt_len[0x1]; + u8 source_vhca_port[0x1]; + u8 source_eswitch_port[0x1]; + + u8 inner_dmac[0x1]; + u8 
inner_smac[0x1]; + u8 inner_ether_type[0x1]; + u8 inner_ip_version[0x1]; + u8 inner_first_prio[0x1]; + u8 inner_first_cfi[0x1]; + u8 inner_first_vid[0x1]; + u8 inner_ipv4_ttl[0x1]; + u8 inner_second_prio[0x1]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0x1]; + u8 inner_ipv6_flow_label[0x1]; + u8 inner_sip[0x1]; + u8 inner_dip[0x1]; + u8 inner_frag[0x1]; + u8 inner_ip_protocol[0x1]; + u8 inner_ip_ecn[0x1]; + u8 inner_ip_dscp[0x1]; + u8 inner_udp_sport[0x1]; + u8 inner_udp_dport[0x1]; + u8 inner_tcp_sport[0x1]; + u8 inner_tcp_dport[0x1]; + u8 inner_tcp_flags[0x1]; + u8 reserved_at_37[0x7]; + u8 metadata_reg_b[0x1]; + u8 metadata_reg_a[0x1]; + + u8 reserved_at_40[0x5]; + u8 outer_first_mpls_over_udp_ttl[0x1]; + u8 outer_first_mpls_over_udp_s_bos[0x1]; + u8 outer_first_mpls_over_udp_exp[0x1]; + u8 outer_first_mpls_over_udp_label[0x1]; + u8 outer_first_mpls_over_gre_ttl[0x1]; + u8 outer_first_mpls_over_gre_s_bos[0x1]; + u8 outer_first_mpls_over_gre_exp[0x1]; + u8 outer_first_mpls_over_gre_label[0x1]; + u8 inner_first_mpls_ttl[0x1]; + u8 inner_first_mpls_s_bos[0x1]; + u8 inner_first_mpls_exp[0x1]; + u8 inner_first_mpls_label[0x1]; + u8 outer_first_mpls_ttl[0x1]; + u8 outer_first_mpls_s_bos[0x1]; + u8 outer_first_mpls_exp[0x1]; + u8 outer_first_mpls_label[0x1]; + u8 outer_emd_tag[0x1]; + u8 inner_esp_spi[0x1]; + u8 outer_esp_spi[0x1]; + u8 inner_ipv6_hop_limit[0x1]; + u8 outer_ipv6_hop_limit[0x1]; + u8 bth_dst_qp[0x1]; + u8 inner_first_svlan[0x1]; + u8 inner_second_svlan[0x1]; + u8 outer_first_svlan[0x1]; + u8 outer_second_svlan[0x1]; + u8 source_sqn[0x1]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dr_match_spec_bits { + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 ethertype[0x10]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 first_prio[0x3]; + u8 first_cfi[0x1]; + u8 first_vid[0xc]; + + u8 ip_protocol[0x8]; + u8 ip_dscp[0x6]; + u8 ip_ecn[0x2]; + u8 cvlan_tag[0x1]; + u8 svlan_tag[0x1]; + u8 frag[0x1]; + u8 ip_version[0x4]; + u8 tcp_flags[0x9]; + + u8 tcp_sport[0x10]; + u8 tcp_dport[0x10]; + + u8 reserved_at_c0[0x10]; + u8 ipv4_ihl[0x4]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 ipv4_checksum_ok[0x1]; + u8 l4_checksum_ok[0x1]; + u8 ip_ttl_hoplimit[0x8]; + + u8 udp_sport[0x10]; + u8 udp_dport[0x10]; + + u8 src_ip_127_96[0x20]; + + u8 src_ip_95_64[0x20]; + + u8 src_ip_63_32[0x20]; + + u8 src_ip_31_0[0x20]; + + u8 dst_ip_127_96[0x20]; + + u8 dst_ip_95_64[0x20]; + + u8 dst_ip_63_32[0x20]; + + u8 dst_ip_31_0[0x20]; +}; + +struct mlx5_ifc_dr_match_set_misc_bits { + u8 gre_c_present[0x1]; + u8 reserved_auto1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 source_vhca_port[0x4]; + u8 source_sqn[0x18]; + + u8 source_eswitch_owner_vhca_id[0x10]; + u8 source_port[0x10]; + + u8 outer_second_prio[0x3]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0xc]; + u8 inner_second_prio[0x3]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0xc]; + + u8 outer_second_cvlan_tag[0x1]; + u8 inner_second_cvlan_tag[0x1]; + u8 outer_second_svlan_tag[0x1]; + u8 inner_second_svlan_tag[0x1]; + u8 outer_emd_tag[0x1]; + u8 reserved_at_65[0xb]; + u8 gre_protocol[0x10]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 vxlan_vni[0x18]; + u8 reserved_at_b8[0x8]; + + u8 geneve_vni[0x18]; + u8 reserved_at_e4[0x7]; + u8 geneve_oam[0x1]; + + u8 reserved_at_ec[0xc]; + u8 outer_ipv6_flow_label[0x14]; + + u8 reserved_at_100[0xc]; + u8 inner_ipv6_flow_label[0x14]; + + u8 reserved_at_120[0xa]; + u8 geneve_opt_len[0x6]; + u8 geneve_protocol_type[0x10]; + + u8 reserved_at_140[0x8]; + u8 
bth_dst_qp[0x18]; + + u8 inner_esp_spi[0x20]; + + u8 outer_esp_spi[0x20]; + + u8 reserved_at_1a0[0x60]; +}; + +struct mlx5_ifc_dr_match_set_misc2_bits { + u8 outer_first_mpls_label[0x14]; + u8 outer_first_mpls_exp[0x3]; + u8 outer_first_mpls_s_bos[0x1]; + u8 outer_first_mpls_ttl[0x8]; + + u8 inner_first_mpls_label[0x14]; + u8 inner_first_mpls_exp[0x3]; + u8 inner_first_mpls_s_bos[0x1]; + u8 inner_first_mpls_ttl[0x8]; + + u8 outer_first_mpls_over_gre_label[0x14]; + u8 outer_first_mpls_over_gre_exp[0x3]; + u8 outer_first_mpls_over_gre_s_bos[0x1]; + u8 outer_first_mpls_over_gre_ttl[0x8]; + + u8 outer_first_mpls_over_udp_label[0x14]; + u8 outer_first_mpls_over_udp_exp[0x3]; + u8 outer_first_mpls_over_udp_s_bos[0x1]; + u8 outer_first_mpls_over_udp_ttl[0x8]; + + u8 metadata_reg_c_7[0x20]; + u8 metadata_reg_c_6[0x20]; + u8 metadata_reg_c_5[0x20]; + u8 metadata_reg_c_4[0x20]; + u8 metadata_reg_c_3[0x20]; + u8 metadata_reg_c_2[0x20]; + u8 metadata_reg_c_1[0x20]; + u8 metadata_reg_c_0[0x20]; + + u8 metadata_reg_a[0x20]; + u8 metadata_reg_b[0x20]; + + u8 reserved_at_260[0x40]; +}; + +struct mlx5_ifc_dr_match_set_misc3_bits { + u8 inner_tcp_seq_num[0x20]; + + u8 outer_tcp_seq_num[0x20]; + + u8 inner_tcp_ack_num[0x20]; + + u8 outer_tcp_ack_num[0x20]; + + u8 reserved_at_80[0x8]; + u8 outer_vxlan_gpe_vni[0x18]; + + u8 outer_vxlan_gpe_next_protocol[0x8]; + u8 outer_vxlan_gpe_flags[0x8]; + u8 reserved_at_b0[0x10]; + + u8 icmp_header_data[0x20]; + + u8 icmpv6_header_data[0x20]; + + u8 icmp_type[0x8]; + u8 icmp_code[0x8]; + u8 icmpv6_type[0x8]; + u8 icmpv6_code[0x8]; + + u8 geneve_tlv_option_0_data[0x20]; + + u8 gtpu_teid[0x20]; + + u8 gtpu_msg_type[0x8]; + u8 gtpu_msg_flags[0x8]; + u8 reserved_at_150[0x10]; + + u8 gtpu_dw_2[0x20]; + + u8 gtpu_first_ext_dw_0[0x20]; + + u8 gtpu_dw_0[0x20]; + + u8 reserved_at_1c0[0x20]; +}; + +struct mlx5_ifc_dr_match_set_misc4_bits { + u8 prog_sample_field_value_0[0x20]; + + u8 prog_sample_field_id_0[0x20]; + + u8 prog_sample_field_value_1[0x20]; + + u8 prog_sample_field_id_1[0x20]; + + u8 prog_sample_field_value_2[0x20]; + + u8 prog_sample_field_id_2[0x20]; + + u8 prog_sample_field_value_3[0x20]; + + u8 prog_sample_field_id_3[0x20]; + + u8 prog_sample_field_value_4[0x20]; + + u8 prog_sample_field_id_4[0x20]; + + u8 prog_sample_field_value_5[0x20]; + + u8 prog_sample_field_id_5[0x20]; + + u8 prog_sample_field_value_6[0x20]; + + u8 prog_sample_field_id_6[0x20]; + + u8 prog_sample_field_value_7[0x20]; + + u8 prog_sample_field_id_7[0x20]; +}; + +struct mlx5_ifc_dr_match_set_misc5_bits { + u8 macsec_tag_0[0x20]; + + u8 macsec_tag_1[0x20]; + + u8 macsec_tag_2[0x20]; + + u8 macsec_tag_3[0x20]; + + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 tunnel_header_2[0x20]; + + u8 tunnel_header_3[0x20]; + + u8 reserved[0x100]; +}; + +struct mlx5_ifc_dr_match_param_bits { + struct mlx5_ifc_dr_match_spec_bits outer; + struct mlx5_ifc_dr_match_set_misc_bits misc; + struct mlx5_ifc_dr_match_spec_bits inner; + struct mlx5_ifc_dr_match_set_misc2_bits misc2; + struct mlx5_ifc_dr_match_set_misc3_bits misc3; + struct mlx5_ifc_dr_match_set_misc4_bits misc4; + struct mlx5_ifc_dr_match_set_misc5_bits misc5; +}; + +struct mlx5_ifc_flow_table_prop_layout_bits { + u8 ft_support[0x1]; + u8 flow_tag[0x1]; + u8 flow_counter[0x1]; + u8 flow_modify_en[0x1]; + u8 modify_root[0x1]; + u8 identified_miss_table[0x1]; + u8 flow_table_modify[0x1]; + u8 reformat[0x1]; + u8 decap[0x1]; + u8 reset_root_to_default[0x1]; + u8 pop_vlan[0x1]; + u8 push_vlan[0x1]; + u8 fpga_vendor_acceleration[0x1]; + u8 
pop_vlan_2[0x1]; + u8 push_vlan_2[0x1]; + u8 reformat_and_vlan_action[0x1]; + u8 modify_and_vlan_action[0x1]; + u8 sw_owner[0x1]; + u8 reformat_l3_tunnel_to_l2[0x1]; + u8 reformat_l2_to_l3_tunnel[0x1]; + u8 reformat_and_modify_action[0x1]; + u8 reserved_at_15[0x9]; + u8 sw_owner_v2[0x1]; + u8 reserved_at_1f[0x1]; + + u8 reserved_at_20[0x2]; + u8 log_max_ft_size[0x6]; + u8 log_max_modify_header_context[0x8]; + u8 max_modify_header_actions[0x8]; + u8 max_ft_level[0x8]; + + u8 reserved_at_40[0x10]; + u8 metadata_reg_b_width[0x8]; + u8 metadata_reg_a_width[0x8]; + + u8 reserved_at_60[0x18]; + u8 log_max_ft_num[0x8]; + + u8 reserved_at_80[0x10]; + u8 log_max_flow_counter[0x8]; + u8 log_max_destination[0x8]; + + u8 reserved_at_a0[0x18]; + u8 log_max_flow[0x8]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_bitmask_support; +}; + +enum { + MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, + MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED = 1 << 4, + mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, + MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, + MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, + MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, + MLX5_FLEX_PARSER_GENEVE_OPT_0_ENABLED = 1 << 10, + MLX5_FLEX_PARSER_GTPU_ENABLED = 1 << 11, + MLX5_FLEX_PARSER_GTPU_DW_2_ENABLED = 1 << 16, + MLX5_FLEX_PARSER_GTPU_FIRST_EXT_DW_0_ENABLED = 1 << 17, + MLX5_FLEX_PARSER_GTPU_DW_0_ENABLED = 1 << 18, + MLX5_FLEX_PARSER_GTPU_TEID_ENABLED = 1 << 19, +}; + +enum mlx5_ifc_steering_format_version { + MLX5_HW_CONNECTX_5 = 0x0, + MLX5_HW_CONNECTX_6DX = 0x1, +}; + +enum mlx5_ifc_ste_v1_modify_hdr_offset { + MLX5_MODIFY_HEADER_V1_QW_OFFSET = 0x20, +}; + +struct mlx5_ifc_cmd_hca_cap_bits { + u8 access_other_hca_roce[0x1]; + u8 reserved_at_1[0x1e]; + u8 vhca_resource_manager[0x1]; + + u8 hca_cap_2[0x1]; + u8 reserved_at_21[0xf]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x2]; + u8 qp_data_in_order[0x1]; + u8 reserved_at_63[0x8]; + u8 log_dma_mmo_max_size[0x5]; + u8 reserved_at_70[0x10]; + + u8 log_max_srq_sz[0x8]; + u8 log_max_qp_sz[0x8]; + u8 reserved_at_90[0x3]; + u8 isolate_vl_tc_new[0x1]; + u8 reserved_at_94[0x4]; + u8 prio_tag_required[0x1]; + u8 reserved_at_99[0x2]; + u8 log_max_qp[0x5]; + + u8 reserved_at_a0[0xb]; + u8 log_max_srq[0x5]; + u8 reserved_at_b0[0x10]; + + u8 reserved_at_c0[0x8]; + u8 log_max_cq_sz[0x8]; + u8 reserved_at_d0[0xb]; + u8 log_max_cq[0x5]; + + u8 log_max_eq_sz[0x8]; + u8 relaxed_ordering_write[0x1]; + u8 reserved_at_e9[0x1]; + u8 log_max_mkey[0x6]; + u8 tunneled_atomic[0x1]; + u8 as_notify[0x1]; + u8 m_pci_port[0x1]; + u8 m_vhca_mk[0x1]; + u8 cmd_on_behalf[0x1]; + u8 device_emulation_manager[0x1]; + u8 terminate_scatter_list_mkey[0x1]; + u8 repeated_mkey[0x1]; + u8 dump_fill_mkey[0x1]; + u8 reserved_at_f9[0x3]; + u8 log_max_eq[0x4]; + + u8 max_indirection[0x8]; + u8 fixed_buffer_size[0x1]; + u8 log_max_mrw_sz[0x7]; + u8 force_teardown[0x1]; + u8 fast_teardown[0x1]; + u8 log_max_bsf_list_size[0x6]; + u8 umr_extended_translation_offset[0x1]; + u8 null_mkey[0x1]; + u8 log_max_klm_list_size[0x6]; + + u8 reserved_at_120[0x2]; + u8 qpc_extension[0x1]; + u8 reserved_at_123[0x7]; + u8 log_max_ra_req_dc[0x6]; + u8 reserved_at_130[0xa]; + u8 log_max_ra_res_dc[0x6]; + + u8 reserved_at_140[0x7]; + u8 sig_crc64_xp10[0x1]; + u8 sig_crc32c[0x1]; + u8 reserved_at_149[0x1]; + u8 log_max_ra_req_qp[0x6]; + u8 reserved_at_150[0x1]; + u8 rts2rts_qp_udp_sport[0x1]; + u8 
rts2rts_lag_tx_port_affinity[0x1]; + u8 dma_mmo_sq[0x1]; + u8 reserved_at_154[0x6]; + u8 log_max_ra_res_qp[0x6]; + + u8 end_pad[0x1]; + u8 cc_query_allowed[0x1]; + u8 cc_modify_allowed[0x1]; + u8 start_pad[0x1]; + u8 cache_line_128byte[0x1]; + u8 gid_table_size_ro[0x1]; + u8 pkey_table_size_ro[0x1]; + u8 reserved_at_167[0x1]; + u8 rnr_nak_q_counters[0x1]; + u8 rts2rts_qp_counters_set_id[0x1]; + u8 rts2rts_qp_dscp[0x1]; + u8 reserved_at_16b[0x4]; + u8 qcam_reg[0x1]; + u8 gid_table_size[0x10]; + + u8 out_of_seq_cnt[0x1]; + u8 vport_counters[0x1]; + u8 retransmission_q_counters[0x1]; + u8 debug[0x1]; + u8 modify_rq_counters_set_id[0x1]; + u8 rq_delay_drop[0x1]; + u8 max_qp_cnt[0xa]; + u8 pkey_table_size[0x10]; + + u8 vport_group_manager[0x1]; + u8 vhca_group_manager[0x1]; + u8 ib_virt[0x1]; + u8 eth_virt[0x1]; + u8 vnic_env_queue_counters[0x1]; + u8 ets[0x1]; + u8 nic_flow_table[0x1]; + u8 eswitch_manager[0x1]; + u8 device_memory[0x1]; + u8 mcam_reg[0x1]; + u8 pcam_reg[0x1]; + u8 local_ca_ack_delay[0x5]; + u8 port_module_event[0x1]; + u8 enhanced_retransmission_q_counters[0x1]; + u8 port_checks[0x1]; + u8 pulse_gen_control[0x1]; + u8 disable_link_up_by_init_hca[0x1]; + u8 beacon_led[0x1]; + u8 port_type[0x2]; + u8 num_ports[0x8]; + + u8 reserved_at_1c0[0x1]; + u8 pps[0x1]; + u8 pps_modify[0x1]; + u8 log_max_msg[0x5]; + u8 multi_path_xrc_rdma[0x1]; + u8 multi_path_dc_rdma[0x1]; + u8 multi_path_rc_rdma[0x1]; + u8 traffic_fast_control[0x1]; + u8 max_tc[0x4]; + u8 temp_warn_event[0x1]; + u8 dcbx[0x1]; + u8 general_notification_event[0x1]; + u8 multi_prio_sq[0x1]; + u8 afu_owner[0x1]; + u8 fpga[0x1]; + u8 rol_s[0x1]; + u8 rol_g[0x1]; + u8 ib_port_sniffer[0x1]; + u8 wol_s[0x1]; + u8 wol_g[0x1]; + u8 wol_a[0x1]; + u8 wol_b[0x1]; + u8 wol_m[0x1]; + u8 wol_u[0x1]; + u8 wol_p[0x1]; + + u8 stat_rate_support[0x10]; + u8 sig_block_4048[0x1]; + u8 reserved_at_1f1[0xb]; + u8 cqe_version[0x4]; + + u8 compact_address_vector[0x1]; + u8 eth_striding_wq[0x1]; + u8 reserved_at_202[0x1]; + u8 ipoib_enhanced_offloads[0x1]; + u8 ipoib_basic_offloads[0x1]; + u8 ib_striding_wq[0x1]; + u8 repeated_block_disabled[0x1]; + u8 umr_modify_entity_size_disabled[0x1]; + u8 umr_modify_atomic_disabled[0x1]; + u8 umr_indirect_mkey_disabled[0x1]; + u8 umr_fence[0x2]; + u8 dc_req_sctr_data_cqe[0x1]; + u8 dc_connect_qp[0x1]; + u8 dc_cnak_trace[0x1]; + u8 drain_sigerr[0x1]; + u8 cmdif_checksum[0x2]; + u8 sigerr_cqe[0x1]; + u8 reserved_at_213[0x1]; + u8 wq_signature[0x1]; + u8 sctr_data_cqe[0x1]; + u8 reserved_at_216[0x1]; + u8 sho[0x1]; + u8 tph[0x1]; + u8 rf[0x1]; + u8 dct[0x1]; + u8 qos[0x1]; + u8 eth_net_offloads[0x1]; + u8 roce[0x1]; + u8 atomic[0x1]; + u8 extended_retry_count[0x1]; + + u8 cq_oi[0x1]; + u8 cq_resize[0x1]; + u8 cq_moderation[0x1]; + u8 cq_period_mode_modify[0x1]; + u8 cq_invalidate[0x1]; + u8 reserved_at_225[0x1]; + u8 cq_eq_remap[0x1]; + u8 pg[0x1]; + u8 block_lb_mc[0x1]; + u8 exponential_backoff[0x1]; + u8 scqe_break_moderation[0x1]; + u8 cq_period_start_from_cqe[0x1]; + u8 cd[0x1]; + u8 atm[0x1]; + u8 apm[0x1]; + u8 vector_calc[0x1]; + u8 umr_ptr_rlkey[0x1]; + u8 imaicl[0x1]; + u8 qp_packet_based[0x1]; + u8 reserved_at_233[0x1]; + u8 ipoib_enhanced_pkey_change[0x1]; + u8 initiator_src_dct_in_cqe[0x1]; + u8 qkv[0x1]; + u8 pkv[0x1]; + u8 set_deth_sqpn[0x1]; + u8 rts2rts_primary_sl[0x1]; + u8 initiator_src_dct[0x1]; + u8 dc_v2[0x1]; + u8 xrc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 uar_4k[0x1]; + u8 reserved_at_241[0x9]; + u8 uar_sz[0x6]; + u8 reserved_at_250[0x2]; + u8 umem_uid_0[0x1]; + u8 
log_max_dc_cnak_qps[0x5]; + u8 log_pg_sz[0x8]; + + u8 bf[0x1]; + u8 driver_version[0x1]; + u8 pad_tx_eth_packet[0x1]; + u8 query_driver_version[0x1]; + u8 max_qp_retry_freq[0x1]; + u8 qp_by_name[0x1]; + u8 mkey_by_name[0x1]; + u8 reserved_at_267[0x1]; + u8 suspend_qp_uc[0x1]; + u8 suspend_qp_ud[0x1]; + u8 suspend_qp_rc[0x1]; + u8 log_bf_reg_size[0x5]; + u8 reserved_at_270[0x6]; + u8 lag_dct[0x2]; + u8 lag_tx_port_affinity[0x1]; + u8 reserved_at_279[0x2]; + u8 lag_master[0x1]; + u8 num_lag_ports[0x4]; + + u8 num_of_diagnostic_counters[0x10]; + u8 max_wqe_sz_sq[0x10]; + + u8 reserved_at_2a0[0x10]; + u8 max_wqe_sz_rq[0x10]; + + u8 max_flow_counter_31_16[0x10]; + u8 max_wqe_sz_sq_dc[0x10]; + + u8 reserved_at_2e0[0x7]; + u8 max_qp_mcg[0x19]; + + u8 mlnx_tag_ethertype[0x10]; + u8 reserved_at_310[0x8]; + u8 log_max_mcg[0x8]; + + u8 reserved_at_320[0x3]; + u8 log_max_transport_domain[0x5]; + u8 reserved_at_328[0x3]; + u8 log_max_pd[0x5]; + u8 reserved_at_330[0xb]; + u8 log_max_xrcd[0x5]; + + u8 nic_receive_steering_discard[0x1]; + u8 receive_discard_vport_down[0x1]; + u8 transmit_discard_vport_down[0x1]; + u8 eq_overrun_count[0x1]; + u8 nic_receive_steering_depth[0x1]; + u8 invalid_command_count[0x1]; + u8 quota_exceeded_count[0x1]; + u8 reserved_at_347[0x1]; + u8 log_max_flow_counter_bulk[0x8]; + u8 max_flow_counter_15_0[0x10]; + + u8 modify_tis[0x1]; + u8 reserved_at_361[0x2]; + u8 log_max_rq[0x5]; + u8 reserved_at_368[0x3]; + u8 log_max_sq[0x5]; + u8 reserved_at_370[0x3]; + u8 log_max_tir[0x5]; + u8 reserved_at_378[0x3]; + u8 log_max_tis[0x5]; + + u8 basic_cyclic_rcv_wqe[0x1]; + u8 reserved_at_381[0x2]; + u8 log_max_rmp[0x5]; + u8 reserved_at_388[0x3]; + u8 log_max_rqt[0x5]; + u8 reserved_at_390[0x3]; + u8 log_max_rqt_size[0x5]; + u8 reserved_at_398[0x3]; + u8 log_max_tis_per_sq[0x5]; + + u8 ext_stride_num_range[0x1]; + u8 reserved_at_3a1[0x2]; + u8 log_max_stride_sz_rq[0x5]; + u8 reserved_at_3a8[0x3]; + u8 log_min_stride_sz_rq[0x5]; + u8 reserved_at_3b0[0x3]; + u8 log_max_stride_sz_sq[0x5]; + u8 reserved_at_3b8[0x3]; + u8 log_min_stride_sz_sq[0x5]; + + u8 hairpin[0x1]; + u8 reserved_at_3c1[0x2]; + u8 log_max_hairpin_queues[0x5]; + u8 reserved_at_3c8[0x3]; + u8 log_max_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3d0[0x3]; + u8 log_max_hairpin_num_packets[0x5]; + u8 reserved_at_3d8[0x3]; + u8 log_max_wq_sz[0x5]; + + u8 nic_vport_change_event[0x1]; + u8 disable_local_lb_uc[0x1]; + u8 disable_local_lb_mc[0x1]; + u8 log_min_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3e8[0x3]; + u8 log_max_vlan_list[0x5]; + u8 reserved_at_3f0[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_at_3f8[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 general_obj_types[0x40]; + + u8 reserved_at_440[0x4]; + u8 steering_format_version[0x4]; + u8 create_qp_start_hint[0x18]; + + u8 reserved_at_460[0x8]; + u8 aes_xts[0x1]; + u8 crypto[0x1]; + u8 reserved_at_46a[0x6]; + u8 max_num_eqs[0x10]; + + u8 sigerr_domain_and_sig_type[0x1]; + u8 reserved_at_481[0x2]; + u8 log_max_l2_table[0x5]; + u8 reserved_at_488[0x8]; + u8 log_uar_page_sz[0x10]; + + u8 reserved_at_4a0[0x20]; + + u8 device_frequency_mhz[0x20]; + + u8 device_frequency_khz[0x20]; + + u8 capi[0x1]; + u8 create_pec[0x1]; + u8 nvmf_target_offload[0x1]; + u8 capi_invalidate[0x1]; + u8 reserved_at_504[0x17]; + u8 log_max_pasid[0x5]; + + u8 num_of_uars_per_page[0x20]; + + u8 flex_parser_protocols[0x20]; + + u8 reserved_at_560[0x10]; + u8 flex_parser_header_modify[0x1]; + u8 reserved_at_571[0x2]; + u8 log_max_guaranteed_connections[0x5]; + u8 reserved_at_578[0x3]; + u8 
log_max_dct_connections[0x5]; + + u8 log_max_atomic_size_qp[0x8]; + u8 reserved_at_588[0x10]; + u8 log_max_atomic_size_dc[0x8]; + + u8 reserved_at_5a0[0x1c]; + u8 mini_cqe_resp_stride_index[0x1]; + u8 cqe_128_always[0x1]; + u8 cqe_compression_128b[0x1]; + u8 cqe_compression[0x1]; + + u8 cqe_compression_timeout[0x10]; + u8 cqe_compression_max_num[0x10]; + + u8 reserved_at_5e0[0x8]; + u8 flex_parser_id_gtpu_dw_0[0x4]; + u8 log_max_tm_offloaded_op_size[0x4]; + u8 tag_matching[0x1]; + u8 rndv_offload_rc[0x1]; + u8 rndv_offload_dc[0x1]; + u8 log_tag_matching_list_sz[0x5]; + u8 reserved_at_5f8[0x3]; + u8 log_max_xrq[0x5]; + + u8 affiliate_nic_vport_criteria[0x8]; + u8 native_port_num[0x8]; + u8 num_vhca_ports[0x8]; + u8 flex_parser_id_gtpu_teid[0x4]; + u8 reserved_at_61c[0x1]; + u8 trusted_vnic_vhca[0x1]; + u8 sw_owner_id[0x1]; + u8 reserve_not_to_use[0x1]; + u8 reserved_at_620[0x60]; + u8 sf[0x1]; + u8 reserved_at_682[0x43]; + u8 flex_parser_id_geneve_opt_0[0x4]; + u8 flex_parser_id_icmp_dw1[0x4]; + u8 flex_parser_id_icmp_dw0[0x4]; + u8 flex_parser_id_icmpv6_dw1[0x4]; + u8 flex_parser_id_icmpv6_dw0[0x4]; + u8 flex_parser_id_outer_first_mpls_over_gre[0x4]; + u8 flex_parser_id_outer_first_mpls_over_udp_label[0x4]; + + u8 reserved_at_6e0[0x20]; + + u8 flex_parser_id_gtpu_dw_2[0x4]; + u8 flex_parser_id_gtpu_first_ext_dw_0[0x4]; + u8 reserved_at_708[0x18]; + + u8 reserved_at_720[0x20]; + + u8 reserved_at_740[0x8]; + u8 dma_mmo_qp[0x1]; + u8 reserved_at_749[0x17]; + + u8 reserved_at_760[0x60]; + + u8 match_definer_format_supported[0x40]; +}; + +struct mlx5_ifc_header_modify_cap_properties_bits { + struct mlx5_ifc_flow_table_fields_supported_bits set_action_field_support; + + u8 reserved_at_80[0x80]; + + struct mlx5_ifc_flow_table_fields_supported_bits add_action_field_support; + + u8 reserved_at_180[0x80]; + + u8 copy_action_field_support[8][0x20]; + + u8 reserved_at_300[0x100]; +}; + +struct mlx5_ifc_flow_table_fields_supported_2_bits { + u8 reserved_at_0[0x17]; + u8 inner_l3_ok[0x1]; + u8 inner_l4_ok[0x1]; + u8 outer_l3_ok[0x1]; + u8 outer_l4_ok[0x1]; + u8 psp_header[0x1]; + u8 inner_ipv4_checksum_ok[0x1]; + u8 inner_l4_checksum_ok[0x1]; + u8 outer_ipv4_checksum_ok[0x1]; + u8 outer_l4_checksum_ok[0x1]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_flow_table_nic_cap_bits { + u8 nic_rx_multi_path_tirs[0x1]; + u8 nic_rx_multi_path_tirs_fts[0x1]; + u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; + u8 reserved_at_3[0x1]; + u8 nic_rx_flow_tag_multipath_en[0x1]; + u8 reserved_at_5[0x13]; + u8 nic_receive_max_steering_depth[0x8]; + + u8 encap_general_header[0x1]; + u8 reserved_at_21[0xa]; + u8 log_max_packet_reformat_context[0x5]; + u8 reserved_at_30[0x6]; + u8 max_encap_header_size[0xa]; + + u8 reserved_at_40[0x1c0]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_rdma; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_rdma; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer; + + u8 reserved_at_e00[0x200]; + + struct mlx5_ifc_header_modify_cap_properties_bits header_modify_nic_receive; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive; + + struct mlx5_ifc_flow_table_fields_supported_2_bits 
ft_field_bitmask_support_2_nic_receive; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_rdma; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_receive_rdma; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_sniffer; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_receive_sniffer; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_transmit; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit_rdma; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_transmit_rdma; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit_sniffer; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_transmit_sniffer; + + u8 reserved_at_1400[0x200]; + + struct mlx5_ifc_header_modify_cap_properties_bits header_modify_nic_transmit; + + u8 sw_steering_nic_rx_action_drop_icm_address[0x40]; + + u8 sw_steering_nic_tx_action_drop_icm_address[0x40]; + + u8 sw_steering_nic_tx_action_allow_icm_address[0x40]; + + u8 reserved_at_20c0[0x5f40]; +}; + +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_at_0[0x1c]; + u8 fdb_multi_path_to_table[0x1]; + u8 reserved_at_1d[0x1e3]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_at_800[0x1000]; + + u8 sw_steering_fdb_action_drop_icm_address_rx[0x40]; + u8 sw_steering_fdb_action_drop_icm_address_tx[0x40]; + u8 sw_steering_uplink_icm_address_rx[0x40]; + u8 sw_steering_uplink_icm_address_tx[0x40]; + + u8 reserved_at_1900[0x6700]; +}; + +struct mlx5_ifc_odp_per_transport_service_cap_bits { + u8 send[0x1]; + u8 receive[0x1]; + u8 write[0x1]; + u8 read[0x1]; + u8 atomic[0x1]; + u8 srq_receive[0x1]; + u8 reserved_at_6[0x1a]; +}; + +struct mlx5_ifc_odp_cap_bits { + u8 reserved_at_0[0x40]; + + u8 sig[0x1]; + u8 reserved_at_41[0x1f]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits uc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits xrc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps; + + u8 reserved_at_120[0x6e0]; +}; + +struct mlx5_ifc_e_switch_cap_bits { + u8 reserved_at_0[0x4b]; + u8 log_max_esw_sf[0x5]; + u8 esw_sf_base_id[0x10]; + u8 reserved_at_60[0x7a0]; +}; + +enum { + ELEMENT_TYPE_CAP_MASK_TASR = 1 << 0, + ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, +}; + +enum { + TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, +}; + +struct mlx5_ifc_qos_cap_bits { + u8 reserved_at_0[0x8]; + u8 nic_sq_scheduling[0x1]; + u8 nic_bw_share[0x1]; + u8 nic_rate_limit[0x1]; + u8 reserved_at_b[0x15]; + + u8 reserved_at_20[0x1]; + u8 nic_qp_scheduling[0x1]; + u8 reserved_at_22[0x1e]; + + u8 reserved_at_40[0xc0]; + + u8 nic_element_type[0x10]; + u8 nic_tsar_type[0x10]; + + u8 reserved_at_120[0x6e0]; +}; + +struct mlx5_ifc_cmd_hca_cap_2_bits { + u8 reserved_at_0[0x80]; + + u8 reserved_at_80[0x13]; + u8 log_reserved_qpn_granularity[0x5]; + u8 reserved_at_98[0x8]; + + u8 
reserved_at_a0[0x760]; +}; + +enum { + MLX5_CRYPTO_CAPS_WRAPPED_IMPORT_METHOD_AES = 0x4, +}; + +struct mlx5_ifc_crypto_caps_bits { + u8 wrapped_crypto_operational[0x1]; + u8 wrapped_crypto_going_to_commissioning[0x1]; + u8 reserved_at_2[0x16]; + u8 wrapped_import_method[0x8]; + + u8 reserved_at_20[0xb]; + u8 log_max_num_deks[0x5]; + u8 reserved_at_30[0x3]; + u8 log_max_num_import_keks[0x5]; + u8 reserved_at_38[0x3]; + u8 log_max_num_creds[0x5]; + + u8 failed_selftests[0x10]; + u8 num_nv_import_keks[0x8]; + u8 num_nv_credentials[0x8]; + + u8 reserved_at_60[0x7a0]; +}; + +union mlx5_ifc_hca_cap_union_bits { + struct mlx5_ifc_atomic_caps_bits atomic_caps; + struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; + struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_e_switch_cap_bits e_switch_cap; + struct mlx5_ifc_device_mem_cap_bits device_mem_cap; + struct mlx5_ifc_odp_cap_bits odp_cap; + struct mlx5_ifc_roce_cap_bits roce_caps; + struct mlx5_ifc_qos_cap_bits qos_caps; + struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2; + struct mlx5_ifc_crypto_caps_bits crypto_caps; + u8 reserved_at_0[0x8000]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +enum mlx5_cap_type { + MLX5_CAP_GENERAL = 0, + MLX5_CAP_ODP = 2, + MLX5_CAP_ATOMIC = 3, + MLX5_CAP_ROCE, + MLX5_CAP_NUM, +}; + +enum { + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1, + MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4 << 1, + MLX5_SET_HCA_CAP_OP_MOD_NIC_FLOW_TABLE = 0x7 << 1, + MLX5_SET_HCA_CAP_OP_MOD_ESW_FLOW_TABLE = 0x8 << 1, + MLX5_SET_HCA_CAP_OP_MOD_QOS = 0xc << 1, + MLX5_SET_HCA_CAP_OP_MOD_ESW = 0x9 << 1, + MLX5_SET_HCA_CAP_OP_MOD_DEVICE_MEMORY = 0xf << 1, + MLX5_SET_HCA_CAP_OP_MOD_CRYPTO = 0x1a << 1, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE_CAP_2 = 0x20 << 1, +}; + +enum { + MLX5_MKC_ACCESS_MODE_MTT = 0x1, + MLX5_MKC_ACCESS_MODE_KLMS = 0x2, +}; + +struct mlx5_ifc_mkc_bits { + u8 reserved_at_0[0x1]; + u8 free[0x1]; + u8 reserved_at_2[0x1]; + u8 access_mode_4_2[0x3]; + u8 reserved_at_6[0x7]; + u8 relaxed_ordering_write[0x1]; + u8 reserved_at_e[0x1]; + u8 small_fence_on_rdma_read_response[0x1]; + u8 umr_en[0x1]; + u8 a[0x1]; + u8 rw[0x1]; + u8 rr[0x1]; + u8 lw[0x1]; + u8 lr[0x1]; + u8 access_mode_1_0[0x2]; + u8 reserved_at_18[0x8]; + + u8 qpn[0x18]; + u8 mkey_7_0[0x8]; + + u8 reserved_at_40[0x20]; + + u8 length64[0x1]; + u8 bsf_en[0x1]; + u8 sync_umr[0x1]; + u8 reserved_at_63[0x2]; + u8 expected_sigerr_count[0x1]; + u8 reserved_at_66[0x1]; + u8 en_rinval[0x1]; + u8 pd[0x18]; + + u8 start_addr[0x40]; + + u8 len[0x40]; + + u8 bsf_octword_size[0x20]; + + u8 reserved_at_120[0x80]; + + u8 translations_octword_size[0x20]; + + u8 reserved_at_1c0[0x19]; + u8 relaxed_ordering_read[0x1]; + u8 reserved_at_1d9[0x1]; + u8 log_page_size[0x5]; + + u8 reserved_at_1e0[0x3]; + u8 crypto_en[0x2]; + u8 reserved_at_1e5[0x1b]; +}; + +struct mlx5_ifc_create_mkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_mkey_in_bits { + u8 opcode[0x10]; + u8 
reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 pg_access[0x1]; + u8 mkey_umem_valid[0x1]; + u8 reserved_at_62[0x1e]; + + struct mlx5_ifc_mkc_bits memory_key_mkey_entry; + + u8 reserved_at_280[0x80]; + + u8 translations_octword_actual_size[0x20]; + + u8 reserved_at_320[0x560]; + + u8 klm_pas_mtt[0][0x20]; +}; + +struct mlx5_ifc_destroy_mkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_mkey_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_l2_hdr_bits { + u8 dmac_47_16[0x20]; + u8 dmac_15_0[0x10]; + u8 smac_47_32[0x10]; + u8 smac_31_0[0x20]; + u8 ethertype[0x10]; + u8 vlan_type[0x10]; + u8 vlan[0x10]; +}; + +enum { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, +}; + +struct mlx5_ifc_ste_general_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + u8 reserved_at_60[0xa0]; + u8 tag_value[0x60]; + u8 bit_mask[0x60]; +}; + +struct mlx5_ifc_ste_sx_transmit_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 sx_wire[0x1]; + u8 sx_func_lb[0x1]; + u8 sx_sniffer[0x1]; + u8 sx_wire_enable[0x1]; + u8 sx_func_lb_enable[0x1]; + u8 sx_sniffer_enable[0x1]; + u8 action_type[0x3]; + u8 reserved_at_69[0x1]; + u8 action_description[0x6]; + u8 gvmi[0x10]; + + u8 encap_pointer_vlan_data[0x20]; + + u8 loopback_syndome_en[0x8]; + u8 loopback_syndome[0x8]; + u8 counter_trigger[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 go_back[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_rx_steering_mult_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 member_count[0x10]; + u8 gvmi[0x10]; + + u8 qp_list_pointer[0x20]; + + u8 reserved_at_a0[0x1]; + u8 tunneling_action[0x3]; + u8 action_description[0x4]; + u8 reserved_at_a8[0x8]; + u8 counter_trigger_15_0[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x08]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 fail_on_error[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_modify_packet_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 
next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 number_of_re_write_actions[0x10]; + u8 gvmi[0x10]; + + u8 header_re_write_actions_pointer[0x20]; + + u8 reserved_at_a0[0x1]; + u8 tunneling_action[0x3]; + u8 action_description[0x4]; + u8 reserved_at_a8[0x8]; + u8 counter_trigger_15_0[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x08]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 fail_on_error[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_single_action_flow_tag_v1_bits { + u8 action_id[0x8]; + u8 flow_tag[0x18]; +}; + +struct mlx5_ifc_ste_single_action_modify_list_v1_bits { + u8 action_id[0x8]; + u8 num_of_modify_actions[0x8]; + u8 modify_actions_ptr[0x10]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 reserved_at_10[0x2]; + u8 end_anchor[0x6]; + u8 reserved_at_18[0x4]; + u8 decap[0x1]; + u8 vni_to_cqe[0x1]; + u8 qos_profile[0x2]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_size_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 outer_l4_remove[0x1]; + u8 reserved_at_11[0x1]; + u8 start_offset[0x7]; + u8 reserved_at_18[0x1]; + u8 remove_size[0x6]; +}; + +struct mlx5_ifc_ste_double_action_copy_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_dw_offset[0x8]; + u8 reserved_at_30[0x2]; + u8 source_right_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + +struct mlx5_ifc_ste_double_action_set_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_add_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 add_value[0x20]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_inline_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 start_offset[0x7]; + u8 reserved_at_17[0x9]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_ptr_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 start_offset[0x7]; + u8 size[0x6]; + u8 attributes[0x3]; + + u8 pointer[0x20]; +}; + +struct mlx5_ifc_ste_double_action_modify_action_list_v1_bits { + u8 action_id[0x8]; + u8 modify_actions_pattern_pointer[0x18]; + + u8 number_of_modify_actions[0x8]; + u8 modify_actions_argument_pointer[0x18]; +}; + +enum { + MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_RED = 0x0, + MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_YELLOW = 0x1, + MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_GREEN = 0x2, + MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_UNDEFINED = 0x3, +}; + +enum { + MLX5_IFC_ASO_CT_DIRECTION_INITIATOR = 0x0, + MLX5_IFC_ASO_CT_DIRECTION_RESPONDER = 0x1, +}; + +struct mlx5_ifc_ste_aso_first_hit_action_v1_bits { + u8 reserved_at_0[0x6]; + u8 set[0x1]; + u8 line_id[0x9]; +}; + +struct mlx5_ifc_ste_aso_flow_meter_action_v1_bits { + u8 reserved_at_0[0xc]; + u8 action[0x1]; + u8 initial_color[0x2]; + u8 
line_id[0x1]; +}; + +struct mlx5_ifc_ste_aso_ct_action_v1_bits { + u8 reserved_at_0[0xf]; + u8 direction[0x1]; +}; + +struct mlx5_ifc_ste_double_action_aso_v1_bits { + u8 action_id[0x8]; + u8 aso_context_number[0x18]; + + u8 dest_reg_id[0x2]; + u8 change_ordering_tag[0x1]; + u8 aso_check_ordering[0x1]; + u8 aso_context_type[0x4]; + u8 reserved_at_28[0x8]; + union { + u8 aso_fields[0x10]; + struct mlx5_ifc_ste_aso_first_hit_action_v1_bits first_hit; + struct mlx5_ifc_ste_aso_flow_meter_action_v1_bits flow_meter; + struct mlx5_ifc_ste_aso_ct_action_v1_bits ct; + }; +}; + +struct mlx5_ifc_ste_match_bwc_v1_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 byte_mask[0x10]; + u8 next_entry_format[0x1]; + u8 mask_mode[0x1]; + u8 gvmi[0xe]; + + u8 action[0x40]; +}; + +struct mlx5_ifc_ste_mask_and_match_v1_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 action[0x60]; +}; + +struct mlx5_ifc_ste_eth_l2_src_bits { + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 qp_type[0x2]; + u8 ethertype_filter[0x1]; + u8 reserved_at_43[0x1]; + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 reserved_at_48[0x4]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_52[0x2]; + u8 first_vlan_id[0xc]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 reserved_at_68[0x4]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_src_v1_bits { + u8 reserved_at_0[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_loopback[0x1]; + u8 ip_fragmented[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x6]; + u8 tcp_syn[0x1]; + u8 reserved_at_67[0x3]; + u8 force_loopback[0x1]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_dst_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 qp_type[0x2]; + u8 ethertype_filter[0x1]; + u8 reserved_at_43[0x1]; + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 reserved_at_48[0x4]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 
first_vlan_qualifier[0x2]; + u8 reserved_at_52[0x2]; + u8 first_vlan_id[0xc]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 reserved_at_68[0x4]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_dst_v1_bits { + u8 reserved_at_0[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_lb[0x1]; + u8 ip_fragmented[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x6]; + u8 tcp_syn[0x1]; + u8 reserved_at_67[0x3]; + u8 force_lb[0x1]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_src_dst_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 smac_47_32[0x10]; + + u8 smac_31_0[0x20]; + + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 l3_type[0x2]; + u8 reserved_at_66[0x6]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_src_dst_v1_bits { + u8 dmac_47_16[0x20]; + + u8 smac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 reserved_at_50[0x2]; + u8 functional_lb[0x1]; + u8 reserved_at_53[0x5]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 reserved_at_5c[0x2]; + u8 first_vlan_qualifier[0x2]; + + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + u8 smac_15_0[0x10]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_bits { + u8 destination_address[0x20]; + + u8 source_address[0x20]; + + u8 source_port[0x10]; + u8 destination_port[0x10]; + + u8 fragmented[0x1]; + u8 first_fragment[0x1]; + u8 reserved_at_62[0x2]; + u8 reserved_at_64[0x1]; + u8 ecn[0x2]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 dscp[0x6]; + u8 reserved_at_76[0x2]; + u8 protocol[0x8]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_v1_bits { + u8 source_address[0x20]; + + u8 destination_address[0x20]; + + u8 source_port[0x10]; + u8 destination_port[0x10]; + + u8 reserved_at_60[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 fragmented[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 dscp[0x6]; + u8 ecn[0x2]; + u8 protocol[0x8]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv6_dst_bits { + u8 dst_ip_127_96[0x20]; + + u8 dst_ip_95_64[0x20]; + + u8 dst_ip_63_32[0x20]; + + u8 dst_ip_31_0[0x20]; +}; + +struct mlx5_ifc_ste_eth_l2_tnl_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 l2_tunneling_network_id[0x20]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 reserved_at_6c[0x3]; + u8 gre_key_flag[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_tnl_v1_bits { + u8 l2_tunneling_network_id[0x20]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 
l3_ethertype[0x10]; + + u8 reserved_at_60[0x3]; + u8 ip_fragmented[0x1]; + u8 reserved_at_64[0x2]; + u8 encp_type[0x2]; + u8 reserved_at_68[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv6_src_bits { + u8 src_ip_127_96[0x20]; + + u8 src_ip_95_64[0x20]; + + u8 src_ip_63_32[0x20]; + + u8 src_ip_31_0[0x20]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_misc_bits { + u8 version[0x4]; + u8 ihl[0x4]; + u8 reserved_at_8[0x8]; + u8 total_length[0x10]; + + u8 identification[0x10]; + u8 flags[0x3]; + u8 fragment_offset[0xd]; + + u8 time_to_live[0x8]; + u8 reserved_at_48[0x8]; + u8 checksum[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_misc_v1_bits { + u8 identification[0x10]; + u8 flags[0x3]; + u8 fragment_offset[0xd]; + + u8 total_length[0x10]; + u8 checksum[0x10]; + + u8 version[0x4]; + u8 ihl[0x4]; + u8 time_to_live[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x1c]; + u8 voq_internal_prio[0x4]; +}; + +struct mlx5_ifc_ste_eth_l4_bits { + u8 fragmented[0x1]; + u8 first_fragment[0x1]; + u8 reserved_at_2[0x6]; + u8 protocol[0x8]; + u8 dst_port[0x10]; + + u8 ipv6_version[0x4]; + u8 reserved_at_24[0x1]; + u8 ecn[0x2]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 src_port[0x10]; + + u8 ipv6_payload_length[0x10]; + u8 ipv6_hop_limit[0x8]; + u8 dscp[0x6]; + u8 reserved_at_5e[0x2]; + + u8 tcp_data_offset[0x4]; + u8 reserved_at_64[0x8]; + u8 flow_label[0x14]; +}; + +struct mlx5_ifc_ste_eth_l4_v1_bits { + u8 ipv6_version[0x4]; + u8 reserved_at_4[0x4]; + u8 dscp[0x6]; + u8 ecn[0x2]; + u8 ipv6_hop_limit[0x8]; + u8 protocol[0x8]; + + u8 src_port[0x10]; + u8 dst_port[0x10]; + + u8 first_fragment[0x1]; + u8 reserved_at_41[0xb]; + u8 flow_label[0x14]; + + u8 tcp_data_offset[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 fragmented[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 ipv6_paylen[0x10]; +}; + +struct mlx5_ifc_ste_eth_l4_misc_bits { + u8 checksum[0x10]; + u8 length[0x10]; + + u8 seq_num[0x20]; + + u8 ack_num[0x20]; + + u8 urgent_pointer[0x10]; + u8 window_size[0x10]; +}; + +struct mlx5_ifc_ste_eth_l4_misc_v1_bits { + u8 window_size[0x10]; + u8 urgent_pointer[0x10]; + + u8 ack_num[0x20]; + + u8 seq_num[0x20]; + + u8 length[0x10]; + u8 checksum[0x10]; +}; + +struct mlx5_ifc_ste_mpls_bits { + u8 mpls0_label[0x14]; + u8 mpls0_exp[0x3]; + u8 mpls0_s_bos[0x1]; + u8 mpls0_ttl[0x8]; + + u8 mpls1_label[0x20]; + + u8 mpls2_label[0x20]; + + u8 reserved_at_60[0x16]; + u8 mpls4_s_bit[0x1]; + u8 mpls4_qualifier[0x1]; + u8 mpls3_s_bit[0x1]; + u8 mpls3_qualifier[0x1]; + u8 mpls2_s_bit[0x1]; + u8 mpls2_qualifier[0x1]; + u8 mpls1_s_bit[0x1]; + u8 mpls1_qualifier[0x1]; + u8 mpls0_s_bit[0x1]; + u8 mpls0_qualifier[0x1]; +}; + +struct mlx5_ifc_ste_mpls_v1_bits { + u8 reserved_at_0[0x15]; + u8 mpls_ok[0x1]; + u8 mpls4_s_bit[0x1]; + u8 mpls4_qualifier[0x1]; + u8 mpls3_s_bit[0x1]; + u8 mpls3_qualifier[0x1]; + u8 mpls2_s_bit[0x1]; + u8 mpls2_qualifier[0x1]; + u8 mpls1_s_bit[0x1]; + u8 mpls1_qualifier[0x1]; + u8 mpls0_s_bit[0x1]; + u8 mpls0_qualifier[0x1]; + + u8 mpls0_label[0x14]; + u8 mpls0_exp[0x3]; + u8 mpls0_s_bos[0x1]; + u8 mpls0_ttl[0x8]; + + u8 mpls1_label[0x20]; + + u8 mpls2_label[0x20]; +}; + 
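+/*
+ * Usage note: the "u8 <field>[0xN]" members in the *_bits structs in this
+ * file are PRM-style bit layouts, not byte arrays: each array length is a
+ * field width in bits, accumulated MSB-first within big-endian 32-bit
+ * words. The structs are not meant to be instantiated directly; they only
+ * drive the offset/width arithmetic of accessor macros. A minimal sketch,
+ * assuming accessors in the MLX5_SET()/MLX5_GET()/MLX5_ST_SZ_DW() family
+ * (the Linux kernel names; a DEVX-based userspace consumer typically
+ * defines equivalents):
+ *
+ *   uint32_t in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {0};
+ *   uint32_t out[MLX5_ST_SZ_DW(query_hca_cap_out)] = {0};
+ *
+ *   MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+ *   MLX5_SET(query_hca_cap_in, in, op_mod,
+ *            MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE |
+ *            HCA_CAP_OPMOD_GET_CUR);
+ *   ...run the command, e.g. with mlx5dv_devx_general_cmd(), then:
+ *   int log_max_qp = MLX5_GET(query_hca_cap_out, out,
+ *                             capability.cmd_hca_cap.log_max_qp);
+ *
+ * The STE and definer layouts below come in two hardware flavors keyed by
+ * enum mlx5_ifc_steering_format_version: the unsuffixed structs match
+ * MLX5_HW_CONNECTX_5 (STEv0) and the *_v1 structs match
+ * MLX5_HW_CONNECTX_6DX (STEv1).
+ */
+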
+struct mlx5_ifc_ste_register_0_bits { + u8 register_0_h[0x20]; + + u8 register_0_l[0x20]; + + u8 register_1_h[0x20]; + + u8 register_1_l[0x20]; +}; + +struct mlx5_ifc_ste_register_1_bits { + u8 register_2_h[0x20]; + + u8 register_2_l[0x20]; + + u8 register_3_h[0x20]; + + u8 register_3_l[0x20]; +}; + +struct mlx5_ifc_ste_gre_bits { + u8 gre_c_present[0x1]; + u8 reserved_at_1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 strict_src_route[0x1]; + u8 recur[0x3]; + u8 flags[0x5]; + u8 version[0x3]; + u8 gre_protocol[0x10]; + + u8 checksum[0x10]; + u8 offset[0x10]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 seq_num[0x20]; +}; + +struct mlx5_ifc_ste_gre_v1_bits { + u8 gre_c_present[0x1]; + u8 reserved_at_1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 strict_src_route[0x1]; + u8 recur[0x3]; + u8 flags[0x5]; + u8 version[0x3]; + u8 gre_protocol[0x10]; + + u8 reserved_at_20[0x20]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_0_bits { + u8 flex_parser_3[0x20]; + + u8 flex_parser_2[0x20]; + + u8 flex_parser_1[0x20]; + + u8 flex_parser_0[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_1_bits { + u8 flex_parser_7[0x20]; + + u8 flex_parser_6[0x20]; + + u8 flex_parser_5[0x20]; + + u8 flex_parser_4[0x20]; +}; + +struct mlx5_ifc_ste_tunnel_header_bits { + u8 tunnel_header_dw0[0x20]; + + u8 tunnel_header_dw1[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_tunnel_header_v1_bits { + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_vxlan_gpe_bits { + u8 outer_vxlan_gpe_flags[0x8]; + u8 reserved_at_8[0x10]; + u8 outer_vxlan_gpe_next_protocol[0x8]; + + u8 outer_vxlan_gpe_vni[0x18]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_geneve_bits { + u8 reserved_at_0[0x2]; + u8 geneve_opt_len[0x6]; + u8 geneve_oam[0x1]; + u8 reserved_at_9[0x7]; + u8 geneve_protocol_type[0x10]; + + u8 geneve_vni[0x18]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_gtpu_bits { + u8 gtpu_msg_flags[0x8]; + u8 gtpu_msg_type[0x8]; + u8 reserved_at_10[0x10]; + + u8 gtpu_teid[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_general_purpose_bits { + u8 general_purpose_lookup_field[0x20]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_src_gvmi_qp_bits { + u8 loopback_syndrome[0x8]; + u8 reserved_at_8[0x8]; + u8 source_gvmi[0x10]; + + u8 reserved_at_20[0x5]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 source_is_requestor[0x1]; + u8 source_qp[0x18]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_src_gvmi_qp_v1_bits { + u8 loopback_synd[0x8]; + u8 reserved_at_8[0x7]; + u8 functional_lb[0x1]; + u8 source_gvmi[0x10]; + + u8 force_lb[0x1]; + u8 reserved_at_21[0x1]; + u8 source_is_requestor[0x1]; + u8 reserved_at_23[0x5]; + u8 source_qp[0x18]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_icmp_v1_bits { + u8 icmp_payload_data[0x20]; + + u8 icmp_header_data[0x20]; + + u8 icmp_type[0x8]; + u8 icmp_code[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_def0_v1_bits { + u8 metadata_reg_c_0[0x20]; + + u8 metadata_reg_c_1[0x20]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 ethertype[0x10]; + + u8 reserved_at_60[0x1]; + u8 sx_sniffer[0x1]; + u8 
functional_loopback[0x1]; + u8 ip_frag[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 reserved_at_80[0xa]; + u8 force_loopback[0x1]; + u8 reserved_at_8b[0x3]; + u8 second_vlan_qualifier[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; + + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 inner_ipv4_checksum_ok[0x1]; + u8 inner_l4_checksum_ok[0x1]; + u8 outer_ipv4_checksum_ok[0x1]; + u8 outer_l4_checksum_ok[0x1]; + u8 inner_l3_ok[0x1]; + u8 inner_l4_ok[0x1]; + u8 outer_l3_ok[0x1]; + u8 outer_l4_ok[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; +}; + +struct mlx5_ifc_ste_def2_v1_bits { + u8 metadata_reg_a[0x20]; + + u8 outer_ip_version[0x4]; + u8 outer_ip_ihl[0x4]; + u8 outer_ip_dscp[0x6]; + u8 outer_ip_ecn[0x2]; + u8 outer_ip_ttl[0x8]; + u8 outer_ip_protocol[0x8]; + + u8 outer_ip_identification[0x10]; + u8 outer_ip_flags[0x3]; + u8 outer_ip_fragment_offset[0xd]; + + u8 outer_ip_total_length[0x10]; + u8 outer_ip_checksum[0x10]; + + u8 reserved_180[0xc]; + u8 outer_ip_flow_label[0x14]; + + u8 outer_eth_packet_length[0x10]; + u8 outer_ip_payload_length[0x10]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 outer_data_offset[0x4]; + u8 reserved_1e4[0x2]; + u8 outer_ip_frag[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 outer_ip_frag_first[0x1]; + u8 reserved_1f0[0x7]; + u8 inner_ipv4_checksum_ok[0x1]; + u8 inner_l4_checksum_ok[0x1]; + u8 outer_ipv4_checksum_ok[0x1]; + u8 outer_l4_checksum_ok[0x1]; + u8 inner_l3_ok[0x1]; + u8 inner_l4_ok[0x1]; + u8 outer_l3_ok[0x1]; + u8 outer_l4_ok[0x1]; +}; + +struct mlx5_ifc_ste_def6_v1_bits { + u8 dst_ipv6_127_96[0x20]; + + u8 dst_ipv6_95_64[0x20]; + + u8 dst_ipv6_63_32[0x20]; + + u8 dst_ipv6_31_0[0x20]; + + u8 reserved_at_80[0x40]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 reserved_e0[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 ip_frag[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 reserved_f0[0x10]; +}; + +struct mlx5_ifc_ste_def16_v1_bits { + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 tunnel_header_2[0x20]; + + u8 tunnel_header_3[0x20]; + + u8 random_number[0x10]; + u8 reserved_90[0x10]; + + u8 metadata_reg_a[0x20]; + + u8 reserved_c0[0x8]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 outer_first_vlan_type[0x2]; + u8 reserved_ce[0x1]; + u8 functional_lb[0x1]; + u8 source_gvmi[0x10]; + + u8 force_lb[0x1]; + u8 outer_ip_frag[0x1]; + u8 source_is_requester[0x1]; + u8 reserved_e3[0x5]; + u8 source_sqn[0x18]; +}; + +struct mlx5_ifc_ste_def22_v1_bits { + u8 outer_ip_src_addr[0x20]; + + u8 outer_ip_dst_addr[0x20]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 reserved_at_40[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_loopback[0x1]; + u8 outer_ip_frag[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 
metadata_reg_c_0[0x20]; + + u8 outer_dmac_47_16[0x20]; + + u8 outer_smac_47_16[0x20]; + + u8 outer_smac_15_0[0x10]; + u8 outer_dmac_15_0[0x10]; +}; + +struct mlx5_ifc_ste_def24_v1_bits { + u8 metadata_reg_c_2[0x20]; + + u8 metadata_reg_c_3[0x20]; + + u8 metadata_reg_c_0[0x20]; + + u8 metadata_reg_c_1[0x20]; + + u8 outer_ip_src_addr[0x20]; + + u8 outer_ip_dst_addr[0x20]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 inner_ip_protocol[0x8]; + u8 inner_l3_type[0x2]; + u8 inner_l4_type[0x2]; + u8 inner_first_vlan_type[0x2]; + u8 inner_ip_frag[0x1]; + u8 functional_lb[0x1]; + u8 outer_ip_protocol[0x8]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 outer_first_vlan_type[0x2]; + u8 outer_ip_frag[0x1]; + u8 functional_lb_dup[0x1]; +}; + +struct mlx5_ifc_ste_def25_v1_bits { + u8 inner_ip_src_addr[0x20]; + + u8 inner_ip_dst_addr[0x20]; + + u8 inner_l4_sport[0x10]; + u8 inner_l4_dport[0x10]; + + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 reserved_at_a0[0x20]; + + u8 port_number_dup[0x2]; + u8 inner_l3_type[0x2]; + u8 inner_l4_type[0x2]; + u8 inner_first_vlan_type[0x2]; + u8 port_number[0x2]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 outer_first_vlan_type[0x2]; + u8 outer_l4_dport[0x10]; + + u8 reserved_at_e0[0x20]; +}; + +struct mlx5_ifc_ste_def26_v1_bits { + u8 src_ipv6_127_96[0x20]; + + u8 src_ipv6_95_64[0x20]; + + u8 src_ipv6_63_32[0x20]; + + u8 src_ipv6_31_0[0x20]; + + u8 reserved_at_80[0x3]; + u8 ip_frag[0x1]; + u8 reserved_at_84[0x6]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_type[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 reserved_at_a0[0xb]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_type[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; + + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 ip_porotcol[0x8]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; +}; + +struct mlx5_ifc_ste_def28_v1_bits { + u8 inner_l4_sport[0x10]; + u8 inner_l4_dport[0x10]; + + u8 flex_gtpu_teid[0x20]; + + u8 inner_ip_src_addr[0x20]; + + u8 inner_ip_dst_addr[0x20]; + + u8 outer_ip_src_addr[0x20]; + + u8 outer_ip_dst_addr[0x20]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 inner_ip_protocol[0x8]; + u8 inner_l3_type[0x2]; + u8 inner_l4_type[0x2]; + u8 inner_first_vlan_type[0x2]; + u8 inner_ip_frag[0x1]; + u8 functional_lb[0x1]; + u8 outer_ip_protocol[0x8]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 outer_first_vlan_type[0x2]; + u8 outer_ip_frag[0x1]; + u8 functional_lb_dup[0x1]; +}; + +struct mlx5_ifc_set_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x3]; + u8 offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_add_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x10]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_copy_action_in_bits { + u8 action_type[0x4]; + u8 src_field[0xc]; + u8 reserved_at_10[0x3]; + u8 src_offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 reserved_at_20[0x4]; + u8 dst_field[0xc]; + u8 reserved_at_30[0x3]; + u8 dst_offset[0x5]; + u8 reserved_at_38[0x8]; +}; + +enum { + MLX5_ACTION_TYPE_SET = 0x1, + MLX5_ACTION_TYPE_ADD = 0x2, + MLX5_ACTION_TYPE_COPY = 0x3, +}; + +enum { + MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16 = 0x1, + MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0 = 0x2, + 
MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE = 0x3, + MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16 = 0x4, + MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0 = 0x5, + MLX5_ACTION_IN_FIELD_OUT_IP_DSCP = 0x6, + MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS = 0x7, + MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT = 0x8, + MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT = 0x9, + MLX5_ACTION_IN_FIELD_OUT_IP_TTL = 0xa, + MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT = 0xb, + MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT = 0xc, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96 = 0xd, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64 = 0xe, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32 = 0xf, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0 = 0x10, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96 = 0x11, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64 = 0x12, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32 = 0x13, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14, + MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15, + MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16, + MLX5_ACTION_IN_FIELD_OUT_FIRST_VID = 0x17, + MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA = 0x49, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB = 0x50, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0 = 0x51, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1 = 0x52, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2 = 0x53, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3 = 0x54, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4 = 0x55, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5 = 0x56, + MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM = 0x59, + MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM = 0x5B, + MLX5_ACTION_IN_FIELD_OUT_GTPU_TEID = 0x6E, +}; + +struct mlx5_ifc_dctc_bits { + u8 reserved_at_0[0x1d]; + u8 data_in_order[0x1]; + u8 reserved_at_1e[0x362]; +}; + +struct mlx5_ifc_packet_reformat_context_in_bits { + u8 reserved_at_0[0x5]; + u8 reformat_type[0x3]; + u8 reserved_at_8[0xe]; + u8 reformat_data_size[0xa]; + + u8 reserved_at_20[0x10]; + u8 reformat_data[2][0x8]; + + u8 more_reformat_data[0][0x8]; +}; + +struct mlx5_ifc_alloc_packet_reformat_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context; +}; + +struct mlx5_ifc_alloc_packet_reformat_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 packet_reformat_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_packet_reformat_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_20[0x10]; + u8 op_mod[0x10]; + + u8 packet_reformat_id[0x20]; + + u8 reserved_60[0x20]; +}; + +struct mlx5_ifc_dealloc_packet_reformat_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum reformat_type { + MLX5_REFORMAT_TYPE_L2_TO_VXLAN = 0x0, + MLX5_REFORMAT_TYPE_L2_TO_NVGRE = 0x1, + MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2, + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, + MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, +}; + +struct mlx5_ifc_alloc_flow_counter_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_flow_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 flow_counter_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_flow_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 flow_counter_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_OBJ_TYPE_FLOW_METER = 0x000a, + 
MLX5_OBJ_TYPE_DEK = 0x000C, + MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, + MLX5_OBJ_TYPE_CRYPTO_LOGIN = 0x001F, + MLX5_OBJ_TYPE_FLOW_SAMPLER = 0x0020, + MLX5_OBJ_TYPE_ASO_FLOW_METER = 0x0024, + MLX5_OBJ_TYPE_ASO_FIRST_HIT = 0x0025, + MLX5_OBJ_TYPE_SCHEDULING_ELEMENT = 0x0026, + MLX5_OBJ_TYPE_RESERVED_QPN = 0x002C, + MLX5_OBJ_TYPE_ASO_CT = 0x0031, + MLX5_OBJ_TYPE_AV_QP_MAPPING = 0x003A, +}; + +struct mlx5_ifc_general_obj_in_cmd_hdr_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 obj_type[0x10]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x3]; + u8 log_obj_range[0x5]; + u8 reserved_at_68[0x18]; +}; + +struct mlx5_ifc_general_obj_out_cmd_hdr_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_flow_meter_bits { + u8 modify_field_select[0x40]; + + u8 active[0x1]; + u8 reserved_at_41[0x3]; + u8 return_reg_id[0x4]; + u8 table_type[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x8]; + u8 destination_table_id[0x18]; + + u8 reserved_at_80[0x80]; + + u8 flow_meter_params[0x100]; + + u8 reserved_at_180[0x180]; + + u8 sw_steering_icm_address_rx[0x40]; + u8 sw_steering_icm_address_tx[0x40]; +}; + +struct mlx5_ifc_create_flow_meter_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_meter_bits meter; +}; + +struct mlx5_ifc_query_flow_meter_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_meter_bits obj; +}; + +struct mlx5_ifc_flow_sampler_bits { + u8 modify_field_select[0x40]; + + u8 table_type[0x8]; + u8 level[0x8]; + u8 reserved_at_50[0xf]; + u8 ignore_flow_level[0x1]; + + u8 sample_ratio[0x20]; + + u8 reserved_at_80[0x8]; + u8 sample_table_id[0x18]; + + u8 reserved_at_a0[0x8]; + u8 default_table_id[0x18]; + + u8 sw_steering_icm_address_rx[0x40]; + u8 sw_steering_icm_address_tx[0x40]; +}; + +struct mlx5_ifc_create_flow_sampler_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_sampler_bits sampler; +}; + +struct mlx5_ifc_query_flow_sampler_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_sampler_bits obj; +}; + +struct mlx5_ifc_definer_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x10]; + u8 format_id[0x10]; + + u8 reserved_at_60[0x160]; + + u8 ctrl[0xA0]; + u8 match_mask_dw_11_8[0x60]; + u8 match_mask_dw_7_0[0x100]; +}; + +struct mlx5_ifc_create_definer_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_definer_bits definer; +}; + +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_at_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_at_8[0x18]; + + u8 reserved_at_20[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 cvlan_id[0xc]; + + u8 reserved_at_40[0x720]; + u8 sw_steering_vport_icm_address_rx[0x40]; + u8 sw_steering_vport_icm_address_tx[0x40]; +}; + +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 
reserved_at_60[0x20]; +}; + +struct mlx5_ifc_nic_vport_context_bits { + u8 reserved_at_0[0x1f]; + u8 roce_en[0x1]; + + u8 reserved_at_20[0x7e0]; +}; + +struct mlx5_ifc_query_nic_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_nic_vport_context_bits nic_vport_context; +}; + +struct mlx5_ifc_query_nic_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_QPC_ST_RC = 0x0, +}; + +enum { + MLX5_QPC_PM_STATE_MIGRATED = 0x3, +}; + +struct mlx5_ifc_ud_av_bits { + u8 reserved_at_0[0x60]; + + u8 reserved_at_60[0x4]; + u8 sl_or_eth_prio[0x4]; + u8 reserved_at_68[0x18]; + + u8 reserved_at_80[0x60]; + + u8 reserved_at_e0[0x4]; + u8 src_addr_index[0x8]; + u8 reserved_at_ec[0x14]; + + u8 rgid_or_rip[16][0x8]; +}; + +struct mlx5_ifc_ads_bits { + u8 fl[0x1]; + u8 free_ar[0x1]; + u8 reserved_at_2[0xe]; + u8 pkey_index[0x10]; + + u8 reserved_at_20[0x8]; + u8 grh[0x1]; + u8 mlid[0x7]; + u8 rlid[0x10]; + + u8 ack_timeout[0x5]; + u8 reserved_at_45[0x3]; + u8 src_addr_index[0x8]; + u8 reserved_at_50[0x4]; + u8 stat_rate[0x4]; + u8 hop_limit[0x8]; + + u8 reserved_at_60[0x4]; + u8 tclass[0x8]; + u8 flow_label[0x14]; + + u8 rgid_rip[16][0x8]; + + u8 reserved_at_100[0x4]; + u8 f_dscp[0x1]; + u8 f_ecn[0x1]; + u8 reserved_at_106[0x1]; + u8 f_eth_prio[0x1]; + u8 ecn[0x2]; + u8 dscp[0x6]; + u8 udp_sport[0x10]; + + u8 dei_cfi[0x1]; + u8 eth_prio[0x3]; + u8 sl[0x4]; + u8 vhca_port_num[0x8]; + u8 rmac_47_32[0x10]; + + u8 rmac_31_0[0x20]; +}; + +enum { + MLX5_QPC_STATE_SQDRAINED = 0x5, +}; + +enum { + MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT = 0x1, + MLX5_QPC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + +struct mlx5_ifc_qpc_bits { + u8 state[0x4]; + u8 lag_tx_port_affinity[0x4]; + u8 st[0x8]; + u8 reserved_at_10[0x2]; + u8 isolate_vl_tc[0x1]; + u8 pm_state[0x2]; + u8 reserved_at_15[0x1]; + u8 req_e2e_credit_mode[0x2]; + u8 offload_type[0x4]; + u8 end_padding_mode[0x2]; + u8 reserved_at_1e[0x2]; + + u8 wq_signature[0x1]; + u8 block_lb_mc[0x1]; + u8 atomic_like_write_en[0x1]; + u8 latency_sensitive[0x1]; + u8 reserved_at_24[0x1]; + u8 drain_sigerr[0x1]; + u8 reserved_at_26[0x2]; + u8 pd[0x18]; + + u8 mtu[0x3]; + u8 log_msg_max[0x5]; + u8 reserved_at_48[0x1]; + u8 log_rq_size[0x4]; + u8 log_rq_stride[0x3]; + u8 no_sq[0x1]; + u8 log_sq_size[0x4]; + u8 reserved_at_55[0x3]; + u8 ts_format[0x2]; + u8 data_in_order[0x1]; + u8 rlky[0x1]; + u8 ulp_stateless_offload_mode[0x4]; + + u8 counter_set_id[0x8]; + u8 uar_page[0x18]; + + u8 reserved_at_80[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_a0[0x3]; + u8 log_page_size[0x5]; + u8 remote_qpn[0x18]; + + struct mlx5_ifc_ads_bits primary_address_path; + + struct mlx5_ifc_ads_bits secondary_address_path; + + u8 log_ack_req_freq[0x4]; + u8 reserved_at_384[0x4]; + u8 log_sra_max[0x3]; + u8 reserved_at_38b[0x2]; + u8 retry_count[0x3]; + u8 rnr_retry[0x3]; + u8 reserved_at_393[0x1]; + u8 fre[0x1]; + u8 cur_rnr_retry[0x3]; + u8 cur_retry_count[0x3]; + u8 reserved_at_39b[0x5]; + + u8 reserved_at_3a0[0x20]; + + u8 reserved_at_3c0[0x8]; + u8 next_send_psn[0x18]; + + u8 reserved_at_3e0[0x8]; + u8 cqn_snd[0x18]; + + u8 reserved_at_400[0x8]; + u8 deth_sqpn[0x18]; + + u8 reserved_at_420[0x20]; + + u8 reserved_at_440[0x8]; + u8 last_acked_psn[0x18]; + + u8 reserved_at_460[0x8]; + u8 ssn[0x18]; + + u8 reserved_at_480[0x8]; + u8 log_rra_max[0x3]; + u8 
reserved_at_48b[0x1]; + u8 atomic_mode[0x4]; + u8 rre[0x1]; + u8 rwe[0x1]; + u8 rae[0x1]; + u8 reserved_at_493[0x1]; + u8 page_offset[0x6]; + u8 reserved_at_49a[0x3]; + u8 cd_slave_receive[0x1]; + u8 cd_slave_send[0x1]; + u8 cd_master[0x1]; + + u8 reserved_at_4a0[0x3]; + u8 min_rnr_nak[0x5]; + u8 next_rcv_psn[0x18]; + + u8 reserved_at_4c0[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_4e0[0x8]; + u8 cqn_rcv[0x18]; + + u8 dbr_addr[0x40]; + + u8 q_key[0x20]; + + u8 reserved_at_560[0x5]; + u8 rq_type[0x3]; + u8 srqn_rmpn_xrqn[0x18]; + + u8 reserved_at_580[0x8]; + u8 rmsn[0x18]; + + u8 hw_sq_wqebb_counter[0x10]; + u8 sw_sq_wqebb_counter[0x10]; + + u8 hw_rq_counter[0x20]; + + u8 sw_rq_counter[0x20]; + + u8 reserved_at_600[0x20]; + + u8 reserved_at_620[0xf]; + u8 cgs[0x1]; + u8 cs_req[0x8]; + u8 cs_res[0x8]; + + u8 dc_access_key[0x40]; + + u8 reserved_at_680[0x3]; + u8 dbr_umem_valid[0x1]; + + u8 reserved_at_684[0x9c]; + + u8 dbr_umem_id[0x20]; +}; + +struct mlx5_ifc_qpc_ext_bits { + u8 reserved_at_0[0x2]; + u8 mmo[0x1]; + u8 reserved_at_3[0xd]; + u8 dci_stream_channel_id[0x10]; + + u8 qos_queue_group_id_requester[0x20]; + + u8 qos_queue_group_id_responder[0x20]; + + u8 reserved_at_60[0x5a0]; +}; + +struct mlx5_ifc_create_tir_out_bits { + u8 status[0x8]; + u8 icm_address_63_40[0x18]; + + u8 syndrome[0x20]; + + u8 icm_address_39_32[0x8]; + u8 tirn[0x18]; + + u8 icm_address_31_0[0x20]; +}; + +struct mlx5_ifc_destroy_tir_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 tirn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x40]; + + u8 wq_umem_id[0x20]; + + u8 wq_umem_valid[0x1]; + u8 reserved_at_861[0x1f]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_destroy_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +enum mlx5_qpc_opt_mask_32 { + MLX5_QPC_OPT_MASK_32_DCI_STREAM_CHANNEL_ID = 1 << 0, + MLX5_QPC_OPT_MASK_32_QOS_QUEUE_GROUP_ID = 1 << 1, + MLX5_QPC_OPT_MASK_32_UDP_SPORT = 1 << 2, +}; + +enum mlx5_qpc_opt_mask { + MLX5_QPC_OPT_MASK_INIT2INIT_DRAIN_SIGERR = 1 << 11, + MLX5_QPC_OPT_MASK_RTS2RTS_LAG_TX_PORT_AFFINITY = 1 << 15, + MLX5_QPC_OPT_MASK_INIT2INIT_MMO = 1 << 25, +}; + +struct mlx5_ifc_init2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init2init_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 qpc_ext[0x1]; + u8 reserved_at_41[0x7]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x40]; + + u8 opt_param_mask_95_32[0x40]; + + struct mlx5_ifc_qpc_ext_bits qpc_data_ext; +}; + +struct mlx5_ifc_init2rtr_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init2rtr_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 
reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rtr2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rtr2rts_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rst2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rst2init_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rts2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rts2rts_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 qpc_ext[0x1]; + u8 reserved_at_41[0x7]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x40]; + + u8 opt_param_mask_95_32[0x40]; + + struct mlx5_ifc_qpc_ext_bits qpc_data_ext; +}; + +struct mlx5_ifc_query_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_dct_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_dctc_bits dctc; +}; + +struct mlx5_ifc_query_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_tisc_bits { + u8 strict_lag_tx_port_affinity[0x1]; + u8 tls_en[0x1]; + u8 reserved_at_2[0x2]; + u8 lag_tx_port_affinity[0x04]; + + u8 reserved_at_8[0x4]; + u8 prio[0x4]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x100]; + + u8 reserved_at_120[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_140[0x8]; + u8 underlay_qpn[0x18]; + + u8 reserved_at_160[0x8]; + u8 pd[0x18]; + + u8 reserved_at_180[0x380]; +}; + +struct mlx5_ifc_query_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_tisc_bits tis_context; +}; + +struct mlx5_ifc_query_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_lagc_bits { + u8 reserved_at_0[0x1d]; + u8 lag_state[0x3]; + + u8 reserved_at_20[0x14]; + u8 tx_remap_affinity_2[0x4]; + u8 
reserved_at_38[0x4]; + u8 tx_remap_affinity_1[0x4]; +}; + +struct mlx5_ifc_query_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_av_qp_mapping_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x20]; + + u8 qpn[0x20]; + + struct mlx5_ifc_ud_av_bits remote_address_vector; +}; + +struct mlx5_ifc_create_av_qp_mapping_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_av_qp_mapping_bits mapping; +}; + +struct mlx5_ifc_query_av_qp_mapping_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_av_qp_mapping_bits obj; +}; + + +struct mlx5_ifc_modify_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_tis_bitmask_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x1d]; + u8 lag_tx_port_affinity[0x1]; + u8 strict_lag_tx_port_affinity[0x1]; + u8 prio[0x1]; +}; + +struct mlx5_ifc_modify_tis_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_modify_tis_bitmask_bits bitmask; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_tisc_bits ctx; +}; + +enum roce_version { + MLX5_ROCE_VERSION_1 = 0, + MLX5_ROCE_VERSION_2 = 2, +}; + +struct mlx5_ifc_roce_addr_layout_bits { + u8 source_l3_address[16][0x8]; + + u8 reserved_at_80[0x3]; + u8 vlan_valid[0x1]; + u8 vlan_id[0xc]; + u8 source_mac_47_32[0x10]; + + u8 source_mac_31_0[0x20]; + + u8 reserved_at_c0[0x14]; + u8 roce_l3_type[0x4]; + u8 roce_version[0x8]; + + u8 reserved_at_e0[0x20]; +}; + +struct mlx5_ifc_query_roce_address_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_roce_addr_layout_bits roce_address; +}; + +struct mlx5_ifc_query_roce_address_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 roce_address_index[0x10]; + u8 reserved_at_50[0xc]; + u8 vhca_port_num[0x4]; + + u8 reserved_at_60[0x20]; +}; + +/* Both HW set and HW add share the same HW format with different opcodes */ +struct mlx5_ifc_dr_action_hw_set_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x3]; + u8 destination_length[0x5]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_dr_action_hw_copy_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_field_code[0x8]; + u8 reserved_at_30[0x2]; + u8 source_left_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + +struct mlx5_ifc_host_params_context_bits { + u8 host_number[0x8]; + u8 reserved_at_8[0x6]; + u8 host_pf_vhca_id_valid[0x1]; + u8 host_pf_disabled[0x1]; + u8 host_num_of_vfs[0x10]; + + u8 host_total_vfs[0x10]; + u8 host_pci_bus[0x10]; + + u8 host_pf_vhca_id[0x10]; + u8 host_pci_device[0x10]; + + u8 reserved_at_60[0x10]; + u8 host_pci_function[0x10]; + + u8 reserved_at_80[0x180]; +}; + +struct mlx5_ifc_query_esw_functions_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 
op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_query_esw_functions_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_host_params_context_bits host_params_context; + + u8 reserved_at_280[0x180]; + u8 host_sf_enable[0][0x40]; +}; + +struct mlx5_ifc_create_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x1f40]; +}; + +struct mlx5_ifc_create_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 group_id[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 group_id[0x20]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_dest_format_bits { + u8 destination_type[0x8]; + u8 destination_id[0x18]; + + u8 reserved_at_20[0x1]; + u8 packet_reformat[0x1]; + u8 reserved_at_22[0x1e]; +}; + +struct mlx5_ifc_extended_dest_format_bits { + struct mlx5_ifc_dest_format_bits destination_entry; + + u8 packet_reformat_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_flow_counter_list_bits { + u8 flow_counter_id[0x20]; + + u8 reserved_at_20[0x20]; +}; + +union mlx5_ifc_dest_format_flow_counter_list_auto_bits { + struct mlx5_ifc_dest_format_bits dest_format; + struct mlx5_ifc_flow_counter_list_bits flow_counter_list; + u8 reserved_at_0[0x40]; +}; + +struct mlx5_ifc_flow_context_bits { + u8 reserved_at_00[0x20]; + + u8 group_id[0x20]; + + u8 reserved_at_40[0x8]; + u8 flow_tag[0x18]; + + u8 reserved_at_60[0x10]; + u8 action[0x10]; + + u8 extended_destination[0x1]; + u8 reserved_at_81[0x7]; + u8 destination_list_size[0x18]; + + u8 reserved_at_a0[0x8]; + u8 flow_counter_list_size[0x18]; + + u8 reserved_at_c0[0x1740]; + + union mlx5_ifc_dest_format_flow_counter_list_auto_bits destination[0]; +}; + +struct mlx5_ifc_set_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x40]; + u8 flow_index[0x20]; + + u8 reserved_at_120[0xe0]; + struct mlx5_ifc_flow_context_bits flow_context; +}; + +struct mlx5_ifc_set_fte_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum dr_devx_flow_dest_type { + MLX5_FLOW_DEST_TYPE_VPORT = 0x0, + MLX5_FLOW_DEST_TYPE_FT = 0x1, + MLX5_FLOW_DEST_TYPE_TIR = 0x2, + + MLX5_FLOW_DEST_TYPE_COUNTER = 0x100, +}; + +enum { + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4, + MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, +}; + +enum { + MLX5_QPC_PAGE_OFFSET_QUANTA = 64, +}; + +enum { + MLX5_ASO_FIRST_HIT_NUM_PER_OBJ = 512, + MLX5_ASO_FLOW_METER_NUM_PER_OBJ = 2, + MLX5_ASO_CT_NUM_PER_OBJ = 1, +}; + +enum mlx5_sched_hierarchy_type { + MLX5_SCHED_HIERARCHY_NIC = 3, +}; + +enum mlx5_sched_elem_type { + 
MLX5_SCHED_ELEM_TYPE_TSAR = 0x0, + MLX5_SCHED_ELEM_TYPE_VPORT = 0x1, + MLX5_SCHED_ELEM_TYPE_VPORT_TC = 0x2, + MLX5_SCHED_ELEM_TYPE_PARA_VPORT_TC = 0x3, + MLX5_SCHED_ELEM_TYPE_QUEUE_GROUP = 0x4, +}; + +enum mlx5_sched_tsar_type { + MLX5_SCHED_TSAR_TYPE_DWRR = 0x0, + MLX5_SCHED_TSAR_TYPE_ROUND_ROBIN = 0x1, + MLX5_SCHED_TSAR_TYPE_ETS = 0x2, +}; + +struct mlx5_ifc_sched_elem_attr_tsar_bits { + u8 reserved_at_0[0x8]; + u8 tsar_type[0x8]; + u8 reserved_at_10[0x10]; +}; + +union mlx5_ifc_sched_elem_attr_bits { + struct mlx5_ifc_sched_elem_attr_tsar_bits tsar; +}; + +struct mlx5_ifc_sched_context_bits { + u8 element_type[0x8]; + u8 reserved_at_8[0x18]; + + union mlx5_ifc_sched_elem_attr_bits sched_elem_attr; + + u8 parent_element_id[0x20]; + + u8 reserved_at_60[0x40]; + + u8 bw_share[0x20]; + + u8 max_average_bw[0x20]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_sched_elem_bits { + u8 modify_field_select[0x40]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 reserved_at_60[0xa0]; + + struct mlx5_ifc_sched_context_bits sched_context; + + u8 reserved_at_300[0x100]; +}; + +struct mlx5_ifc_create_sched_elem_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_sched_elem_bits sched_elem; +}; + +struct mlx5_ifc_create_modify_elem_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_sched_elem_bits sched_elem; +}; + +enum { + MLX5_SQC_STATE_RDY = 0x1, +}; + +struct mlx5_ifc_sqc_bits { + u8 reserved_at_0[0x8]; + u8 state[0x4]; + u8 reserved_at_c[0x14]; + + u8 reserved_at_20[0xe0]; + + u8 reserved_at_100[0x10]; + u8 qos_queue_group_id[0x10]; + + u8 reserved_at_120[0x660]; +}; + +enum { + MLX5_MODIFY_SQ_BITMASK_QOS_QUEUE_GROUP_ID = 1 << 2, +}; + +struct mlx5_ifc_modify_sq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_sq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 sq_state[0x4]; + u8 reserved_at_44[0x4]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_sqc_bits sq_context; +}; + +struct mlx5_ifc_reserved_qpn_bits { + u8 reserved_at_0[0x80]; +}; + +struct mlx5_ifc_create_reserved_qpn_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_reserved_qpn_bits rqpns; +}; + +struct mlx5_ifc_create_psv_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x8]; + u8 psv0_index[0x18]; + + u8 reserved_at_a0[0x8]; + u8 psv1_index[0x18]; + + u8 reserved_at_c0[0x8]; + u8 psv2_index[0x18]; + + u8 reserved_at_e0[0x8]; + u8 psv3_index[0x18]; +}; + +struct mlx5_ifc_create_psv_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 num_psv[0x4]; + u8 reserved_at_44[0x4]; + u8 pd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_psv_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 psvn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_mbox_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_mbox_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_enable_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 
reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_enable_hca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_query_issi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 current_issi[0x10]; + + u8 reserved_at_60[0xa0]; + + u8 reserved_at_100[76][0x8]; + u8 supported_issi_dw0[0x20]; +}; + +struct mlx5_ifc_query_issi_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_issi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_issi_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 current_issi[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_pages_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 embedded_cpu_function[0x01]; + u8 reserved_bits[0x0f]; + u8 function_id[0x10]; + + u8 num_pages[0x20]; +}; + +struct mlx5_ifc_query_pages_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_manage_pages_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 output_num_entries[0x20]; + + u8 reserved_at_60[0x20]; + + u8 pas[][0x40]; +}; + +struct mlx5_ifc_manage_pages_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 embedded_cpu_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + u8 input_num_entries[0x20]; + + u8 pas[][0x40]; +}; + +enum { + MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL = 0x1, +}; + +struct mlx5_ifc_teardown_hca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x3f]; + + u8 state[0x1]; +}; + +enum { + MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE = 0x0, + MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN = 0x2, +}; + +struct mlx5_ifc_teardown_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 profile[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_init_hca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_access_register_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 register_data[][0x20]; +}; + +struct mlx5_ifc_access_register_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 register_id[0x10]; + + u8 argument[0x20]; + + u8 register_data[][0x20]; +}; + +struct mlx5_ifc_modify_nic_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_nic_vport_field_select_bits { + u8 reserved_at_0[0x12]; + u8 affiliation[0x1]; + u8 reserved_at_13[0x1]; + 
u8 disable_uc_local_lb[0x1]; + u8 disable_mc_local_lb[0x1]; + u8 node_guid[0x1]; + u8 port_guid[0x1]; + u8 min_inline[0x1]; + u8 mtu[0x1]; + u8 change_event[0x1]; + u8 promisc[0x1]; + u8 permanent_address[0x1]; + u8 addresses_list[0x1]; + u8 roce_en[0x1]; + u8 reserved_at_1f[0x1]; +}; + +struct mlx5_ifc_modify_nic_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_modify_nic_vport_field_select_bits field_select; + + u8 reserved_at_80[0x780]; + + struct mlx5_ifc_nic_vport_context_bits nic_vport_context; +}; + +struct mlx5_ifc_set_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +struct mlx5_ifc_alloc_uar_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 uar[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_uar_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_uar_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_uar_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 uar[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_eqc_bits { + u8 status[0x4]; + u8 reserved_at_4[0x9]; + u8 ec[0x1]; + u8 oi[0x1]; + u8 reserved_at_f[0x5]; + u8 st[0x4]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x14]; + u8 page_offset[0x6]; + u8 reserved_at_5a[0x6]; + + u8 reserved_at_60[0x3]; + u8 log_eq_size[0x5]; + u8 uar_page[0x18]; + + u8 reserved_at_80[0x20]; + + u8 reserved_at_a0[0x18]; + u8 intr[0x8]; + + u8 reserved_at_c0[0x3]; + u8 log_page_size[0x5]; + u8 reserved_at_c8[0x18]; + + u8 reserved_at_e0[0x60]; + + u8 reserved_at_140[0x8]; + u8 consumer_counter[0x18]; + + u8 reserved_at_160[0x8]; + u8 producer_counter[0x18]; + + u8 reserved_at_180[0x80]; +}; + +struct mlx5_ifc_create_eq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x18]; + u8 eq_number[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_eq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_eqc_bits eq_context_entry; + + u8 reserved_at_280[0x40]; + + u8 event_bitmask[4][0x40]; + + u8 reserved_at_3c0[0x4c0]; + + u8 pas[][0x40]; +}; + +struct mlx5_ifc_destroy_eq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_eq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 eq_number[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_pd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 pd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct 
mlx5_ifc_alloc_pd_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_pd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_pd_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 pd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_mtt_bits { + u8 ptag_63_32[0x20]; + + u8 ptag_31_8[0x18]; + u8 reserved_at_38[0x6]; + u8 wr_en[0x1]; + u8 rd_en[0x1]; +}; + +struct mlx5_ifc_umem_bits { + u8 reserved_at_0[0x80]; + + u8 reserved_at_80[0x1b]; + u8 log_page_size[0x5]; + + u8 page_offset[0x20]; + + u8 num_of_mtt[0x40]; + + struct mlx5_ifc_mtt_bits mtt[]; +}; + +struct mlx5_ifc_create_umem_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_umem_bits umem; +}; + +struct mlx5_ifc_create_umem_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 umem_id[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_umem_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 umem_id[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_umem_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_delete_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x40]; + + u8 flow_index[0x20]; + + u8 reserved_at_120[0xe0]; +}; + +struct mlx5_ifc_create_cq_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_cq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_transport_domain_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_transport_domain_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_rmp_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 rmpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_rmp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 rmpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_sq_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_sq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_rq_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 rqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_rq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 
reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 rqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_rqt_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 rqtn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_rqt_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 rqtn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_tis_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_tis_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_q_counter_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x18]; + u8 counter_set_id[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_q_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x18]; + u8 counter_set_id[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_modify_header_context_out_bits { + u8 reserved_at_0[0x40]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_scheduling_element_out_bits { + u8 reserved_at_0[0x80]; + + u8 scheduling_element_id[0x20]; + + u8 reserved_at_a0[0x160]; +}; + +struct mlx5_ifc_create_scheduling_element_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 reserved_at_60[0x3a0]; +}; + +struct mlx5_ifc_destroy_scheduling_element_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 scheduling_element_id[0x20]; + + u8 reserved_at_80[0x180]; +}; + +struct mlx5_ifc_add_vxlan_udp_dport_in_bits { + u8 reserved_at_0[0x60]; + + u8 reserved_at_60[0x10]; + u8 vxlan_udp_port[0x10]; +}; + +struct mlx5_ifc_delete_vxlan_udp_dport_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x40]; + + u8 reserved_at_60[0x10]; + u8 vxlan_udp_port[0x10]; +}; + +struct mlx5_ifc_set_l2_table_entry_in_bits { + u8 reserved_at_0[0xa0]; + + u8 reserved_at_a0[0x8]; + u8 table_index[0x18]; + + u8 reserved_at_c0[0x140]; + +}; + +struct mlx5_ifc_delete_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x80]; + + u8 reserved_at_a0[0x8]; + u8 table_index[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_create_srq_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_srq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_xrc_srq_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_dct_out_bits { + u8 reserved_at_0[0x40]; + + u8 
reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 ece[0x20]; +}; + +struct mlx5_ifc_destroy_dct_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_xrq_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_xrq_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_attach_to_mcg_in_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 multicast_gid[16][0x8]; +}; + +struct mlx5_ifc_detach_from_mcg_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 multicast_gid[16][0x8]; +}; + +struct mlx5_ifc_alloc_xrcd_out_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_xrcd_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_CRYPTO_LOGIN_OBJ_STATE_VALID = 0x0, + MLX5_CRYPTO_LOGIN_OBJ_STATE_INVALID = 0x1, +}; + +struct mlx5_ifc_crypto_login_obj_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x4]; + u8 state[0x4]; + u8 credential_pointer[0x18]; + + u8 reserved_at_a0[0x8]; + u8 session_import_kek_ptr[0x18]; + + u8 reserved_at_c0[0x140]; + + u8 credential[12][0x20]; + + u8 reserved_at_380[0x480]; +}; + +struct mlx5_ifc_create_crypto_login_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_crypto_login_obj_bits login_obj; +}; + +struct mlx5_ifc_query_crypto_login_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_crypto_login_obj_bits obj; +}; + +enum { + MLX5_ENCRYPTION_KEY_OBJ_STATE_READY = 0x0, + MLX5_ENCRYPTION_KEY_OBJ_STATE_ERROR = 0x1, +}; + +enum { + MLX5_ENCRYPTION_KEY_OBJ_KEY_SIZE_SIZE_128 = 0x0, + MLX5_ENCRYPTION_KEY_OBJ_KEY_SIZE_SIZE_256 = 0x1, +}; + +enum { + MLX5_ENCRYPTION_KEY_OBJ_KEY_PURPOSE_AES_XTS = 0x3, +}; + +struct mlx5_ifc_encryption_key_obj_bits { + u8 modify_field_select[0x40]; + + u8 state[0x8]; + u8 reserved_at_48[0xc]; + u8 key_size[0x4]; + u8 has_keytag[0x1]; + u8 reserved_at_59[0x3]; + u8 key_purpose[0x4]; + + u8 reserved_at_60[0x8]; + u8 pd[0x18]; + + u8 reserved_at_80[0x100]; + + u8 opaque[0x40]; + + u8 reserved_at_1c0[0x40]; + + u8 key[32][0x20]; + + u8 reserved_at_600[0x200]; +}; + +struct mlx5_ifc_create_encryption_key_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_encryption_key_obj_bits key_obj; +}; + +struct mlx5_ifc_query_encryption_key_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_encryption_key_obj_bits obj; +}; + +enum { + MLX5_ENCRYPTION_ORDER_ENCRYPTED_WIRE_SIGNATURE = 0x0, + MLX5_ENCRYPTION_ORDER_ENCRYPTED_MEMORY_SIGNATURE = 0x1, + MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_WIRE = 0x2, + MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_MEMORY = 0x3, +}; + +enum { + MLX5_ENCRYPTION_STANDARD_AES_XTS = 0x0, +}; + +#endif /* MLX5_IFC_H */ diff --git a/src/utils.hpp b/src/utils.hpp index dccb125..4416e99 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -32,6 +32,27 @@ #endif #include <inttypes.h> // to pull PRIx64 #include +#include <math.h> +#include <stddef.h>
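+// <math.h> provides log2()/ceil() for GDS_ILOG2 below; <stddef.h> provides
+// offsetof() for container_of. The #ifndef guards defer to definitions
+// already supplied by other headers.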
+ +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + void *__mptr = (void *)(ptr); \ + ((type *)((uintptr_t)__mptr - offsetof(type, member))); }) + +#endif + +#ifndef MIN +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#endif + +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif // internal assert function @@ -77,6 +98,23 @@ static inline T gds_atomic_get(T *ptr) #define ROUND_UP(V,SIZE) (((V)+(SIZE)-1)/(SIZE)*(SIZE)) +#define GDS_ROUND_UP_POW2(_n) \ + ({ \ + typeof(_n) pow2; \ + GDS_ASSERT((_n) >= 1); \ + for (pow2 = 1; pow2 < (_n); pow2 <<= 1); \ + pow2; \ + }) + +#define GDS_ROUND_UP_POW2_OR_0(_n) \ + ( ((_n) == 0) ? 0 : GDS_ROUND_UP_POW2(_n) ) + +#define GDS_ILOG2(_n) \ + ((typeof(_n))ceil(log2((double)(_n)))) + +#define GDS_ILOG2_OR0(_n) \ + ( ((_n) == 0) ? 0 : GDS_ILOG2(_n) ) + //----------------------------------------------------------------------------- //static inline size_t host_page_size() { return sysconf(_SC_PAGESIZE); } @@ -177,7 +215,11 @@ static inline uint32_t gds_qword_hi(uint64_t v) { typedef enum gds_alloc_cq_flags { GDS_ALLOC_CQ_DEFAULT = 0, // default on Host memory GDS_ALLOC_CQ_ON_GPU = 1<<0, - GDS_ALLOC_CQ_MASK = 1<<0 + GDS_ALLOC_CQ_MASK = 1<<0, + + GDS_ALLOC_CQ_DBREC_DEFAULT = 0x0<<2, // default on Host memory + GDS_ALLOC_CQ_DBREC_ON_GPU = 0x1<<2, + GDS_ALLOC_CQ_DBREC_MASK = 0x1<<2 } gds_alloc_cq_flags_t; typedef enum gds_alloc_qp_flags { @@ -185,9 +227,9 @@ typedef enum gds_alloc_qp_flags { GDS_ALLOC_WQ_ON_GPU = 1, GDS_ALLOC_WQ_MASK = 1<<0, - GDS_ALLOC_DBREC_DEFAULT = 0, // default on Host memory - GDS_ALLOC_DBREC_ON_GPU = 1<<4, - GDS_ALLOC_DBREC_MASK = 1<<4 + GDS_ALLOC_WQ_DBREC_DEFAULT = 0x0<<2, // default on Host memory + GDS_ALLOC_WQ_DBREC_ON_GPU = 0x1<<2, + GDS_ALLOC_WQ_DBREC_MASK = 0x1<<2 } gds_alloc_qp_flags_t; #include From 1c302b466bee0497db7d7f036d8aec30fca18104 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Mon, 18 Oct 2021 18:39:02 -0700 Subject: [PATCH 41/50] Connected dv destroy_qp to the transport interface --- src/transports/mlx5-dv/mlx5-dv.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp index 9a39613..35d4a3f 100644 --- a/src/transports/mlx5-dv/mlx5-dv.cpp +++ b/src/transports/mlx5-dv/mlx5-dv.cpp @@ -989,16 +989,24 @@ int gds_mlx5_dv_modify_qp(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_attr *attr, int //----------------------------------------------------------------------------- -void gds_mlx5_dv_destroy_qp(gds_mlx5_dv_qp_t *mdqp) +int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp) { int status = 0; + gds_mlx5_dv_qp_t *mdqp; + gds_peer *peer = NULL; - gds_mlx5_dv_qp_peer_t *mqp_peer = mdqp->qp_peer; + gds_mlx5_dv_qp_peer_t *mqp_peer; + + if (!gqp) + return status; + + mdqp = to_gds_mdv_qp(gqp); - assert(mdqp); assert(mdqp->devx_qp); + mqp_peer = mdqp->qp_peer; + if (mqp_peer) { gds_peer_attr *peer_attr = mqp_peer->peer_attr; gds_peer *peer = peer_from_id(peer_attr->peer_id); @@ -1081,8 +1089,8 @@ int gds_transport_mlx5_dv_init(gds_transport_t **transport) } t->create_qp = gds_mlx5_dv_create_qp; + t->destroy_qp = gds_mlx5_dv_destroy_qp; #if 0 - t->destroy_qp = gds_mlx5_exp_destroy_qp; t->rollback_qp = gds_mlx5_exp_rollback_qp; t->init_send_info = gds_mlx5_exp_init_send_info; From 97a5f819a1207facf37c62fd86fe9cc841d4cd6b Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Mon, 18 Oct 2021 19:02:36 -0700 Subject: [PATCH 42/50] Introduced gds_modify_qp --- 
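gds_modify_qp() forwards to the active transport's modify_qp hook, so callers
drive QP state exactly as they would with ibv_modify_qp(). A minimal sketch of
the intended call pattern for a UD QP, mirroring the call sites converted in
the tests below (example_ud_to_rts is a hypothetical helper; field values are
illustrative):

    static int example_ud_to_rts(gds_qp_t *gqp, uint8_t port, uint32_t my_psn)
    {
            /* INIT: UD QPs take a qkey instead of access flags */
            struct ibv_qp_attr attr = {
                    .qp_state   = IBV_QPS_INIT,
                    .pkey_index = 0,
                    .port_num   = port,
                    .qkey       = 0x11111111
            };

            if (gds_modify_qp(gqp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX |
                              IBV_QP_PORT | IBV_QP_QKEY))
                    return 1;

            /* RTR: for UD only the state change is required */
            attr.qp_state = IBV_QPS_RTR;
            if (gds_modify_qp(gqp, &attr, IBV_QP_STATE))
                    return 1;

            /* RTS: set the initial send PSN */
            attr.qp_state = IBV_QPS_RTS;
            attr.sq_psn = my_psn;
            return gds_modify_qp(gqp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN) ? 1 : 0;
    }

The RC path in gds_kernel_latency.c follows the same shape, with the RTR/RTS
attribute masks it already passed to ibv_modify_qp().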
include/gdsync/core.h | 7 ++++++- src/gdsync.cpp | 14 ++++++++++++++ src/transport.hpp | 1 + src/transports/mlx5-dv/mlx5-dv.cpp | 11 +++++++++-- src/transports/mlx5-exp/mlx5-exp.cpp | 7 +++++++ tests/gds_kernel_latency.c | 10 +++++----- tests/gds_kernel_loopback_latency.c | 6 +++--- 7 files changed, 45 insertions(+), 11 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index e500c93..25e706f 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -74,7 +74,6 @@ typedef struct gds_qp { * Peer QPs require dedicated send and recv CQs, e.g. cannot (easily) * use SRQ. */ - struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_init_attr, int gpu_id, int flags); @@ -85,6 +84,12 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, */ int gds_destroy_qp(struct gds_qp *qp); +/* \brief: Modify a peer-enabled QP + * + * Similar to ibv_modify_qp. + */ +int gds_modify_qp(gds_qp_t *gqp, struct ibv_qp_attr *attr, int attr_mask); + /* \brief: CPU-synchronous post send for peer QPs * * Notes: diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 84a9b05..7f683cf 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1303,6 +1303,20 @@ int gds_destroy_qp(struct gds_qp *gqp) //----------------------------------------------------------------------------- +int gds_modify_qp(gds_qp_t *gqp, struct ibv_qp_attr *attr, int attr_mask) +{ + int ret = 0; + + if (!gqp || !attr) + return EINVAL; + + ret = gds_main_transport->modify_qp(gqp, attr, attr_mask); + + return ret; +} + +//----------------------------------------------------------------------------- + int gds_query_param(gds_param_t param, int *value) { int ret = 0; diff --git a/src/transport.hpp b/src/transport.hpp index 98a62f6..9f857fe 100644 --- a/src/transport.hpp +++ b/src/transport.hpp @@ -36,6 +36,7 @@ typedef struct gds_transport { int (*create_qp)(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp); int (*destroy_qp)(gds_qp_t *gqp); + int (*modify_qp)(gds_qp_t *gqp, struct ibv_qp_attr *attr, int attr_mask); int (*rollback_qp)(gds_qp_t *gqp, gds_send_request_t *request); void (*init_send_info)(gds_send_request_t *request); diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp index 35d4a3f..4d01cb2 100644 --- a/src/transports/mlx5-dv/mlx5-dv.cpp +++ b/src/transports/mlx5-dv/mlx5-dv.cpp @@ -955,13 +955,19 @@ static int gds_mlx5_dv_modify_qp_rtr2rts(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_a //----------------------------------------------------------------------------- -int gds_mlx5_dv_modify_qp(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_attr *attr, int attr_mask) +int gds_mlx5_dv_modify_qp(gds_qp_t *gqp, struct ibv_qp_attr *attr, int attr_mask) { int status = 0; - assert(mdqp); + gds_mlx5_dv_qp_t *mdqp; + + assert(gqp); assert(attr); + mdqp = to_gds_mdv_qp(gqp); + + assert(mdqp->gqp.qp); + if (!(attr_mask & IBV_QP_STATE)) { gds_err("IBV_QP_STATE is required.\n"); status = EINVAL; @@ -1090,6 +1096,7 @@ int gds_transport_mlx5_dv_init(gds_transport_t **transport) t->create_qp = gds_mlx5_dv_create_qp; t->destroy_qp = gds_mlx5_dv_destroy_qp; + t->modify_qp = gds_mlx5_dv_modify_qp; #if 0 t->rollback_qp = gds_mlx5_exp_rollback_qp; diff --git a/src/transports/mlx5-exp/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp index 85f97e2..b093156 100644 --- a/src/transports/mlx5-exp/mlx5-exp.cpp +++ b/src/transports/mlx5-exp/mlx5-exp.cpp @@ -761,6 
+761,12 @@ int gds_mlx5_exp_create_qp( return ret; } +//----------------------------------------------------------------------------- + +int gds_mlx5_exp_modify_qp(gds_qp_t *gqp, struct ibv_qp_attr *attr, int attr_mask) +{ + return ibv_modify_qp(gqp->qp, attr, attr_mask); +} //----------------------------------------------------------------------------- @@ -1231,6 +1237,7 @@ int gds_transport_mlx5_exp_init(gds_transport_t **transport) t->create_qp = gds_mlx5_exp_create_qp; t->destroy_qp = gds_mlx5_exp_destroy_qp; + t->modify_qp = gds_mlx5_exp_modify_qp; t->rollback_qp = gds_mlx5_exp_rollback_qp; t->init_send_info = gds_mlx5_exp_init_send_info; diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index 04370f5..a2fc621 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -304,7 +304,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE }; - if (ibv_modify_qp(ctx->qp, &attr, + if (gds_modify_qp(ctx->gds_qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | @@ -1245,7 +1245,7 @@ int main(int argc, char *argv[]) .qp_state = IBV_QPS_RTR }; - if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE)) { + if (gds_modify_qp(ctx->gds_qp, &attr, IBV_QP_STATE)) { gpu_err("Failed to modify QP to RTR\n"); return 1; } @@ -1255,7 +1255,7 @@ int main(int argc, char *argv[]) attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_dest.psn; - if (ibv_modify_qp(ctx->qp, &attr, + if (gds_modify_qp(ctx->gds_qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { gpu_err("Failed to modify QP to RTS\n"); @@ -1300,7 +1300,7 @@ int main(int argc, char *argv[]) .ah_attr.port_num = ib_port }; - if (ibv_modify_qp(ctx->qp, &attr, (IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU + if (gds_modify_qp(ctx->gds_qp, &attr, (IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC))) { gpu_err("Failed to modify QP to RTR\n"); @@ -1315,7 +1315,7 @@ int main(int argc, char *argv[]) attr.rnr_retry = 7; attr.max_rd_atomic = 1; - if (ibv_modify_qp(ctx->qp, &attr, (IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT + if (gds_modify_qp(ctx->gds_qp, &attr, (IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC))) { gpu_err("Failed to modify QP to RTS\n"); diff --git a/tests/gds_kernel_loopback_latency.c b/tests/gds_kernel_loopback_latency.c index f6ccc32..0dd2ff5 100644 --- a/tests/gds_kernel_loopback_latency.c +++ b/tests/gds_kernel_loopback_latency.c @@ -138,7 +138,7 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, .qp_state = IBV_QPS_RTR }; - if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE)) { + if (gds_modify_qp(ctx->gds_qp, &attr, IBV_QP_STATE)) { fprintf(stderr, "Failed to modify QP to RTR\n"); return 1; } @@ -146,7 +146,7 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; - if (ibv_modify_qp(ctx->qp, &attr, + if (gds_modify_qp(ctx->gds_qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify QP to RTS\n"); @@ -357,7 +357,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, .qkey = 0x11111111 }; - if (ibv_modify_qp(ctx->qp, &attr, + if (gds_modify_qp(ctx->gds_qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | From c053bb3957b0b02a55a5446bbc244e5e42bc6c00 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: 
Mon, 18 Oct 2021 19:50:39 -0700 Subject: [PATCH 43/50] Fixed typo in the help text in gds_kernel_latency --- tests/gds_kernel_latency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index a2fc621..2a4689f 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -886,7 +886,7 @@ static void usage(const char *argv0) printf(" -U, --peersync-desc-apis use batched descriptor APIs (default disabled)\n"); printf(" -Q, --consume-rx-cqe enable GPU consumes RX CQE support (default disabled)\n"); printf(" -T, --time-gds-ops evaluate time needed to execute gds operations using cuda events\n"); - printf(" -k, --qp-kind select IB transport kind used by GDS QPs. (-K 1) for UD, (-K 2) for RC\n"); + printf(" -k, --qp-kind select IB transport kind used by GDS QPs. (-k 1) for UD, (-k 2) for RC\n"); printf(" -M, --gpu-sched-mode set CUDA context sched mode, default (A)UTO, (S)PIN, (Y)IELD, (B)LOCKING\n"); printf(" -E, --gpu-mem allocate GPU intead of CPU memory buffers\n"); printf(" -K, --skip-kernel-launch no GPU kernel computations, only communications\n"); From d4f1f413ddcbbd7c18f8957803daae9a8822ef1a Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Mon, 18 Oct 2021 23:06:35 -0700 Subject: [PATCH 44/50] Implemented CQ buffer allocation on GPU memory using DirectVerbs --- src/transports/mlx5-dv/mlx5-dv.cpp | 212 ++++++++++++++++++++++++++++- src/transports/mlx5-dv/mlx5-dv.hpp | 2 + 2 files changed, 209 insertions(+), 5 deletions(-) diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp index 4d01cb2..f5cc7f3 100644 --- a/src/transports/mlx5-dv/mlx5-dv.cpp +++ b/src/transports/mlx5-dv/mlx5-dv.cpp @@ -51,6 +51,10 @@ //----------------------------------------------------------------------------- +/** + * Create a CQ using DirectVerbs. + * @param pd parent_domain with GPU memory allocation support.
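+ *           Expected to be a parent domain created via
+ *           gds_mlx5_dv_alloc_parent_domain() below, so that the CQ buffer
+ *           and DBR allocations are routed through pd_mem_alloc() and
+ *           pd_mem_free() and can be placed on GPU memory.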
+ */
 static int gds_mlx5_dv_create_cq(
         struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel,
@@ -305,6 +309,188 @@ static void gds_mlx5_dv_destroy_cq(gds_mlx5_dv_cq_t *mcq)

 //-----------------------------------------------------------------------------

+static void *pd_mem_alloc(struct ibv_pd *pd, void *pd_context, size_t size,
+                size_t alignment, uint64_t resource_type)
+{
+        assert(pd_context);
+
+        gds_peer_attr *peer_attr = (gds_peer_attr *)pd_context;
+        gds_peer *peer = peer_from_id(peer_attr->peer_id);
+        uint32_t dir = 0;
+        uint64_t range_id;
+        gds_buf *buf = NULL;
+        void *ptr = NULL;
+
+        gds_dbg("pd_mem_alloc: pd=%p, pd_context=%p, size=%zu, alignment=%zu, resource_type=0x%lx\n",
+                        pd, pd_context, size, alignment, resource_type);
+
+        // Prevent incorrect setting of alloc type
+        assert(!((resource_type == MLX5DV_RES_TYPE_QP || resource_type == MLX5DV_RES_TYPE_SRQ) && peer->alloc_type != gds_peer::WQ));
+        assert(!(resource_type == MLX5DV_RES_TYPE_CQ && peer->alloc_type != gds_peer::CQ));
+        assert(!(resource_type == MLX5DV_RES_TYPE_DBR && peer->alloc_type != gds_peer::WQ && peer->alloc_type != gds_peer::CQ));
+
+        if (peer->alloc_type == gds_peer::WQ)
+                dir = GDS_PEER_DIRECTION_FROM_PEER | GDS_PEER_DIRECTION_TO_HCA;
+        else if (peer->alloc_type == gds_peer::CQ)
+                dir = GDS_PEER_DIRECTION_FROM_HCA | GDS_PEER_DIRECTION_TO_PEER | GDS_PEER_DIRECTION_TO_CPU;
+        else {
+                gds_dbg("encountered unsupported alloc_type\n");
+                return IBV_ALLOCATOR_USE_DEFAULT;
+        }
+
+        if (resource_type == MLX5DV_RES_TYPE_QP || resource_type == MLX5DV_RES_TYPE_SRQ || resource_type == MLX5DV_RES_TYPE_DBR || resource_type == MLX5DV_RES_TYPE_CQ) {
+                buf = peer->buf_alloc(peer->alloc_type, size, dir, (uint32_t)alignment, peer->alloc_flags);
+        }
+        else
+                gds_dbg("requested allocation with unsupported resource_type\n");
+
+        if (!buf) {
+                gds_dbg("alloc on host\n");
+                return IBV_ALLOCATOR_USE_DEFAULT;
+        }
+        else {
+                gds_dbg("alloc on GPU\n");
+                ptr = buf->addr;
+        }
+
+        if ((range_id = peer_attr->register_va(ptr, size, peer_attr->peer_id, buf)) == 0) {
+                gds_err("error in register_va\n");
+                peer->free(buf);
+                return IBV_ALLOCATOR_USE_DEFAULT;
+        }
+
+        // peer->opaque should be set
+        assert(peer->opaque);
+
+        if (peer->alloc_type == gds_peer::WQ) {
+                gds_mlx5_dv_qp_peer_t *mqp_peer = (gds_mlx5_dv_qp_peer_t *)peer->opaque;
+                if (resource_type == MLX5DV_RES_TYPE_QP || resource_type == MLX5DV_RES_TYPE_SRQ) {
+                        // BUG: Will be overridden if used with IBV_QPT_RAW_PACKET or MLX5_QP_FLAGS_USE_UNDERLAY
+                        mqp_peer->wq.va_id = range_id;
+                        mqp_peer->wq.size = size;
+                        mqp_peer->wq.gbuf = buf;
+                }
+                else if (resource_type == MLX5DV_RES_TYPE_DBR) {
+                        mqp_peer->dbr.va_id = range_id;
+                        mqp_peer->dbr.size = size;
+                        mqp_peer->dbr.gbuf = buf;
+                }
+                else
+                        gds_err("Unsupported resource_type\n");
+        }
+        else if (peer->alloc_type == gds_peer::CQ) {
+                gds_mlx5_dv_cq_peer_t *mcq_peer = (gds_mlx5_dv_cq_peer_t *)peer->opaque;
+                if (resource_type == MLX5DV_RES_TYPE_CQ) {
+                        mcq_peer->buf.va_id = range_id;
+                        mcq_peer->buf.size = size;
+                        mcq_peer->buf.gbuf = buf;
+                }
+                else if (resource_type == MLX5DV_RES_TYPE_DBR) {
+                        mcq_peer->dbr.va_id = range_id;
+                        mcq_peer->dbr.size = size;
+                        mcq_peer->dbr.gbuf = buf;
+                }
+                else
+                        gds_err("Unsupported resource_type\n");
+        }
+        else
+                gds_err("Unsupported peer->alloc_type\n");
+
+        return ptr;
+}
+
+//-----------------------------------------------------------------------------
+
+static void pd_mem_free(struct ibv_pd *pd, void *pd_context, void *ptr,
+                uint64_t resource_type)
+{
+
gds_dbg("pd_mem_free: pd=%p, pd_context=%p, ptr=%p, resource_type=0x%lx\n", + pd, pd_context, ptr, resource_type); + + assert(pd_context); + + gds_peer_attr *peer_attr = (gds_peer_attr *)pd_context; + gds_peer *peer = peer_from_id(peer_attr->peer_id); + + // Prevent incorrect setting of alloc type + assert(!(resource_type == MLX5DV_RES_TYPE_QP && peer->alloc_type != gds_peer::WQ)); + assert(!(resource_type == MLX5DV_RES_TYPE_CQ && peer->alloc_type != gds_peer::CQ)); + assert(!(resource_type == MLX5DV_RES_TYPE_DBR && peer->alloc_type != gds_peer::WQ && peer->alloc_type != gds_peer::CQ)); + + assert(peer->opaque); + + if (peer->alloc_type == gds_peer::WQ) { + gds_mlx5_dv_qp_peer_t *mqp_peer = (gds_mlx5_dv_qp_peer_t *)peer->opaque; + if (resource_type == MLX5DV_RES_TYPE_QP && mqp_peer->wq.gbuf) { + if (mqp_peer->wq.va_id) { + peer_attr->unregister_va(mqp_peer->wq.va_id, peer_attr->peer_id); + mqp_peer->wq.va_id = 0; + } + peer->free(mqp_peer->wq.gbuf); + mqp_peer->wq.gbuf = NULL; + } + else if (resource_type == MLX5DV_RES_TYPE_DBR && mqp_peer->dbr.gbuf) { + if (mqp_peer->dbr.va_id) { + peer_attr->unregister_va(mqp_peer->dbr.va_id, peer_attr->peer_id); + mqp_peer->dbr.va_id = 0; + } + peer->free(mqp_peer->dbr.gbuf); + mqp_peer->dbr.gbuf = NULL; + } + } + else if (peer->alloc_type == gds_peer::CQ) { + gds_mlx5_dv_cq_peer_t *mcq_peer = (gds_mlx5_dv_cq_peer_t *)peer->opaque; + if (resource_type == MLX5DV_RES_TYPE_CQ && mcq_peer->buf.gbuf) { + if (mcq_peer->buf.va_id) { + peer_attr->unregister_va(mcq_peer->buf.va_id, peer_attr->peer_id); + mcq_peer->buf.va_id = 0; + } + peer->free(mcq_peer->buf.gbuf); + mcq_peer->buf.gbuf = NULL; + } + else if (resource_type == MLX5DV_RES_TYPE_DBR && mcq_peer->dbr.gbuf) { + if (mcq_peer->dbr.va_id) { + peer_attr->unregister_va(mcq_peer->dbr.va_id, peer_attr->peer_id); + mcq_peer->dbr.va_id = 0; + } + peer->free(mcq_peer->dbr.gbuf); + mcq_peer->dbr.gbuf = NULL; + } + } +} + +//----------------------------------------------------------------------------- + +static int gds_mlx5_dv_alloc_parent_domain(struct ibv_pd *p_pd, struct ibv_context *ibctx, gds_peer_attr *peer_attr, struct ibv_pd **out_pd) +{ + int ret = 0; + + struct ibv_parent_domain_init_attr pd_init_attr; + struct ibv_pd *pd = NULL; + gds_peer *peer = peer_from_id(peer_attr->peer_id); + + memset(&pd_init_attr, 0, sizeof(ibv_parent_domain_init_attr)); + pd_init_attr.pd = p_pd; + pd_init_attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS | IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT; + pd_init_attr.alloc = pd_mem_alloc; + pd_init_attr.free = pd_mem_free; + pd_init_attr.pd_context = peer_attr; + + pd = ibv_alloc_parent_domain(ibctx, &pd_init_attr); + if (!pd) { + gds_err("error in ibv_alloc_parent_domain\n"); + ret = EINVAL; + goto out; + } + + *out_pd = pd; + +out: + return ret; +} + +//----------------------------------------------------------------------------- + int gds_mlx5_dv_create_qp( struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, gds_peer *peer, gds_peer_attr *peer_attr, int flags, gds_qp_t **gqp @@ -315,6 +501,8 @@ int gds_mlx5_dv_create_qp( gds_mlx5_dv_qp_t *mdqp = NULL; struct ibv_qp *ibqp = NULL; + struct ibv_pd *parent_domain = NULL; + gds_mlx5_dv_cq_t *tx_mcq = NULL; gds_mlx5_dv_cq_t *rx_mcq = NULL; @@ -417,8 +605,14 @@ int gds_mlx5_dv_create_qp( goto out; } + status = gds_mlx5_dv_alloc_parent_domain(pd, context, peer_attr, &parent_domain); + if (status) { + gds_err("Error in gds_mlx5_dv_alloc_parent_domain\n"); + goto out; + } + status = gds_mlx5_dv_create_cq( - 
context, qp_attr->cap.max_send_wr, NULL, NULL, 0, pd, peer_attr, + context, qp_attr->cap.max_send_wr, NULL, NULL, 0, parent_domain, peer_attr, (gds_alloc_cq_flags_t)((flags & GDS_CREATE_QP_TX_CQ_ON_GPU) ? (GDS_ALLOC_CQ_ON_GPU | GDS_ALLOC_CQ_DBREC_ON_GPU) : (GDS_ALLOC_CQ_DEFAULT | GDS_ALLOC_CQ_DBREC_DEFAULT)), &tx_mcq ); @@ -428,7 +622,7 @@ int gds_mlx5_dv_create_qp( } status = gds_mlx5_dv_create_cq( - context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, pd, peer_attr, + context, qp_attr->cap.max_recv_wr, NULL, NULL, 0, parent_domain, peer_attr, (gds_alloc_cq_flags_t)((flags & GDS_CREATE_QP_RX_CQ_ON_GPU) ? (GDS_ALLOC_CQ_ON_GPU | GDS_ALLOC_CQ_DBREC_ON_GPU) : (GDS_ALLOC_CQ_DEFAULT | GDS_ALLOC_CQ_DBREC_DEFAULT)), &rx_mcq ); @@ -519,9 +713,9 @@ int gds_mlx5_dv_create_qp( goto out; } - wq_umem = mlx5dv_devx_umem_reg(context, wq_buf->addr, wq_buf_size, 0); + wq_umem = mlx5dv_devx_umem_reg(context, wq_buf->addr, wq_buf_size, IBV_ACCESS_LOCAL_WRITE); if (!wq_umem) { - gds_err("Error in mlx5dv_devx_umem_regfor WQ\n"); + gds_err("Error in mlx5dv_devx_umem_reg for WQ\n"); status = ENOMEM; goto out; } @@ -544,7 +738,7 @@ int gds_mlx5_dv_create_qp( goto out; } - dbr_umem = mlx5dv_devx_umem_reg(context, dbr_buf->addr, dbr_buf_size, 0); + dbr_umem = mlx5dv_devx_umem_reg(context, dbr_buf->addr, dbr_buf_size, IBV_ACCESS_LOCAL_WRITE); if (!dbr_umem) { gds_err("Error in mlx5dv_devx_umem_reg for DBR\n"); status = ENOMEM; @@ -647,6 +841,8 @@ int gds_mlx5_dv_create_qp( mdqp->gqp.send_cq = &tx_mcq->gcq; mdqp->gqp.recv_cq = &rx_mcq->gcq; + mdqp->parent_domain = parent_domain; + ibqp->context = context; ibqp->pd = pd; ibqp->send_cq = tx_mcq->gcq.cq; @@ -703,6 +899,9 @@ int gds_mlx5_dv_create_qp( if (tx_mcq) gds_mlx5_dv_destroy_cq(tx_mcq); + if (parent_domain) + ibv_dealloc_pd(parent_domain); + if (ibqp) free(ibqp); @@ -1073,6 +1272,9 @@ int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp) if (mdqp->bf_uar) mlx5dv_devx_free_uar(mdqp->bf_uar); + if (mdqp->parent_domain) + ibv_dealloc_pd(mdqp->parent_domain); + if (mqp_peer) free(mqp_peer); diff --git a/src/transports/mlx5-dv/mlx5-dv.hpp b/src/transports/mlx5-dv/mlx5-dv.hpp index 6b63a23..fada0ce 100644 --- a/src/transports/mlx5-dv/mlx5-dv.hpp +++ b/src/transports/mlx5-dv/mlx5-dv.hpp @@ -182,6 +182,8 @@ typedef struct gds_mlx5_dv_qp { struct ibv_port_attr port_attr; gds_peer_attr *peer_attr; + + struct ibv_pd *parent_domain; } gds_mlx5_dv_qp_t; //----------------------------------------------------------------------------- From f0c5167b733d7fa7f8a8a08e24cad194dbb2bdbb Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Tue, 19 Oct 2021 00:03:40 -0700 Subject: [PATCH 45/50] Added UD support to QP creation with DEVX --- src/transports/mlx5-dv/mlx5-dv.cpp | 102 +++++----------------------- src/transports/mlx5-dv/mlx5-dv.hpp | 8 +-- tests/gds_kernel_loopback_latency.c | 5 -- 3 files changed, 20 insertions(+), 95 deletions(-) diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp index f5cc7f3..2306b56 100644 --- a/src/transports/mlx5-dv/mlx5-dv.cpp +++ b/src/transports/mlx5-dv/mlx5-dv.cpp @@ -540,12 +540,11 @@ int gds_mlx5_dv_create_qp( void *qpc; gds_mlx5_dv_qp_peer_t *mqp_peer = NULL; - struct ibv_srq *srq = NULL; - bool is_internal_srq = false; struct mlx5dv_devx_obj *devx_obj = NULL; uint32_t qpn; + uint32_t st_val; gds_mlx5_dv_qp_type_t gmlx_qpt = GDS_MLX5_DV_QP_TYPE_UNKNOWN; @@ -555,14 +554,14 @@ int gds_mlx5_dv_create_qp( assert(peer); assert(peer_attr); - srq = qp_attr->srq; - - if (qp_attr->qp_type == IBV_QPT_RC) + if (qp_attr->qp_type == 
IBV_QPT_RC) { gmlx_qpt = GDS_MLX5_DV_QP_TYPE_RC; - #if 0 - else if (qp_attr->qp_type == IBV_QPT_DRIVER) - gmlx_qpt = (qp_attr->dc_init_attr.dc_type == MLX5DV_DCTYPE_DCT) ? GDS_MLX5_QP_TYPE_DCT : GDS_MLX5_QP_TYPE_DCI; - #endif + st_val = GDS_MLX5_DV_QPC_ST_RC; + } + else if (qp_attr->qp_type == IBV_QPT_UD) { + gmlx_qpt = GDS_MLX5_DV_QP_TYPE_UD; + st_val = GDS_MLX5_DV_QPC_ST_UD; + } if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_UNKNOWN) { gds_err("The requested QP type is not supported.\n"); @@ -570,12 +569,6 @@ int gds_mlx5_dv_create_qp( goto out; } - if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_DCT) { - gds_err("DCT QP type is not supported.\n"); - status = EINVAL; - goto out; - } - if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) { if (qp_attr->cap.max_send_sge != 1 || qp_attr->cap.max_recv_sge != 1) { gds_err("Both cap.max_send_sge and cap.max_recv_sge must be 1.\n"); @@ -583,13 +576,6 @@ int gds_mlx5_dv_create_qp( goto out; } } - else if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_DCI) { - if (qp_attr->cap.max_send_sge != 1) { - gds_err("cap.max_send_sge must be 1.\n"); - status = EINVAL; - goto out; - } - } mdqp = (gds_mlx5_dv_qp_t *)calloc(1, sizeof(gds_mlx5_dv_qp_t)); if (!mdqp) { @@ -643,36 +629,6 @@ int gds_mlx5_dv_create_qp( goto out; } - srq = qp_attr->srq; - if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_DCI && !srq) { - struct ibv_srq_init_attr srq_init_attr = {0,}; - srq_init_attr.attr.max_wr = qp_attr->cap.max_recv_wr; - srq_init_attr.attr.max_sge = qp_attr->cap.max_recv_sge; - - mqp_peer = (gds_mlx5_dv_qp_peer_t *)calloc(1, sizeof(gds_mlx5_dv_qp_peer_t)); - if (!mqp_peer) { - gds_err("Cannot allocate memory for mqp_peer.\n"); - status = ENOMEM; - goto out; - } - mqp_peer->peer_attr = peer_attr; - - peer->alloc_type = gds_peer::WQ; - peer->alloc_flags = flags; - // mqp_peer will be filled if we do allocation on device. - // pd_mem_alloc is responsible for the registration. - peer->opaque = mqp_peer; - - srq = ibv_create_srq(pd, &srq_init_attr); - if (!srq) { - status = errno; - gds_err("Error in ibv_create_srq with errno=%d.\n", errno); - goto out; - } - qp_attr->srq = srq; - is_internal_srq = true; - } - log_bf_reg_size = DEVX_GET(query_hca_cap_out, cmd_cap_out, capability.cmd_hca_cap.log_bf_reg_size); // The size of 1st + 2nd half (as when we use alternating DB) @@ -701,7 +657,7 @@ int gds_mlx5_dv_create_qp( // In GPUVerbs, we use at most 4 16-byte elements. wqe_size = MLX5_SEND_WQE_BB; // 64 bytes max_tx = GDS_ROUND_UP_POW2_OR_0(qp_attr->cap.max_send_wr); - max_rx = (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) ? GDS_ROUND_UP_POW2_OR_0(qp_attr->cap.max_recv_wr) : 0; + max_rx = GDS_ROUND_UP_POW2_OR_0(qp_attr->cap.max_recv_wr); wq_buf_size = (max_tx + max_rx) * wqe_size; // Allocate WQ buffer. @@ -756,12 +712,6 @@ int gds_mlx5_dv_create_qp( dv_obj.pd.in = pd; dv_obj.pd.out = &dvpd; dv_obj_type = MLX5DV_OBJ_PD; - if (srq) { - dv_obj.srq.in = srq; - dv_obj.srq.out = &mdqp->dvsrq; - mdqp->dvsrq.comp_mask = MLX5DV_SRQ_MASK_SRQN; - dv_obj_type |= MLX5DV_OBJ_SRQ; - } status = mlx5dv_init_obj(&dv_obj, dv_obj_type); if (status) { gds_err("Error in mlx5dv_init_obj\n"); @@ -772,35 +722,26 @@ int gds_mlx5_dv_create_qp( DEVX_SET(create_qp_in, cmd_in, wq_umem_id, wq_umem->umem_id); // WQ buffer qpc = DEVX_ADDR_OF(create_qp_in, cmd_in, qpc); - DEVX_SET(qpc, qpc, st, (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) ? 
GDS_MLX5_DV_QPC_ST_RC : GDS_MLX5_DV_QPC_ST_DCI); + DEVX_SET(qpc, qpc, st, st_val); DEVX_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); DEVX_SET(qpc, qpc, pd, dvpd.pdn); DEVX_SET(qpc, qpc, uar_page, uar->page_id); // BF register - if (srq) { - if (!(mdqp->dvsrq.comp_mask & MLX5DV_SRQ_MASK_SRQN)) { - status = EIO; - gds_err("mlx5dv_init_obj does not return SRQ number!\n"); - goto out; - } - DEVX_SET(qpc, qpc, rq_type, GDS_MLX5_DV_QPC_RQ_TYPE_SRQ); - DEVX_SET(qpc, qpc, srqn_rmpn_xrqn, mdqp->dvsrq.srqn); - } - else { - DEVX_SET(qpc, qpc, rq_type, GDS_MLX5_DV_QPC_RQ_TYPE_REGULAR); - DEVX_SET(qpc, qpc, srqn_rmpn_xrqn, 0); - } + DEVX_SET(qpc, qpc, rq_type, GDS_MLX5_DV_QPC_RQ_TYPE_REGULAR); + DEVX_SET(qpc, qpc, srqn_rmpn_xrqn, 0); DEVX_SET(qpc, qpc, cqn_snd, tx_mcq->dvcq.cqn); DEVX_SET(qpc, qpc, cqn_rcv, rx_mcq->dvcq.cqn); DEVX_SET(qpc, qpc, log_sq_size, GDS_ILOG2_OR0(max_tx)); - DEVX_SET(qpc, qpc, cs_req, 0); // Disable CS Request + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) + DEVX_SET(qpc, qpc, cs_req, 0); // Disable CS Request DEVX_SET(qpc, qpc, cs_res, 0); // Disable CS Respond + if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_UD) + DEVX_SET(qpc, qpc, cgs, 0); // GRH is always scattered to the beginning of the receive buffer. DEVX_SET(qpc, qpc, dbr_umem_valid, 0x1); // Enable dbr_umem_id DEVX_SET64(qpc, qpc, dbr_addr, 0); // Offset 0 of dbr_umem_id (behavior changed because of dbr_umem_valid) DEVX_SET(qpc, qpc, dbr_umem_id, dbr_umem->umem_id); // DBR buffer DEVX_SET(qpc, qpc, user_index, 0); DEVX_SET(qpc, qpc, page_offset, 0); - if (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) - DEVX_SET(qpc, qpc, log_rq_size, GDS_ILOG2_OR0(max_rx)); + DEVX_SET(qpc, qpc, log_rq_size, GDS_ILOG2_OR0(max_rx)); devx_obj = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out)); if (!devx_obj) { @@ -814,7 +755,6 @@ int gds_mlx5_dv_create_qp( mdqp->devx_qp = devx_obj; mdqp->qp_type = gmlx_qpt; - mdqp->is_internal_srq = is_internal_srq; mdqp->qp_peer = mqp_peer; mdqp->wq_buf = wq_buf; @@ -847,8 +787,6 @@ int gds_mlx5_dv_create_qp( ibqp->pd = pd; ibqp->send_cq = tx_mcq->gcq.cq; ibqp->recv_cq = rx_mcq->gcq.cq; - if (srq) - ibqp->srq = srq; ibqp->qp_num = qpn; ibqp->state = IBV_QPS_RESET; ibqp->qp_type = qp_attr->qp_type; @@ -887,9 +825,6 @@ int gds_mlx5_dv_create_qp( if (uar) mlx5dv_devx_free_uar(uar); - if (is_internal_srq && srq) - ibv_destroy_srq(srq); - if (mqp_peer) free(mqp_peer); @@ -1225,9 +1160,6 @@ int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp) if (status) gds_err("Error in mlx5dv_devx_obj_destroy for QP.\n"); - if (mdqp->is_internal_srq && mdqp->gqp.qp->srq) - ibv_destroy_srq(mdqp->gqp.qp->srq); - if (mdqp->gqp.send_cq) { gds_mlx5_dv_destroy_cq(to_gds_mdv_cq(mdqp->gqp.send_cq)); mdqp->gqp.send_cq = NULL; diff --git a/src/transports/mlx5-dv/mlx5-dv.hpp b/src/transports/mlx5-dv/mlx5-dv.hpp index fada0ce..fd8b362 100644 --- a/src/transports/mlx5-dv/mlx5-dv.hpp +++ b/src/transports/mlx5-dv/mlx5-dv.hpp @@ -60,7 +60,8 @@ enum { GDS_MLX5_DV_QPC_ST_RC = 0x0, - GDS_MLX5_DV_QPC_ST_DCI = 0x5 + GDS_MLX5_DV_QPC_ST_UC = 0x1, + GDS_MLX5_DV_QPC_ST_UD = 0x2 }; enum { @@ -144,8 +145,7 @@ typedef struct gds_mlx5_dv_qp_peer { typedef enum gds_mlx5_dv_qp_type { GDS_MLX5_DV_QP_TYPE_UNKNOWN = 0, GDS_MLX5_DV_QP_TYPE_RC, - GDS_MLX5_DV_QP_TYPE_DCT, - GDS_MLX5_DV_QP_TYPE_DCI + GDS_MLX5_DV_QP_TYPE_UD } gds_mlx5_dv_qp_type_t; typedef struct gds_mlx5_dv_qp { @@ -154,8 +154,6 @@ typedef struct gds_mlx5_dv_qp { struct mlx5dv_devx_obj *devx_qp; - bool is_internal_srq; - struct mlx5dv_srq dvsrq; gds_mlx5_dv_qp_peer_t *qp_peer; uint8_t sl; 
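
For context: after this patch, QP creation through the DEVX path recognizes exactly two service types. IBV_QPT_RC maps to GDS_MLX5_DV_QPC_ST_RC (0x0) and IBV_QPT_UD maps to GDS_MLX5_DV_QPC_ST_UD (0x2); any other qp_type is rejected with EINVAL before the QPC is built. Below is a minimal caller-side sketch in the spirit of tests/gds_kernel_loopback_latency.c; the pd, context, gpu_id, and gds_flags variables are assumed to be initialized as in that test, and the queue sizes are illustrative only.

        /* Sketch: request a UD QP on the DEVX transport. */
        gds_qp_init_attr_t attr;
        memset(&attr, 0, sizeof(attr));
        attr.cap.max_send_wr  = 128;    /* rounded up to a power of two internally */
        attr.cap.max_recv_wr  = 128;
        attr.cap.max_send_sge = 1;      /* the RC path requires exactly 1 SGE per WR */
        attr.cap.max_recv_sge = 1;
        attr.qp_type = IBV_QPT_UD;      /* selects GDS_MLX5_DV_QPC_ST_UD in the QPC */

        struct gds_qp *gqp = gds_create_qp(pd, context, &attr, gpu_id, gds_flags);
        if (!gqp)
                fprintf(stderr, "gds_create_qp failed\n");
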
diff --git a/tests/gds_kernel_loopback_latency.c b/tests/gds_kernel_loopback_latency.c index 0dd2ff5..8f95830 100644 --- a/tests/gds_kernel_loopback_latency.c +++ b/tests/gds_kernel_loopback_latency.c @@ -334,11 +334,6 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, .qp_type = IBV_QPT_UD, }; - //why? - if (my_rank == 1) { - printf("sleeping 2s\n"); - sleep(2); - } ctx->gds_qp = gds_create_qp(ctx->pd, ctx->context, &attr, gpu_id, gds_flags); if (!ctx->gds_qp) { From b99b28c5a7fcec4a9126c2a5759d074813e4649d Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 22 Oct 2021 00:14:29 -0700 Subject: [PATCH 46/50] Implemented gds_mlx5_dv_post_recv --- src/transport.hpp | 2 + src/transports/mlx5-dv/mlx5-dv.cpp | 201 +++++++++++++++++------------ src/transports/mlx5-dv/mlx5-dv.hpp | 34 +---- src/utils.hpp | 12 ++ 4 files changed, 140 insertions(+), 109 deletions(-) diff --git a/src/transport.hpp b/src/transport.hpp index 9f857fe..8c649f4 100644 --- a/src/transport.hpp +++ b/src/transport.hpp @@ -56,6 +56,8 @@ typedef struct gds_transport { int (*prepare_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request, int flags); int (*append_wait_cq)(gds_wait_request_t *request, uint32_t *dw, uint32_t val); int (*abort_wait_cq)(gds_cq_t *gcq, gds_wait_request_t *request); + + int (*post_recv)(gds_qp_t *gqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); } gds_transport_t; extern gds_transport_t *gds_main_transport; diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp index 2306b56..be0bd95 100644 --- a/src/transports/mlx5-dv/mlx5-dv.cpp +++ b/src/transports/mlx5-dv/mlx5-dv.cpp @@ -254,9 +254,6 @@ static void gds_mlx5_dv_destroy_cq(gds_mlx5_dv_cq_t *mcq) int status = 0; gds_mlx5_dv_cq_peer_t *mcq_peer = mcq->cq_peer; - if (mcq->wq) - mcq->wq = NULL; - if (mcq_peer && mcq_peer->pdata.peek_table) { free(mcq_peer->pdata.peek_table); mcq_peer->pdata.peek_table = NULL; @@ -325,20 +322,17 @@ static void *pd_mem_alloc(struct ibv_pd *pd, void *pd_context, size_t size, pd, pd_context, size, alignment, resource_type); // Prevent incorrect setting of alloc type - assert(!((resource_type == MLX5DV_RES_TYPE_QP || resource_type == MLX5DV_RES_TYPE_SRQ) && peer->alloc_type != gds_peer::WQ)); assert(!(resource_type == MLX5DV_RES_TYPE_CQ && peer->alloc_type != gds_peer::CQ)); - assert(!(resource_type == MLX5DV_RES_TYPE_DBR && peer->alloc_type != gds_peer::WQ && peer->alloc_type != gds_peer::CQ)); + assert(!(resource_type == MLX5DV_RES_TYPE_DBR && peer->alloc_type != gds_peer::CQ)); - if (peer->alloc_type == gds_peer::WQ) - dir = GDS_PEER_DIRECTION_FROM_PEER | GDS_PEER_DIRECTION_TO_HCA; - else if (peer->alloc_type == gds_peer::CQ) + if (peer->alloc_type == gds_peer::CQ) dir = GDS_PEER_DIRECTION_FROM_HCA | GDS_PEER_DIRECTION_TO_PEER | GDS_PEER_DIRECTION_TO_CPU; else { gds_dbg("encountered unsupported alloc_type\n"); return IBV_ALLOCATOR_USE_DEFAULT; } - if (resource_type == MLX5DV_RES_TYPE_QP || resource_type == MLX5DV_RES_TYPE_SRQ || resource_type == MLX5DV_RES_TYPE_DBR || resource_type == MLX5DV_RES_TYPE_CQ) { + if (resource_type == MLX5DV_RES_TYPE_DBR || resource_type == MLX5DV_RES_TYPE_CQ) { buf = peer->buf_alloc(peer->alloc_type, size, dir, (uint32_t)alignment, peer->alloc_flags); } else @@ -362,23 +356,7 @@ static void *pd_mem_alloc(struct ibv_pd *pd, void *pd_context, size_t size, // peer->opaque should be set assert(peer->opaque); - if (peer->alloc_type == gds_peer::WQ) { - gds_mlx5_dv_qp_peer_t *mqp_peer = (gds_mlx5_dv_qp_peer_t *)peer->opaque; - 
if (resource_type == MLX5DV_RES_TYPE_QP || resource_type == MLX5DV_RES_TYPE_SRQ) { - // BUG: Will be overrided if use with IBV_QPT_RAW_PACKET or MLX5_QP_FLAGS_USE_UNDERLAY - mqp_peer->wq.va_id = range_id; - mqp_peer->wq.size = size; - mqp_peer->wq.gbuf = buf; - } - else if (resource_type == MLX5DV_RES_TYPE_DBR) { - mqp_peer->dbr.va_id = range_id; - mqp_peer->dbr.size = size; - mqp_peer->dbr.gbuf = buf; - } - else - gds_err("Unsupported resource_type\n"); - } - else if (peer->alloc_type == gds_peer::CQ) { + if (peer->alloc_type == gds_peer::CQ) { gds_mlx5_dv_cq_peer_t *mcq_peer = (gds_mlx5_dv_cq_peer_t *)peer->opaque; if (resource_type == MLX5DV_RES_TYPE_CQ) { mcq_peer->buf.va_id = range_id; @@ -413,32 +391,12 @@ static void pd_mem_free(struct ibv_pd *pd, void *pd_context, void *ptr, gds_peer *peer = peer_from_id(peer_attr->peer_id); // Prevent incorrect setting of alloc type - assert(!(resource_type == MLX5DV_RES_TYPE_QP && peer->alloc_type != gds_peer::WQ)); assert(!(resource_type == MLX5DV_RES_TYPE_CQ && peer->alloc_type != gds_peer::CQ)); - assert(!(resource_type == MLX5DV_RES_TYPE_DBR && peer->alloc_type != gds_peer::WQ && peer->alloc_type != gds_peer::CQ)); + assert(!(resource_type == MLX5DV_RES_TYPE_DBR && peer->alloc_type != gds_peer::CQ)); assert(peer->opaque); - if (peer->alloc_type == gds_peer::WQ) { - gds_mlx5_dv_qp_peer_t *mqp_peer = (gds_mlx5_dv_qp_peer_t *)peer->opaque; - if (resource_type == MLX5DV_RES_TYPE_QP && mqp_peer->wq.gbuf) { - if (mqp_peer->wq.va_id) { - peer_attr->unregister_va(mqp_peer->wq.va_id, peer_attr->peer_id); - mqp_peer->wq.va_id = 0; - } - peer->free(mqp_peer->wq.gbuf); - mqp_peer->wq.gbuf = NULL; - } - else if (resource_type == MLX5DV_RES_TYPE_DBR && mqp_peer->dbr.gbuf) { - if (mqp_peer->dbr.va_id) { - peer_attr->unregister_va(mqp_peer->dbr.va_id, peer_attr->peer_id); - mqp_peer->dbr.va_id = 0; - } - peer->free(mqp_peer->dbr.gbuf); - mqp_peer->dbr.gbuf = NULL; - } - } - else if (peer->alloc_type == gds_peer::CQ) { + if (peer->alloc_type == gds_peer::CQ) { gds_mlx5_dv_cq_peer_t *mcq_peer = (gds_mlx5_dv_cq_peer_t *)peer->opaque; if (resource_type == MLX5DV_RES_TYPE_CQ && mcq_peer->buf.gbuf) { if (mcq_peer->buf.va_id) { @@ -513,8 +471,8 @@ int gds_mlx5_dv_create_qp( uint8_t log_bf_reg_size; size_t bf_reg_size; - int max_tx; - int max_rx; + unsigned int max_tx; + unsigned int max_rx; size_t wqe_size; struct mlx5dv_devx_umem *wq_umem = NULL; @@ -539,13 +497,16 @@ int gds_mlx5_dv_create_qp( void *qpc; - gds_mlx5_dv_qp_peer_t *mqp_peer = NULL; - struct mlx5dv_devx_obj *devx_obj = NULL; uint32_t qpn; uint32_t st_val; + off_t sq_buf_offset; + + uint64_t *sq_wrid = NULL; + uint64_t *rq_wrid = NULL; + gds_mlx5_dv_qp_type_t gmlx_qpt = GDS_MLX5_DV_QP_TYPE_UNKNOWN; assert(pd); @@ -660,6 +621,27 @@ int gds_mlx5_dv_create_qp( max_rx = GDS_ROUND_UP_POW2_OR_0(qp_attr->cap.max_recv_wr); wq_buf_size = (max_tx + max_rx) * wqe_size; + // Assume 1 recv sge and no wq_sig + sq_buf_offset = MAX(max_rx * sizeof(struct mlx5_wqe_data_seg), GDS_MLX5_DV_SEND_WQE_BB); + + if (max_tx > 0) { + sq_wrid = (uint64_t *)malloc(sizeof(uint64_t) * max_tx); + if (!sq_wrid) { + gds_err("Error in malloc for sq_wrid\n"); + status = ENOMEM; + goto out; + } + } + + if (max_rx > 0) { + rq_wrid = (uint64_t *)malloc(sizeof(uint64_t) * max_rx); + if (!rq_wrid) { + gds_err("Error in malloc for rq_wrid\n"); + status = ENOMEM; + goto out; + } + } + // Allocate WQ buffer. alignment = (uint32_t)((flags & GDS_ALLOC_WQ_ON_GPU) ? 
GDS_GPU_PAGE_SIZE : sysconf(_SC_PAGESIZE)); wq_buf = peer->alloc(wq_buf_size, alignment, (flags & GDS_ALLOC_WQ_ON_GPU) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST); @@ -669,14 +651,14 @@ int gds_mlx5_dv_create_qp( goto out; } - wq_umem = mlx5dv_devx_umem_reg(context, wq_buf->addr, wq_buf_size, IBV_ACCESS_LOCAL_WRITE); + wq_umem = mlx5dv_devx_umem_reg(context, (flags & GDS_ALLOC_WQ_ON_GPU) ? (void *)wq_buf->peer_addr : wq_buf->addr, wq_buf_size, IBV_ACCESS_LOCAL_WRITE); if (!wq_umem) { gds_err("Error in mlx5dv_devx_umem_reg for WQ\n"); status = ENOMEM; goto out; } - wq_buf_range_id = peer_attr->register_va(wq_buf->addr, wq_buf_size, peer_attr->peer_id, wq_buf); + wq_buf_range_id = peer_attr->register_va((flags & GDS_ALLOC_WQ_ON_GPU) ? (void *)wq_buf->peer_addr : wq_buf->addr, wq_buf_size, peer_attr->peer_id, wq_buf); if (!wq_buf_range_id) { gds_err("Error in peer_attr->register_va for WQ\n"); status = ENOMEM; @@ -694,14 +676,14 @@ int gds_mlx5_dv_create_qp( goto out; } - dbr_umem = mlx5dv_devx_umem_reg(context, dbr_buf->addr, dbr_buf_size, IBV_ACCESS_LOCAL_WRITE); + dbr_umem = mlx5dv_devx_umem_reg(context, (flags & GDS_ALLOC_WQ_DBREC_ON_GPU) ? (void *)dbr_buf->peer_addr : dbr_buf->addr, dbr_buf_size, IBV_ACCESS_LOCAL_WRITE); if (!dbr_umem) { gds_err("Error in mlx5dv_devx_umem_reg for DBR\n"); status = ENOMEM; goto out; } - dbr_buf_range_id = peer_attr->register_va(dbr_buf->addr, dbr_buf_size, peer_attr->peer_id, dbr_buf); + dbr_buf_range_id = peer_attr->register_va((flags & GDS_ALLOC_WQ_DBREC_ON_GPU) ? (void *)dbr_buf->peer_addr : dbr_buf->addr, dbr_buf_size, peer_attr->peer_id, dbr_buf); if (!dbr_buf_range_id) { gds_err("Error in peer_attr->register_va for DBR\n"); status = ENOMEM; @@ -755,8 +737,6 @@ int gds_mlx5_dv_create_qp( mdqp->devx_qp = devx_obj; mdqp->qp_type = gmlx_qpt; - mdqp->qp_peer = mqp_peer; - mdqp->wq_buf = wq_buf; mdqp->wq_umem = wq_umem; mdqp->wq_va_id = wq_buf_range_id; @@ -769,12 +749,20 @@ int gds_mlx5_dv_create_qp( mdqp->bf_size = bf_reg_size / 2; mdqp->bf_va_id = uar_range_id; - mdqp->sq_cnt = max_tx; - mdqp->rq_cnt = max_rx; - mdqp->rq_buf_offset = 0; - // Assume 1 recv sge if RC and no wq_sig - mdqp->sq_buf_offset = (gmlx_qpt == GDS_MLX5_DV_QP_TYPE_RC) ? 
MAX(max_rx * sizeof(struct mlx5_wqe_data_seg), GDS_MLX5_DV_SEND_WQE_BB) : 0;
+        mdqp->sq_buf_offset = sq_buf_offset;
+
+        mdqp->sq_wq.wrid = sq_wrid;
+        mdqp->sq_wq.buf = (void *)((uintptr_t)wq_buf->addr + sq_buf_offset);
+        mdqp->sq_wq.cnt = max_tx;
+        mdqp->sq_wq.dbrec = (__be32 *)((uintptr_t)dbr_buf->addr + sizeof(__be32)); // SQ counter is the second doorbell record
+        tx_mcq->wq = &mdqp->sq_wq;
+
+        mdqp->rq_wq.wrid = rq_wrid;
+        mdqp->rq_wq.buf = wq_buf->addr;
+        mdqp->rq_wq.cnt = max_rx;
+        mdqp->rq_wq.dbrec = (__be32 *)dbr_buf->addr; // RQ counter is the first doorbell record
+        rx_mcq->wq = &mdqp->rq_wq;

         mdqp->peer_attr = peer_attr;

@@ -819,15 +807,18 @@ int gds_mlx5_dv_create_qp(
         if (wq_buf)
                 peer->free(wq_buf);

+        if (rq_wrid)
+                free(rq_wrid);
+
+        if (sq_wrid)
+                free(sq_wrid);
+
         if (uar_range_id)
                 peer_attr->unregister_va(uar_range_id, peer_attr->peer_id);

         if (uar)
                 mlx5dv_devx_free_uar(uar);

-        if (mqp_peer)
-                free(mqp_peer);
-
         if (rx_mcq)
                 gds_mlx5_dv_destroy_cq(rx_mcq);

@@ -1136,7 +1127,6 @@ int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp)

         gds_mlx5_dv_qp_t *mdqp;
         gds_peer *peer = NULL;
-        gds_mlx5_dv_qp_peer_t *mqp_peer;

         if (!gqp)
                 return status;
@@ -1145,17 +1135,6 @@ int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp)

         assert(mdqp->devx_qp);

-        mqp_peer = mdqp->qp_peer;
-
-        if (mqp_peer) {
-                gds_peer_attr *peer_attr = mqp_peer->peer_attr;
-                gds_peer *peer = peer_from_id(peer_attr->peer_id);
-
-                // This may be used by ibv_destroy_qp, which eventually calls pd_mem_free.
-                peer->alloc_type = gds_peer::WQ;
-                peer->opaque = mqp_peer;
-        }
-
         status = mlx5dv_devx_obj_destroy(mdqp->devx_qp);
         if (status)
                 gds_err("Error in mlx5dv_devx_obj_destroy for QP.\n");
@@ -1201,15 +1180,18 @@ int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp)
                 }
         }

+        if (mdqp->rq_wq.wrid)
+                free(mdqp->rq_wq.wrid);
+
+        if (mdqp->sq_wq.wrid)
+                free(mdqp->sq_wq.wrid);
+
         if (mdqp->bf_uar)
                 mlx5dv_devx_free_uar(mdqp->bf_uar);

         if (mdqp->parent_domain)
                 ibv_dealloc_pd(mdqp->parent_domain);

-        if (mqp_peer)
-                free(mqp_peer);
-
         if (mdqp->gqp.qp)
                 free(mdqp->gqp.qp);

@@ -1218,6 +1200,61 @@ int gds_mlx5_dv_destroy_qp(gds_qp_t *gqp)

 //-----------------------------------------------------------------------------

+int gds_mlx5_dv_post_recv(gds_qp_t *gqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr)
+{
+        int status = 0;
+        gds_mlx5_dv_qp_t *mdqp;
+        struct ibv_recv_wr *curr_wr = NULL;
+        uint64_t head, tail;
+        unsigned int cnt;
+
+        assert(gqp);
+        assert(wr);
+        assert(bad_wr);
+
+        mdqp = to_gds_mdv_qp(gqp);
+
+        assert(mdqp->rq_wq.head >= mdqp->rq_wq.tail);
+
+        curr_wr = wr;
+        head = mdqp->rq_wq.head;
+        tail = mdqp->rq_wq.tail;
+        cnt = mdqp->rq_wq.cnt;
+        while (curr_wr) {
+                struct mlx5_wqe_data_seg *seg;
+                uint16_t idx;
+                if (curr_wr->num_sge != 1 || !curr_wr->sg_list) {
+                        *bad_wr = curr_wr;
+                        status = EINVAL;
+                        gds_err("num_sge must be 1.\n");
+                        goto out;
+                }
+                if (head - tail >= cnt) {
+                        *bad_wr = curr_wr;
+                        status = ENOMEM;
+                        gds_err("No rx credit available.\n");
+                        goto out;
+                }
+                idx = head & (cnt - 1);
+                seg = (struct mlx5_wqe_data_seg *)((uintptr_t)mdqp->rq_wq.buf + (idx << GDS_MLX5_DV_RECV_WQE_SHIFT));
+                mlx5dv_set_data_seg(seg, curr_wr->sg_list->length, curr_wr->sg_list->lkey, curr_wr->sg_list->addr);
+                mdqp->rq_wq.wrid[idx] = curr_wr->wr_id;
+
+                ++head;
+
+                curr_wr = curr_wr->next; // advance to the next WR in the chain
+        }
+
+        mdqp->rq_wq.head = head; // publish the new producer index for subsequent posts
+        wmb();
+
+        WRITE_ONCE(*mdqp->rq_wq.dbrec, htobe32(head & 0xffff));
+
+out:
+        return status;
+}
+
+//-----------------------------------------------------------------------------
+
 int gds_transport_mlx5_dv_init(gds_transport_t **transport)
 {
         int status = 0;
@@ -1231,6 +1268,8 @@ int gds_transport_mlx5_dv_init(gds_transport_t **transport)

         t->create_qp = 
gds_mlx5_dv_create_qp; t->destroy_qp = gds_mlx5_dv_destroy_qp; t->modify_qp = gds_mlx5_dv_modify_qp; + + t->post_recv = gds_mlx5_dv_post_recv; #if 0 t->rollback_qp = gds_mlx5_exp_rollback_qp; diff --git a/src/transports/mlx5-dv/mlx5-dv.hpp b/src/transports/mlx5-dv/mlx5-dv.hpp index fd8b362..fd6788e 100644 --- a/src/transports/mlx5-dv/mlx5-dv.hpp +++ b/src/transports/mlx5-dv/mlx5-dv.hpp @@ -107,8 +107,9 @@ typedef struct gds_mlx5_dv_cq_peer { typedef struct gds_mlx5_dv_wq { uint64_t *wrid; - uint64_t *wqe_head; - unsigned int wqe_cnt; + void *buf; // SQ and RQ point to different regions. + __be32 *dbrec; + unsigned int cnt; uint64_t head; uint64_t tail; } gds_mlx5_dv_wq_t; @@ -121,27 +122,6 @@ typedef struct gds_mlx5_dv_cq { gds_mlx5_dv_cq_peer_t *cq_peer; } gds_mlx5_dv_cq_t; -typedef struct gds_mlx5_dv_qp_peer { - gds_peer_attr *peer_attr; - uint32_t scur_post; - - struct { - uint64_t va_id; - size_t size; - gds_buf *gbuf; - } wq; - - struct { - uint64_t va_id; - size_t size; - gds_buf *gbuf; - } dbr; - - struct { - uint64_t va_id; - } bf; -} gds_mlx5_dv_qp_peer_t; - typedef enum gds_mlx5_dv_qp_type { GDS_MLX5_DV_QP_TYPE_UNKNOWN = 0, GDS_MLX5_DV_QP_TYPE_RC, @@ -154,20 +134,18 @@ typedef struct gds_mlx5_dv_qp { struct mlx5dv_devx_obj *devx_qp; - gds_mlx5_dv_qp_peer_t *qp_peer; - uint8_t sl; gds_buf *wq_buf; struct mlx5dv_devx_umem *wq_umem; uint64_t wq_va_id; - size_t sq_cnt; - size_t rq_cnt; - off_t sq_buf_offset; off_t rq_buf_offset; + gds_mlx5_dv_wq_t sq_wq; + gds_mlx5_dv_wq_t rq_wq; + gds_buf *dbr_buf; struct mlx5dv_devx_umem *dbr_umem; uint64_t dbr_va_id; diff --git a/src/utils.hpp b/src/utils.hpp index 4416e99..f719210 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -46,6 +46,18 @@ #endif +#ifndef ACCESS_ONCE + #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +#endif + +#ifndef READ_ONCE + #define READ_ONCE(x) ACCESS_ONCE(x) +#endif + +#ifndef WRITE_ONCE + #define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v)) +#endif + #ifndef MIN #define MIN(x, y) ((x) < (y) ? (x) : (y)) #endif From 3b51bd961d5c6f4b4f6b0e3224049baf9cb46b05 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 22 Oct 2021 00:14:55 -0700 Subject: [PATCH 47/50] Moved test applications to use gds_post_recv --- include/gdsync/core.h | 3 ++- src/apis.cpp | 9 +++++---- tests/gds_kernel_latency.c | 2 +- tests/gds_kernel_loopback_latency.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/gdsync/core.h b/include/gdsync/core.h index 25e706f..942decd 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -101,7 +101,8 @@ int gds_post_send(struct gds_qp *qp, gds_send_wr *wr, gds_send_wr **bad_wr); /* \brief: CPU-synchronous post recv for peer QPs * * Notes: - * - there is no GPU-synchronous version of this because there is not a use case for it. + * - There is no GPU-synchronous version of this because there is not a use case for it. + * - It is required for portability. For example, ibv_post_recv does not work with dv transport. 
*/ int gds_post_recv(struct gds_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); diff --git a/src/apis.cpp b/src/apis.cpp index cf9008c..ea65131 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -115,12 +115,13 @@ int gds_post_recv(struct gds_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr { int ret = 0; - gds_dbg("qp=%p wr=%p\n", qp, wr); assert(qp); - assert(qp->qp); - ret = ibv_post_recv(qp->qp, wr, bad_wr); + assert(wr); + assert(bad_wr); + gds_dbg("qp=%p wr=%p\n", qp, wr); + ret = gds_main_transport->post_recv(qp, wr, bad_wr); if (ret) { - gds_err("error %d in ibv_post_recv\n", ret); + gds_err("error %d in post_recv\n", ret); goto out; } diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index 2a4689f..b66bc0e 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -485,7 +485,7 @@ static int pp_post_recv(struct pingpong_context *ctx, int n) int i; for (i = 0; i < n; ++i) - if (ibv_post_recv(ctx->qp, &wr, &bad_wr)) + if (gds_post_recv(ctx->gds_qp, &wr, &bad_wr)) break; return i; diff --git a/tests/gds_kernel_loopback_latency.c b/tests/gds_kernel_loopback_latency.c index 8f95830..4b51b80 100644 --- a/tests/gds_kernel_loopback_latency.c +++ b/tests/gds_kernel_loopback_latency.c @@ -487,7 +487,7 @@ static int pp_post_recv(struct pingpong_context *ctx, int n) int i; gpu_dbg("posting %d recvs\n", n); for (i = 0; i < n; ++i) - if (ibv_post_recv(ctx->qp, &wr, &bad_wr)) + if (gds_post_recv(ctx->gds_qp, &wr, &bad_wr)) break; gpu_dbg("posted %d recvs\n", i); return i; From e8235aef370b241d6501be655891cc8d742615f3 Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 22 Oct 2021 00:23:24 -0700 Subject: [PATCH 48/50] Implemented gds_mlx5_exp_post_recv --- src/transports/mlx5-exp/mlx5-exp.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/transports/mlx5-exp/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp index b093156..c23c93e 100644 --- a/src/transports/mlx5-exp/mlx5-exp.cpp +++ b/src/transports/mlx5-exp/mlx5-exp.cpp @@ -1225,6 +1225,18 @@ uint32_t gds_mlx5_exp_get_num_send_request_entries(gds_send_request_t *request) //----------------------------------------------------------------------------- +int gds_mlx5_exp_post_recv(gds_qp_t *gqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) +{ + assert(gqp); + assert(gqp->qp); + assert(wr); + assert(bad_wr); + + return ibv_post_recv(gqp->qp, wr, bad_wr); +} + +//----------------------------------------------------------------------------- + int gds_transport_mlx5_exp_init(gds_transport_t **transport) { int status = 0; @@ -1247,6 +1259,8 @@ int gds_transport_mlx5_exp_init(gds_transport_t **transport) t->get_send_descs = gds_mlx5_exp_get_send_descs; t->get_num_send_request_entries = gds_mlx5_exp_get_num_send_request_entries; + t->post_recv = gds_mlx5_exp_post_recv; + t->init_wait_request = gds_mlx5_exp_init_wait_request; t->dump_wait_request = gds_mlx5_exp_dump_wait_request; t->stream_post_wait_descriptor = gds_mlx5_exp_stream_post_wait_descriptor; From 834e896e3ca3db8287678bf8074cbb4bf17581cc Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 22 Oct 2021 03:29:36 -0400 Subject: [PATCH 49/50] Fixed bugs when running with exp verbs --- src/transports/mlx5-exp/mlx5-exp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transports/mlx5-exp/mlx5-exp.cpp b/src/transports/mlx5-exp/mlx5-exp.cpp index c23c93e..eed1034 100644 --- a/src/transports/mlx5-exp/mlx5-exp.cpp +++ b/src/transports/mlx5-exp/mlx5-exp.cpp @@ -710,14 +710,14 @@ int 
gds_mlx5_exp_create_qp( // peer registration peer->alloc_type = gds_peer::WQ; - peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_DBREC_DEFAULT; + peer->alloc_flags = GDS_ALLOC_WQ_DEFAULT | GDS_ALLOC_WQ_DBREC_DEFAULT; if (flags & GDS_CREATE_QP_WQ_ON_GPU) { gds_err("error, QP WQ on GPU is not supported yet\n"); goto err; } if (flags & GDS_CREATE_QP_WQ_DBREC_ON_GPU) { gds_warn("QP WQ DBREC on GPU\n"); - peer->alloc_flags |= GDS_ALLOC_DBREC_ON_GPU; + peer->alloc_flags |= GDS_ALLOC_WQ_DBREC_ON_GPU; } exp_qp_attr.send_cq = tx_gmexpcq->gcq.cq; From 803d6707efd4e1f949f98f7b9bf9789cbf531e2d Mon Sep 17 00:00:00 2001 From: Pak Markthub Date: Fri, 22 Oct 2021 01:03:44 -0700 Subject: [PATCH 50/50] Fixed bugs in gds_mlx5_dv_modify_qp when using UD connection --- src/transports/mlx5-dv/mlx5-dv.cpp | 81 +++++++++++++++++------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/src/transports/mlx5-dv/mlx5-dv.cpp b/src/transports/mlx5-dv/mlx5-dv.cpp index be0bd95..570cd93 100644 --- a/src/transports/mlx5-dv/mlx5-dv.cpp +++ b/src/transports/mlx5-dv/mlx5-dv.cpp @@ -932,18 +932,18 @@ static int gds_mlx5_dv_modify_qp_init2rtr(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_ status = EINVAL; goto out; } - } - if (!(attr_mask & IBV_QP_PATH_MTU)) { - gds_err("IBV_QP_PATH_MTU is required.\n"); - status = EINVAL; - goto out; - } + if (!(attr_mask & IBV_QP_PATH_MTU)) { + gds_err("IBV_QP_PATH_MTU is required.\n"); + status = EINVAL; + goto out; + } - if (!(attr_mask & IBV_QP_AV)) { - gds_err("IBV_QP_AV is required.\n"); - status = EINVAL; - goto out; + if (!(attr_mask & IBV_QP_AV)) { + gds_err("IBV_QP_AV is required.\n"); + status = EINVAL; + goto out; + } } if (mdqp->port_attr.link_layer != IBV_LINK_LAYER_INFINIBAND) { @@ -964,7 +964,9 @@ static int gds_mlx5_dv_modify_qp_init2rtr(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_ DEVX_SET(init2rtr_qp_in, cmd_in, qpn, mdqp->gqp.qp->qp_num); qpc = DEVX_ADDR_OF(init2rtr_qp_in, cmd_in, qpc); - DEVX_SET(qpc, qpc, mtu, attr->path_mtu); + if (attr_mask & IBV_QP_PATH_MTU) + DEVX_SET(qpc, qpc, mtu, attr->path_mtu); + DEVX_SET(qpc, qpc, log_msg_max, GDS_MLX5_DV_LOG_MAX_MSG_SIZE); if (attr_mask & IBV_QP_DEST_QPN) @@ -1024,28 +1026,30 @@ static int gds_mlx5_dv_modify_qp_rtr2rts(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_a goto out; } - if (!(attr_mask & IBV_QP_MAX_QP_RD_ATOMIC)) { - gds_err("IBV_QP_MAX_QP_RD_ATOMIC is required.\n"); - status = EINVAL; - goto out; - } + if (mdqp->qp_type == GDS_MLX5_DV_QP_TYPE_RC) { + if (!(attr_mask & IBV_QP_MAX_QP_RD_ATOMIC)) { + gds_err("IBV_QP_MAX_QP_RD_ATOMIC is required.\n"); + status = EINVAL; + goto out; + } - if (!(attr_mask & IBV_QP_RETRY_CNT)) { - gds_err("IBV_QP_RETRY_CNT is required.\n"); - status = EINVAL; - goto out; - } + if (!(attr_mask & IBV_QP_RETRY_CNT)) { + gds_err("IBV_QP_RETRY_CNT is required.\n"); + status = EINVAL; + goto out; + } - if (!(attr_mask & IBV_QP_RNR_RETRY)) { - gds_err("IBV_QP_RNR_RETRY is required.\n"); - status = EINVAL; - goto out; - } + if (!(attr_mask & IBV_QP_RNR_RETRY)) { + gds_err("IBV_QP_RNR_RETRY is required.\n"); + status = EINVAL; + goto out; + } - if (!(attr_mask & IBV_QP_TIMEOUT)) { - gds_err("IBV_QP_TIMEOUT is required.\n"); - status = EINVAL; - goto out; + if (!(attr_mask & IBV_QP_TIMEOUT)) { + gds_err("IBV_QP_TIMEOUT is required.\n"); + status = EINVAL; + goto out; + } } if (!(attr_mask & IBV_QP_SQ_PSN)) { @@ -1059,12 +1063,21 @@ static int gds_mlx5_dv_modify_qp_rtr2rts(gds_mlx5_dv_qp_t *mdqp, struct ibv_qp_a DEVX_SET(rtr2rts_qp_in, cmd_in, qpn, mdqp->gqp.qp->qp_num); qpc = 
DEVX_ADDR_OF(rtr2rts_qp_in, cmd_in, qpc); - DEVX_SET(qpc, qpc, log_sra_max, GDS_ILOG2_OR0(attr->max_rd_atomic)); - DEVX_SET(qpc, qpc, retry_count, attr->retry_cnt); - DEVX_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + + if (attr_mask & IBV_QP_MAX_QP_RD_ATOMIC) + DEVX_SET(qpc, qpc, log_sra_max, GDS_ILOG2_OR0(attr->max_rd_atomic)); + + if (attr_mask & IBV_QP_RETRY_CNT) + DEVX_SET(qpc, qpc, retry_count, attr->retry_cnt); + + if (attr_mask & IBV_QP_RNR_RETRY) + DEVX_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + DEVX_SET(qpc, qpc, next_send_psn, attr->sq_psn); DEVX_SET(qpc, qpc, log_ack_req_freq, GDS_MLX5_DV_LOG_ACK_REQ_FREQ); - DEVX_SET(qpc, qpc, primary_address_path.ack_timeout, attr->timeout); + + if (attr_mask & IBV_QP_TIMEOUT) + DEVX_SET(qpc, qpc, primary_address_path.ack_timeout, attr->timeout); status = mlx5dv_devx_obj_modify(mdqp->devx_qp, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out)); if (status) {