From 58707396292ec51a8a188338326e188994c98d23 Mon Sep 17 00:00:00 2001 From: Yaniv Blumenfeld Date: Sun, 11 Nov 2018 08:30:18 +0200 Subject: [PATCH 1/5] reformatted changes in direct verbs branch to one compact commit --- Makefile.am | 12 ++++---- configure.ac | 22 ++++++++++++-- include/gdsync/core.h | 53 +++++++++++++++++++++++++++++++- src/apis.cpp | 70 ++++++++++++++++++++++++++++++++++++++----- src/gdsync.cpp | 40 +++++++++++++++++++++++-- 5 files changed, 178 insertions(+), 19 deletions(-) diff --git a/Makefile.am b/Makefile.am index a8103cf..bc7e688 100644 --- a/Makefile.am +++ b/Makefile.am @@ -6,7 +6,7 @@ AM_CPPFLAGS += -I$(srcdir)/src #AM_CPPFLAGS += -I$(CUDA_PATH)/include AM_CPPFLAGS += -D__STDC_FORMAT_MACROS -#AM_LDFLAGS = -L$(CUDA_PATH)/lib64 +AM_LDFLAGS = -lmlx5 LIBGDSTOOLS = @LIBGDSTOOLS@ LIBNVTX = @LIBNVTX@ @@ -73,7 +73,7 @@ bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loop noinst_PROGRAMS = tests/rstest tests/wqtest tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp -tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) +tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi -lmlx5 $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) tests_rstest_SOURCES = tests/rstest.cpp tests_rstest_LDADD = @@ -82,20 +82,20 @@ tests_wqtest_SOURCES = tests/task_queue_test.cpp tests_wqtest_LDADD = $(PTHREAD_LIBS) tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu -tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) +tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmlx5 -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) tests_gds_sanity_SOURCES = tests/gds_sanity.cpp tests/gpu.cpp tests/gpu_kernels.cu -tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) +tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmlx5 -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu -tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) +tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmlx5 $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS) endif SUFFIXES= .cu .cu.o: - $(NVCC) $(CPPFLAGS) $(AM_CPPFLAGS) $(NVCCFLAGS) $(GENCODE_FLAGS) -c -o $@ $< + $(NVCC) $(CPPFLAGS) $(AM_LDFLAGS) $(AM_CPPFLAGS) $(NVCCFLAGS) $(GENCODE_FLAGS) -c -o $@ $< .cu.lo: $(LIBTOOL) --tag=CXX --mode=compile $(top_srcdir)/cudalt $(NVCC) --resource-usage -o $@ -c $< $(CPPFLAGS) $(AM_CPPFLAGS) $(NVCCFLAGS) $(GENCODE_FLAGS) diff --git a/configure.ac b/configure.ac index 901fce3..f227c42 100644 --- a/configure.ac +++ b/configure.ac @@ -47,6 +47,18 @@ else fi fi +AC_ARG_WITH([libmlx5], + AC_HELP_STRING([--with-libmlx5], [ Set path to libmlx5s installation ])) +if test x$with_libmlx5 = x || test x$with_libmlx5 = xno; then + want_libmlx5=no +else + want_libmlx5=yes + if test -d $with_libmlx5; then + CPPFLAGS="$CPPFLAGS -I$with_libmlx5/include" + LDFLAGS="$LDFLAGS -L$with_libmlx5/lib" + fi +fi + AC_ARG_WITH([gdrcopy], AC_HELP_STRING([--with-gdrcopy], [ Set path to gdrcopy installation ])) if test x$with_gdrcopy = x || test x$with_gdrcopy = xno; then @@ -149,6 +161,13 @@ AC_CHECK_LIB(ibverbs, ibv_exp_create_qp, AC_CHECK_HEADER(infiniband/peer_ops.h, [], AC_MSG_ERROR([ not found. libgdsync requires verbs peer-direct support.])) + +AC_CHECK_HEADERS([infiniband/mlx5dv.h], [], + AC_MSG_ERROR([ not found. libgdsync requires verbs peer-direct support.])) + +AC_CHECK_DECLS([mlx5dv_init_obj], + [], [], [[#include ]]) + AC_HEADER_STDC dnl Checks for typedefs, structures, and compiler characteristics. @@ -175,12 +194,11 @@ LDFLAGS="$LDFLAGS -L$CUDA_DRV_PATH/lib64 -L$CUDA_DRV_PATH/lib -L$CUDA_PATH/lib64 NVCCFLAGS="$NVCCFLAGS" CUDA_CFLAGS="$CUDA_CFLAGS" CUDA_LDFLAGS="-L$CUDA_DRV_PATH/lib64 -L$CUDA_DRV_PATH/lib -L$CUDA_PATH/lib64 -L$CUDA_PATH/lib" -CUDA_LIBS="-lcuda -lcudart -lcufft" +CUDA_LIBS="-lcuda -lcudart -lcufft -lmlx5" NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS $CUDA_LIBS" AC_SUBST(NVCC, [nvcc]) AC_SUBST(NVCCFLAGS) - dnl AC_CHECK_MEMBER([union CUstreamBatchMemOpParams_union.flushRemoteWrites], dnl [AC_SUBST( HAS_CUDA_MEMOP_FLUSH_REMOTE_WRITES, 1 )], dnl [AC_MSG_NOTICE([flushRemoteWrites is not defined])], diff --git a/include/gdsync/core.h b/include/gdsync/core.h index 284b8eb..8fac522 100644 --- a/include/gdsync/core.h +++ b/include/gdsync/core.h @@ -39,6 +39,8 @@ ( ((((v) & 0xffff0000U) >> 16) == GDS_API_MAJOR_VERSION) && \ ((((v) & 0x0000ffffU) >> 0 ) >= GDS_API_MINOR_VERSION) ) +#define IBV_EXP_SEND_GET_INFO (1 << 28) + typedef enum gds_param { GDS_PARAM_VERSION, GDS_NUM_PARAMS @@ -68,6 +70,11 @@ struct gds_qp { struct gds_cq recv_cq; struct ibv_exp_res_domain * res_domain; struct ibv_context *dev_context; + + void* swq; //send work queue pointer + size_t swq_cnt; //counter tracking swq location + size_t swq_size; //size of the swq (Blocks) + size_t swq_stride; //size of Blocks }; /* \brief: Create a peer-enabled QP attached to the specified GPU id. @@ -159,8 +166,23 @@ typedef enum gds_update_send_info_type { * Represents a posted send operation on a particular QP */ +#define GDS_SEND_MAX_SGE 16 + +struct ptr_to_sge{ + uintptr_t ptr_to_size; + uintptr_t ptr_to_lkey; + uintptr_t ptr_to_addr; + int offset; +}; + +struct gds_swr_info{ + size_t num_sge; + struct ptr_to_sge sge_list[GDS_SEND_MAX_SGE]; + size_t wr_id; +}; + typedef struct gds_send_request_info { - struct ibv_qp_swr_info swr_info; + struct gds_swr_info swr_info; //Size info uintptr_t ptr_to_size_wqe_h; CUdeviceptr ptr_to_size_wqe_d; @@ -350,6 +372,35 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ */ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags); +/** + * \brief: TODO + * + * + * \param flags - TODO + * + * \return + * 0 on success or one standard errno error + * + * + * Notes: + * - TODO. + */ +int gds_report_post(struct gds_qp *gqp /*, struct gds_send_wr* wr*/); + +/** + * \brief: TODO + * + * + * \param flags - TODO + * + * \return + * 0 on success or one standard errno error + * + * + * Notes: + * - TODO. + */ +int gds_query_last_info(struct gds_qp* qp, struct gds_swr_info* gds_info); /* * Local variables: diff --git a/src/apis.cpp b/src/apis.cpp index 0f8946a..6984c44 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -127,13 +127,14 @@ static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t * send_info, en //----------------------------------------------------------------------------- #define ntohll(x) (((uint64_t)(ntohl((int)((x << 32) >> 32))) << 32) | (uint32_t)ntohl(((int)(x >> 32)))) -static void gds_dump_swr(const char * func_name, struct ibv_qp_swr_info swr_info) + +static void gds_dump_swr(const char * func_name, struct gds_swr_info swr_info) { gds_dbg("[%s] wr_id=%lx, num_sge=%d\n", func_name, swr_info.wr_id, swr_info.num_sge); for(int j=0; j < swr_info.num_sge; j++) { - gds_dbg("[%s] SGE=%d, Size ptr=0x%08x, Size=%d (0x%08x), +offset=%d\n", + gds_dbg("[%s] SGE=%d, Size ptr=00x%lx, Size=%d (0x%08x), +offset=%d\n", func_name, j, (uintptr_t)swr_info.sge_list[j].ptr_to_size, @@ -141,14 +142,14 @@ static void gds_dump_swr(const char * func_name, struct ibv_qp_swr_info swr_info (uint32_t) ((uint32_t*)swr_info.sge_list[j].ptr_to_size)[0], ((uint32_t) ntohl( ((uint32_t*)swr_info.sge_list[j].ptr_to_size)[0]) ) + swr_info.sge_list[j].offset ); - gds_dbg("[%s] SGE=%d, lkey ptr=0x%08x, lkey=%d (0x%08x)\n", + gds_dbg("[%s] SGE=%d, lkey ptr=00x%lx, lkey=%d (0x%08x)\n", func_name, j, (uintptr_t)swr_info.sge_list[j].ptr_to_lkey, (uint32_t) ntohl( ((uint32_t*)swr_info.sge_list[j].ptr_to_lkey)[0]) , (uint32_t) ((uint32_t*)swr_info.sge_list[j].ptr_to_lkey)[0]); - gds_dbg("[%s] SGE=%d, Addr ptr=%lx, Addr=%lx -offset=%lx\n", + gds_dbg("[%s] SGE=%d, Addr ptr=00x%lx, Addr=%lx -offset=%lx\n", func_name, j, (uintptr_t)swr_info.sge_list[j].ptr_to_addr, @@ -233,7 +234,7 @@ int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, (int)p_ewr->sg_list[i].length ); } - memset(&(request->gds_sinfo.swr_info), 0, sizeof(struct ibv_qp_swr_info)); + memset(&(request->gds_sinfo.swr_info), 0, sizeof(struct gds_swr_info)); } ret = ibv_exp_post_send(qp->qp, p_ewr, bad_ewr); @@ -246,18 +247,19 @@ int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, } goto out; } - + if(get_info) { - ret = ibv_exp_query_send_info(qp->qp, p_ewr->wr_id, &(request->gds_sinfo.swr_info)); + ret = gds_query_last_info(qp, &(request->gds_sinfo.swr_info)); if(ret) { - fprintf(stderr, "ibv_exp_query_send_info returned %d: %s\n", ret, strerror(ret)); + fprintf(stderr, "gds_query_last_info returned %d: %s\n", ret, strerror(ret)); goto out; } gds_dump_swr("gds_prepare_send", request->gds_sinfo.swr_info); } + ret = gds_report_post(qp /*, p_ewr*/); //increment counter. ret = ibv_exp_peer_commit_qp(qp->qp, &request->commit); if (ret) { @@ -1180,6 +1182,58 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) return ret; } +struct mlx5_sge{ + uint32_t byte_count; + uint32_t key; + uint64_t addr; +}; + +struct mlx5_send_wqe{ + uint32_t ctrl1; + uint32_t qpn_ds; + uint64_t ctrl34; + uint64_t send12; + uint64_t send34; + struct mlx5_sge sge; +}; + +int gds_report_post(struct gds_qp *qp /*, struct gds_send_wr* wr*/){ + ++(qp->swq_cnt); + return 0; + /*//Smarter Alternative for cases we use larger wqes: + struct mlx5_send_wqe* wqe = (struct mlx5_send_wqe*) ((char*) qp->swq + qp->swq_stride * ((qp->swq_cnt) % qp->swq_size)); + size_t ds = (ntohl(wqe->qpn_ds) & (0x0000007f)); + size_t wqes_per_block = (qp->swq_stride / sizeof(mlx5_sge)); + size_t num_blocks = ds / wqes_per_block + !!(ds % wqes_per_block); + (qp->swq_cnt)+=num_blocks; + return 0; + */ +} + +int gds_query_last_info(struct gds_qp *qp, struct gds_swr_info* gds_info){ + struct mlx5_send_wqe* wqe = (struct mlx5_send_wqe*) ((char*) qp->swq + qp->swq_stride * ((qp->swq_cnt) % qp->swq_size)); + gds_info->num_sge = (ntohl(wqe->qpn_ds) & (0x0000007f)) - 2; + struct mlx5_sge* sge = &(wqe->sge); + size_t blocks_per_wqe = (qp->swq_stride / sizeof(mlx5_sge)); + + uint16_t blocks_left = ((qp->swq_size - (qp->swq_cnt % qp->swq_size)) * qp->swq_stride) - 2; + //we need to monitor how many blocks we have left before wrap around. + + for (size_t i = 0; i< gds_info->num_sge; ++i){ + gds_info->sge_list[i].ptr_to_size = (uintptr_t) &(sge->byte_count); + gds_info->sge_list[i].ptr_to_lkey = (uintptr_t) &(sge->key); + gds_info->sge_list[i].ptr_to_addr = (uintptr_t) &(sge->addr); + gds_info->sge_list[i].offset = 0; //why is that here? + if (i == blocks_left){ + sge = (struct mlx5_sge*) qp->swq; + } else { + (++sge); + } + } + gds_info->wr_id = 1; //just exists to match old API. + return 0; +} + //----------------------------------------------------------------------------- /* diff --git a/src/gdsync.cpp b/src/gdsync.cpp index 2a45169..cf6f30e 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -44,6 +44,10 @@ #include "mlnxutils.h" #include "task_queue.hpp" +extern "C" { +#include +} + //----------------------------------------------------------------------------- void gds_assert(const char *cond, const char *file, unsigned line, const char *function) @@ -1871,7 +1875,32 @@ gds_create_cq(struct ibv_context *context, int cqe, //----------------------------------------------------------------------------- -struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, + +int gds_add_dv_qp_ctx(struct gds_qp* gqp){ + struct mlx5dv_obj dv_obj = {}; + struct mlx5dv_qp* dv_qp = (struct mlx5dv_qp *)malloc(sizeof(struct mlx5dv_qp)); + memset((void *)&dv_obj, 0, sizeof(struct mlx5dv_obj)); + + dv_obj.qp.in = gqp->qp; + dv_obj.qp.out = dv_qp; + int ret = mlx5dv_init_obj(&dv_obj, MLX5DV_OBJ_QP); + + if (ret) + return ret; + + gqp->swq_cnt = 0; + gqp->swq_size = dv_qp->sq.wqe_cnt; + gqp->swq = dv_qp->sq.buf; + gqp->swq_stride = dv_qp->sq.stride; + free(dv_qp); + gds_dbg("extracted dv_qp context=%p\n", gqp); + return 0; +} + +//----------------------------------------------------------------------------- + +struct gds_qp* +gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gds_qp_init_attr_t *qp_attr, int gpu_id, int flags) { int ret = 0; @@ -1964,8 +1993,15 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context, gqp->recv_cq.cq = qp->recv_cq; gqp->recv_cq.curr_offset = 0; - gds_dbg("created gds_qp=%p\n", gqp); + ret = gds_add_dv_qp_ctx(gqp); + if (ret){ + ret = EINVAL; + gds_err("error in gds_add_dv_qp_ctx\n"); + goto err; + } + + gds_dbg("created gds_qp=%p\n", gqp); return gqp; err: From 4f47911742f20abd7395c0e0ee5d13a7af6899d6 Mon Sep 17 00:00:00 2001 From: Yaniv Blumenfeld Date: Tue, 13 Nov 2018 08:53:32 +0200 Subject: [PATCH 2/5] Fixed the sge count --- src/apis.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/apis.cpp b/src/apis.cpp index 6984c44..4e23c35 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -1189,7 +1189,7 @@ struct mlx5_sge{ }; struct mlx5_send_wqe{ - uint32_t ctrl1; + uint32_t opmod_wqeidx_opcode; uint32_t qpn_ds; uint64_t ctrl34; uint64_t send12; @@ -1212,11 +1212,25 @@ int gds_report_post(struct gds_qp *qp /*, struct gds_send_wr* wr*/){ int gds_query_last_info(struct gds_qp *qp, struct gds_swr_info* gds_info){ struct mlx5_send_wqe* wqe = (struct mlx5_send_wqe*) ((char*) qp->swq + qp->swq_stride * ((qp->swq_cnt) % qp->swq_size)); - gds_info->num_sge = (ntohl(wqe->qpn_ds) & (0x0000007f)) - 2; + + size_t base_blocks = 1; + switch (ntohl(wqe->opmod_wqeidx_opcode) & (0x000000ff)){ + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + base_blocks = 2; + break; + case IBV_WR_SEND: + default: + base_blocks = 1; + } + + gds_info->num_sge = (ntohl(wqe->qpn_ds) & (0x0000007f)) - base_blocks; + struct mlx5_sge* sge = &(wqe->sge); size_t blocks_per_wqe = (qp->swq_stride / sizeof(mlx5_sge)); - uint16_t blocks_left = ((qp->swq_size - (qp->swq_cnt % qp->swq_size)) * qp->swq_stride) - 2; + uint16_t blocks_left = ((qp->swq_size - (qp->swq_cnt % qp->swq_size)) * qp->swq_stride) - base_blocks; //we need to monitor how many blocks we have left before wrap around. for (size_t i = 0; i< gds_info->num_sge; ++i){ From 263a587e59284cb32aafb08085450da10718b346 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 15 Nov 2018 10:24:57 +0200 Subject: [PATCH 3/5] Update configure.ac Co-Authored-By: yanivbl6 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f227c42..fb1d78f 100644 --- a/configure.ac +++ b/configure.ac @@ -163,7 +163,7 @@ AC_CHECK_HEADER(infiniband/peer_ops.h, [], AC_MSG_ERROR([ not found. libgdsync requires verbs peer-direct support.])) AC_CHECK_HEADERS([infiniband/mlx5dv.h], [], - AC_MSG_ERROR([ not found. libgdsync requires verbs peer-direct support.])) + AC_MSG_ERROR([ not found. libgdsync requires direct verbs support.])) AC_CHECK_DECLS([mlx5dv_init_obj], [], [], [[#include ]]) From 612813e73136051c71f880754c19d2f4c63ffb67 Mon Sep 17 00:00:00 2001 From: Yaniv Blumenfeld Date: Sun, 9 Dec 2018 06:42:46 +0200 Subject: [PATCH 4/5] Fixed info extraction, memory leak on error, and build-related comment --- Makefile.am | 2 +- src/apis.cpp | 2 -- src/gdsync.cpp | 4 +++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.am b/Makefile.am index bc7e688..57be658 100644 --- a/Makefile.am +++ b/Makefile.am @@ -95,7 +95,7 @@ endif SUFFIXES= .cu .cu.o: - $(NVCC) $(CPPFLAGS) $(AM_LDFLAGS) $(AM_CPPFLAGS) $(NVCCFLAGS) $(GENCODE_FLAGS) -c -o $@ $< + $(NVCC) $(CPPFLAGS) $(AM_CPPFLAGS) $(NVCCFLAGS) $(GENCODE_FLAGS) -c -o $@ $< .cu.lo: $(LIBTOOL) --tag=CXX --mode=compile $(top_srcdir)/cudalt $(NVCC) --resource-usage -o $@ -c $< $(CPPFLAGS) $(AM_CPPFLAGS) $(NVCCFLAGS) $(GENCODE_FLAGS) diff --git a/src/apis.cpp b/src/apis.cpp index 4e23c35..5248613 100644 --- a/src/apis.cpp +++ b/src/apis.cpp @@ -1192,8 +1192,6 @@ struct mlx5_send_wqe{ uint32_t opmod_wqeidx_opcode; uint32_t qpn_ds; uint64_t ctrl34; - uint64_t send12; - uint64_t send34; struct mlx5_sge sge; }; diff --git a/src/gdsync.cpp b/src/gdsync.cpp index cf6f30e..2c1f8c0 100644 --- a/src/gdsync.cpp +++ b/src/gdsync.cpp @@ -1885,8 +1885,10 @@ int gds_add_dv_qp_ctx(struct gds_qp* gqp){ dv_obj.qp.out = dv_qp; int ret = mlx5dv_init_obj(&dv_obj, MLX5DV_OBJ_QP); - if (ret) + if (ret){ + free(dv_qp); return ret; + } gqp->swq_cnt = 0; gqp->swq_size = dv_qp->sq.wqe_cnt; From 18c817796d7f222fad714c15fd9b000daf9f694e Mon Sep 17 00:00:00 2001 From: Yaniv Blumenfeld Date: Sun, 16 Dec 2018 10:25:49 +0200 Subject: [PATCH 5/5] Added MPI Barrier in validation, which allows test to pass --- tests/gds_kernel_latency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/gds_kernel_latency.c b/tests/gds_kernel_latency.c index f2a3342..e2ad5ae 100644 --- a/tests/gds_kernel_latency.c +++ b/tests/gds_kernel_latency.c @@ -1103,7 +1103,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin if(ctx->validate) { cudaDeviceSynchronize(); - + MPI_Barrier(MPI_COMM_WORLD); cudaMemcpy(ctx->validate_buf, ctx->rxbuf, ctx->size, cudaMemcpyDefault); char *value = (char*)ctx->validate_buf; char expected=i%CHAR_MAX;