-
Notifications
You must be signed in to change notification settings - Fork 19
RFC - Expose send params driver bypass #75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: expose_send_params
Are you sure you want to change the base?
Changes from all commits
5870739
4f47911
263a587
612813e
18c8177
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,8 @@ | |
| ( ((((v) & 0xffff0000U) >> 16) == GDS_API_MAJOR_VERSION) && \ | ||
| ((((v) & 0x0000ffffU) >> 0 ) >= GDS_API_MINOR_VERSION) ) | ||
|
|
||
| #define IBV_EXP_SEND_GET_INFO (1 << 28) | ||
|
|
||
| typedef enum gds_param { | ||
| GDS_PARAM_VERSION, | ||
| GDS_NUM_PARAMS | ||
|
|
@@ -68,6 +70,11 @@ struct gds_qp { | |
| struct gds_cq recv_cq; | ||
| struct ibv_exp_res_domain * res_domain; | ||
| struct ibv_context *dev_context; | ||
|
|
||
| void* swq; //send work queue pointer | ||
| size_t swq_cnt; //counter tracking swq location | ||
| size_t swq_size; //size of the swq (Blocks) | ||
| size_t swq_stride; //size of Blocks | ||
| }; | ||
|
|
||
| /* \brief: Create a peer-enabled QP attached to the specified GPU id. | ||
|
|
@@ -159,8 +166,23 @@ typedef enum gds_update_send_info_type { | |
| * Represents a posted send operation on a particular QP | ||
| */ | ||
|
|
||
| #define GDS_SEND_MAX_SGE 16 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to enforce this limitation? Maybe check it in |
||
|
|
||
| struct ptr_to_sge{ | ||
| uintptr_t ptr_to_size; | ||
| uintptr_t ptr_to_lkey; | ||
| uintptr_t ptr_to_addr; | ||
| int offset; | ||
| }; | ||
|
|
||
| struct gds_swr_info{ | ||
| size_t num_sge; | ||
| struct ptr_to_sge sge_list[GDS_SEND_MAX_SGE]; | ||
| size_t wr_id; | ||
| }; | ||
|
|
||
| typedef struct gds_send_request_info { | ||
| struct ibv_qp_swr_info swr_info; | ||
| struct gds_swr_info swr_info; | ||
| //Size info | ||
| uintptr_t ptr_to_size_wqe_h; | ||
| CUdeviceptr ptr_to_size_wqe_d; | ||
|
|
@@ -350,6 +372,35 @@ int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_ | |
| */ | ||
| int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags); | ||
|
|
||
| /** | ||
| * \brief: TODO | ||
| * | ||
| * | ||
| * \param flags - TODO | ||
| * | ||
| * \return | ||
| * 0 on success or one standard errno error | ||
| * | ||
| * | ||
| * Notes: | ||
| * - TODO. | ||
| */ | ||
| int gds_report_post(struct gds_qp *gqp /*, struct gds_send_wr* wr*/); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function documentation is still missing. I guess this function advances the tracking in the gds_qp struct of the current producer index with the given send wr size? |
||
|
|
||
| /** | ||
| * \brief: TODO | ||
| * | ||
| * | ||
| * \param flags - TODO | ||
| * | ||
| * \return | ||
| * 0 on success or one standard errno error | ||
| * | ||
| * | ||
| * Notes: | ||
| * - TODO. | ||
| */ | ||
| int gds_query_last_info(struct gds_qp* qp, struct gds_swr_info* gds_info); | ||
|
|
||
| /* | ||
| * Local variables: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -127,28 +127,29 @@ static int gds_rollback_qp(struct gds_qp *qp, gds_send_request_t * send_info, en | |
| //----------------------------------------------------------------------------- | ||
|
|
||
| #define ntohll(x) (((uint64_t)(ntohl((int)((x << 32) >> 32))) << 32) | (uint32_t)ntohl(((int)(x >> 32)))) | ||
| static void gds_dump_swr(const char * func_name, struct ibv_qp_swr_info swr_info) | ||
|
|
||
| static void gds_dump_swr(const char * func_name, struct gds_swr_info swr_info) | ||
| { | ||
| gds_dbg("[%s] wr_id=%lx, num_sge=%d\n", func_name, swr_info.wr_id, swr_info.num_sge); | ||
|
|
||
| for(int j=0; j < swr_info.num_sge; j++) | ||
| { | ||
| gds_dbg("[%s] SGE=%d, Size ptr=0x%08x, Size=%d (0x%08x), +offset=%d\n", | ||
| gds_dbg("[%s] SGE=%d, Size ptr=00x%lx, Size=%d (0x%08x), +offset=%d\n", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a typo here (00x), and you might want to put debugging print changes in a separate patch, to make the review easier. |
||
| func_name, | ||
| j, | ||
| (uintptr_t)swr_info.sge_list[j].ptr_to_size, | ||
| (uint32_t) ntohl( ((uint32_t*)swr_info.sge_list[j].ptr_to_size)[0]) , | ||
| (uint32_t) ((uint32_t*)swr_info.sge_list[j].ptr_to_size)[0], | ||
| ((uint32_t) ntohl( ((uint32_t*)swr_info.sge_list[j].ptr_to_size)[0]) ) + swr_info.sge_list[j].offset ); | ||
|
|
||
| gds_dbg("[%s] SGE=%d, lkey ptr=0x%08x, lkey=%d (0x%08x)\n", | ||
| gds_dbg("[%s] SGE=%d, lkey ptr=00x%lx, lkey=%d (0x%08x)\n", | ||
| func_name, | ||
| j, | ||
| (uintptr_t)swr_info.sge_list[j].ptr_to_lkey, | ||
| (uint32_t) ntohl( ((uint32_t*)swr_info.sge_list[j].ptr_to_lkey)[0]) , | ||
| (uint32_t) ((uint32_t*)swr_info.sge_list[j].ptr_to_lkey)[0]); | ||
|
|
||
| gds_dbg("[%s] SGE=%d, Addr ptr=%lx, Addr=%lx -offset=%lx\n", | ||
| gds_dbg("[%s] SGE=%d, Addr ptr=00x%lx, Addr=%lx -offset=%lx\n", | ||
| func_name, | ||
| j, | ||
| (uintptr_t)swr_info.sge_list[j].ptr_to_addr, | ||
|
|
@@ -233,7 +234,7 @@ int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, | |
| (int)p_ewr->sg_list[i].length | ||
| ); | ||
| } | ||
| memset(&(request->gds_sinfo.swr_info), 0, sizeof(struct ibv_qp_swr_info)); | ||
| memset(&(request->gds_sinfo.swr_info), 0, sizeof(struct gds_swr_info)); | ||
| } | ||
|
|
||
| ret = ibv_exp_post_send(qp->qp, p_ewr, bad_ewr); | ||
|
|
@@ -246,18 +247,19 @@ int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, | |
| } | ||
| goto out; | ||
| } | ||
|
|
||
| if(get_info) | ||
| { | ||
| ret = ibv_exp_query_send_info(qp->qp, p_ewr->wr_id, &(request->gds_sinfo.swr_info)); | ||
| ret = gds_query_last_info(qp, &(request->gds_sinfo.swr_info)); | ||
| if(ret) | ||
| { | ||
| fprintf(stderr, "ibv_exp_query_send_info returned %d: %s\n", ret, strerror(ret)); | ||
| fprintf(stderr, "gds_query_last_info returned %d: %s\n", ret, strerror(ret)); | ||
| goto out; | ||
| } | ||
|
|
||
| gds_dump_swr("gds_prepare_send", request->gds_sinfo.swr_info); | ||
| } | ||
| ret = gds_report_post(qp /*, p_ewr*/); //increment counter. | ||
|
|
||
| ret = ibv_exp_peer_commit_qp(qp->qp, &request->commit); | ||
| if (ret) { | ||
|
|
@@ -1180,6 +1182,70 @@ int gds_post_descriptors(size_t n_descs, gds_descriptor_t *descs, int flags) | |
| return ret; | ||
| } | ||
|
|
||
| struct mlx5_sge{ | ||
| uint32_t byte_count; | ||
| uint32_t key; | ||
| uint64_t addr; | ||
| }; | ||
|
|
||
| struct mlx5_send_wqe{ | ||
| uint32_t opmod_wqeidx_opcode; | ||
| uint32_t qpn_ds; | ||
| uint64_t ctrl34; | ||
| struct mlx5_sge sge; | ||
| }; | ||
|
|
||
| int gds_report_post(struct gds_qp *qp /*, struct gds_send_wr* wr*/){ | ||
| ++(qp->swq_cnt); | ||
| return 0; | ||
| /*//Smarter Alternative for cases we use larger wqes: | ||
| struct mlx5_send_wqe* wqe = (struct mlx5_send_wqe*) ((char*) qp->swq + qp->swq_stride * ((qp->swq_cnt) % qp->swq_size)); | ||
| size_t ds = (ntohl(wqe->qpn_ds) & (0x0000007f)); | ||
| size_t wqes_per_block = (qp->swq_stride / sizeof(mlx5_sge)); | ||
| size_t num_blocks = ds / wqes_per_block + !!(ds % wqes_per_block); | ||
| (qp->swq_cnt)+=num_blocks; | ||
| return 0; | ||
| */ | ||
| } | ||
|
|
||
| int gds_query_last_info(struct gds_qp *qp, struct gds_swr_info* gds_info){ | ||
| struct mlx5_send_wqe* wqe = (struct mlx5_send_wqe*) ((char*) qp->swq + qp->swq_stride * ((qp->swq_cnt) % qp->swq_size)); | ||
|
|
||
| size_t base_blocks = 1; | ||
| switch (ntohl(wqe->opmod_wqeidx_opcode) & (0x000000ff)){ | ||
| case IBV_WR_RDMA_WRITE: | ||
| case IBV_WR_RDMA_WRITE_WITH_IMM: | ||
| case IBV_WR_RDMA_READ: | ||
| base_blocks = 2; | ||
| break; | ||
| case IBV_WR_SEND: | ||
| default: | ||
| base_blocks = 1; | ||
| } | ||
|
|
||
| gds_info->num_sge = (ntohl(wqe->qpn_ds) & (0x0000007f)) - base_blocks; | ||
|
|
||
| struct mlx5_sge* sge = &(wqe->sge); | ||
| size_t blocks_per_wqe = (qp->swq_stride / sizeof(mlx5_sge)); | ||
|
|
||
| uint16_t blocks_left = ((qp->swq_size - (qp->swq_cnt % qp->swq_size)) * qp->swq_stride) - base_blocks; | ||
| //we need to monitor how many blocks we have left before wrap around. | ||
|
|
||
| for (size_t i = 0; i< gds_info->num_sge; ++i){ | ||
| gds_info->sge_list[i].ptr_to_size = (uintptr_t) &(sge->byte_count); | ||
| gds_info->sge_list[i].ptr_to_lkey = (uintptr_t) &(sge->key); | ||
| gds_info->sge_list[i].ptr_to_addr = (uintptr_t) &(sge->addr); | ||
| gds_info->sge_list[i].offset = 0; //why is that here? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does the offset field stand for?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @e-ago do you remember why you had offset in the first place?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where did you find this? Considering file https://github.com/gpudirect/libmlx5/blob/expose_send_params/src/qp.c the offset is modified here
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the offset field affecting the data written into the wqe in some way? |
||
| if (i == blocks_left){ | ||
| sge = (struct mlx5_sge*) qp->swq; | ||
| } else { | ||
| (++sge); | ||
| } | ||
| } | ||
| gds_info->wr_id = 1; //just exists to match old API. | ||
| return 0; | ||
| } | ||
|
|
||
| //----------------------------------------------------------------------------- | ||
|
|
||
| /* | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does the code still keep this flag in send_flags? I think it would be better to use a separate field so that future send flags in libibverbs won't conflict with this definition.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the problem here is that gds_send_wr is simply ibv_exp_send_wr.
maybe we want to have a new flags arg for gds_prepare_send().