Fix Chakra Errors #185
Changes from all commits
3bbfa50
568a545
0ff2430
1b17d40
5b0a378
8476ebc
7750db3
@@ -11,6 +11,7 @@
     COMM_RECV_NODE,
     COMM_SEND_NODE,
     COMP_NODE,
+    METADATA_NODE,
     REDUCE_SCATTER,
     GlobalMetadata,
 )
@@ -338,6 +339,8 @@ def get_protobuf_node_type_from_json_node( | |
| Returns: | ||
| int: The corresponding Chakra node type. | ||
| """ | ||
| if json_node.is_metadata_op(): | ||
| return METADATA_NODE | ||
| if json_node.is_gpu_op(): | ||
| if "ncclDevKernel_SendRecv" in json_node.name: | ||
| parent_node = json_node_map[json_node.parent] | ||
|
|
@@ -346,10 +349,17 @@ def get_protobuf_node_type_from_json_node( | |
| if parent_node.name == "record_param_comms" | ||
| else parent_node.name | ||
| ) | ||
| if parent_node.name == "record_param_comms" and parent_node.pg_name != "": | ||
| json_node.pg_name = parent_node.pg_name | ||
| if "send" in keyword: | ||
| return COMM_SEND_NODE | ||
| if "recv" in keyword: | ||
| return COMM_RECV_NODE | ||
| # In NCCL, all-to-all communication is implemented using point-to-point | ||
| # communications. More details can be found here: | ||
| # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html | ||
| if "nccl:all_to_all" in keyword: | ||
|
Contributor comment: c10d makes multiple send & receive calls, but we believe these are coalesced into one c10d::allToall (c10d::collective::fn): https://github.com/pytorch/pytorch/blob/7c71ab1d40992ea2660bb124152e95b7a9a5119d/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L4912
+                    return COMM_COLL_NODE
             if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
                 return COMM_COLL_NODE
         return COMP_NODE
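
For readers outside the diff context, here is a minimal sketch of how the node-type classification reads after this change. The JSONNode dataclass, the integer constants, and the simplified keyword derivation are illustrative stand-ins for the real Chakra definitions; only the branching order mirrors the diff above.

# Illustrative sketch only: stand-in constants and a simplified node object,
# mirroring the branching order of get_protobuf_node_type_from_json_node.
from dataclasses import dataclass
from typing import Dict, Optional

METADATA_NODE, COMM_SEND_NODE, COMM_RECV_NODE, COMM_COLL_NODE, COMP_NODE = range(5)

@dataclass
class JSONNode:
    name: str
    parent: Optional[int] = None
    pg_name: str = ""
    is_metadata: bool = False
    is_gpu: bool = False

def classify(json_node: JSONNode, json_node_map: Dict[int, JSONNode]) -> int:
    if json_node.is_metadata:  # new branch: metadata ops get their own node type
        return METADATA_NODE
    if json_node.is_gpu and "ncclDevKernel_SendRecv" in json_node.name:
        parent_node = json_node_map[json_node.parent]
        keyword = parent_node.name  # the real code derives this from record_param_comms
        # new: propagate the process-group name recorded on the parent CPU op
        if parent_node.name == "record_param_comms" and parent_node.pg_name != "":
            json_node.pg_name = parent_node.pg_name
        if "send" in keyword:
            return COMM_SEND_NODE
        if "recv" in keyword:
            return COMM_RECV_NODE
        # NCCL implements all-to-all with point-to-point sends/recvs, so a
        # SendRecv kernel under nccl:all_to_all is classified as a collective.
        if "nccl:all_to_all" in keyword:
            return COMM_COLL_NODE
    if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
        return COMM_COLL_NODE
    return COMP_NODE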
@@ -379,6 +389,10 @@ def get_collective_comm_type(self, name: str) -> int: | |
| for key in comm_type_mapping: | ||
| if key in normalized_name: | ||
| return comm_type_mapping[key] | ||
| # If both COMM_COLL_NAME and ncclDevKernel_SendRecv are present, this is nccl:all_to_all. | ||
| if "ncclDevKernel_SendRecv" in name: | ||
| return comm_type_mapping["alltoall"] | ||
|
|
||
| raise ValueError( | ||
| f"The name '{name}' does not correspond to a recognized collective communication type. " | ||
| "The converter determines collective communication types based on the node name of a GPU operator. " | ||
|
|
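
A hedged sketch of the lookup with the new fallback follows. The comm_type_mapping keys, the collective-type constants, and the name normalization are illustrative assumptions, not the converter's actual definitions.

# Illustrative sketch only: stand-in collective-type constants and mapping.
ALL_REDUCE, ALL_GATHER, ALL_TO_ALL, REDUCE_SCATTER = range(4)

comm_type_mapping = {
    "allreduce": ALL_REDUCE,
    "allgather": ALL_GATHER,
    "alltoall": ALL_TO_ALL,
    "reducescatter": REDUCE_SCATTER,
}

def get_collective_comm_type(name: str) -> int:
    normalized_name = name.replace("_", "").replace("-", "").lower()
    for key, comm_type in comm_type_mapping.items():
        if key in normalized_name:
            return comm_type
    # New fallback: a SendRecv device kernel that was already classified as a
    # collective (nccl:all_to_all) carries no collective keyword in its name,
    # so map it to all-to-all explicitly.
    if "ncclDevKernel_SendRecv" in name:
        return comm_type_mapping["alltoall"]
    raise ValueError(f"Unrecognized collective communication type: '{name}'")

# e.g. get_collective_comm_type("ncclDevKernel_SendRecv_something") returns ALL_TO_ALL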
@@ -460,11 +474,15 @@ def convert_ctrl_dep_to_data_dep( | |
| if json_node.sync_dep: | ||
| for sync_dep in json_node.sync_dep: | ||
| if sync_dep not in current_node.data_deps: | ||
| current_node.data_deps.append(sync_dep) | ||
| logging.info( | ||
| f"Node ID {current_node.id} now has an synchonization dependency on Node ID {sync_dep}" | ||
| ) | ||
|
|
||
| # Found a bug encoding false dependency HTA. | ||
| # Compare start_time to eliminate false sync dependency. | ||
| prior_node = protobuf_node_map.get(sync_dep) | ||
| if prior_node is not None and prior_node.start_time_micros < current_node.start_time_micros: | ||
| current_node.data_deps.append(sync_dep) | ||
| logging.debug( | ||
| f"Node ID {current_node.id} now has an synchonization dependency on Node ID " | ||
| f"{sync_dep}" | ||
| ) | ||
| # Add children to the stack | ||
| children_chakra_ids = [child.id for child in json_node.children] | ||
| for child_chakra_id in sorted(children_chakra_ids, reverse=True): | ||
|
|
||
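
The reasoning behind the start-time comparison: HTA can record a sync "dependency" on a node that starts later than the dependent node, which cannot be a real dependency. A minimal sketch under assumed, simplified stand-ins for the protobuf nodes:

# Illustrative sketch only: simplified stand-ins for the protobuf nodes.
import logging
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class PBNode:
    id: int
    start_time_micros: int
    data_deps: List[int] = field(default_factory=list)

def add_sync_deps(current_node: PBNode, sync_deps: List[int],
                  protobuf_node_map: Dict[int, PBNode]) -> None:
    for sync_dep in sync_deps:
        if sync_dep in current_node.data_deps:
            continue
        prior_node = protobuf_node_map.get(sync_dep)
        # Keep the edge only when the prior node really starts earlier;
        # otherwise the recorded sync dependency is treated as false.
        if prior_node is not None and prior_node.start_time_micros < current_node.start_time_micros:
            current_node.data_deps.append(sync_dep)
            logging.debug("Node ID %d now depends on Node ID %d", current_node.id, sync_dep)

# A dependency on node 2, which starts later than node 3, is dropped.
nodes = {1: PBNode(1, 100), 2: PBNode(2, 300)}
current = PBNode(3, 200)
add_sync_deps(current, [1, 2], nodes)
assert current.data_deps == [1]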
Review comment: The string name nccl::all_to_all is hardcoded below: https://github.com/pytorch/pytorch/blob/7c71ab1d40992ea2660bb124152e95b7a9a5119d/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L4974
There is a possibility that once this changes in PyTorch, this converter breaks.
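
Not part of this PR, but one way to illustrate the concern: the keyword could be isolated behind a single named constant, so a rename on the PyTorch side only needs to be tracked in one place. The constant and helper names below are hypothetical.

# Illustrative suggestion only (not in this PR): keep the label that PyTorch
# hardcodes in ProcessGroupNCCL.cpp in one named constant with a pointer to
# its origin, so a future rename in PyTorch is a one-line fix here.
NCCL_ALL_TO_ALL_KEYWORD = "nccl:all_to_all"  # mirrors the c10d profiling label

def is_all_to_all_keyword(parent_op_name: str) -> bool:
    return NCCL_ALL_TO_ALL_KEYWORD in parent_op_name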