From cf336fd8103d8a01e3ff5521e4ac07e24ebc0dd9 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 29 Oct 2021 09:24:05 -0500 Subject: [PATCH 1/4] Add better PCI ID backup in rocm_agent_enumerator The PCI ID backup method in rocm_agent_enumerator, where the tool uses lspci to find all AMD GPU devices in the system and manually match them to gfx version, is extremely outdated. The PCI ID list did not include anything after Vega 10, and the actual call to lspci no longer returned anything due to some missing conversions. The patch adds all GPUs that might be needed by ROCr up through Navy Flounder. The PCI ID to gfx matching pulls from the amdgpu driver and libhsakmt. --- rocm_agent_enumerator | 51 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/rocm_agent_enumerator b/rocm_agent_enumerator index f62daa4..b669565 100755 --- a/rocm_agent_enumerator +++ b/rocm_agent_enumerator @@ -10,7 +10,9 @@ CWD = os.path.dirname(os.path.realpath(__file__)) ISA_TO_ID = { # Kaveri - Temporary - "gfx700" : [0x130f], + "gfx700" : [0x1304, 0x1305, 0x1306, 0x1307, 0x1309, 0x130a, 0x130b, 0x130c, + 0x130d, 0x130e, 0x130f, 0x1310, 0x1311, 0x1312, 0x1313, 0x1315, + 0x1316, 0x1317, 0x1318, 0x131b, 0x131c, 0x131d], # Hawaii "gfx701" : [0x67a0, 0x67a1, 0x67a2, 0x67a8, 0x67a9, 0x67aa, 0x67b0, 0x67b1, 0x67b8, 0x67b9, 0x67ba, 0x67be], @@ -23,15 +25,52 @@ ISA_TO_ID = { "gfx803" : [0x7300, 0x730f, # Polaris10 0x67c0, 0x67c1, 0x67c2, 0x67c4, 0x67c7, 0x67c8, 0x67c9, 0x67ca, - 0x67cc, 0x67cf, + 0x67cc, 0x67cf, 0x6fdf, # Polaris11 0x67d0, 0x67df, 0x67e0, 0x67e1, 0x67e3, 0x67e7, 0x67e8, 0x67e9, 0x67eb, 0x67ef, 0x67ff, # Polaris12 - 0x6980, 0x6981, 0x6985, 0x6986, 0x6987, 0x6995, 0x6997, 0x699f], + 0x6980, 0x6981, 0x6985, 0x6986, 0x6987, 0x6995, 0x6997, 0x699f, + # VegaM + 0x694c, 0x694e, 0x694f], # Vega10 - "gfx900" : [0x6860, 0x6861, 0x6862, 0x6863, 0x6864, 0x6867, 0x6868, 0x686c, - 0x687f] + "gfx900" : [0x6860, 0x6861, 0x6862, 0x6863, 
0x6864, 0x6867, 0x6868, 0x6869, + 0x6869, 0x686a, 0x686b, 0x686c, 0x686d, 0x686e, 0x686f, 0x687f], + # Raven + "gfx902" : [0x15dd, 0x15d8], + # Vega12 + "gfx904" : [0x69a0, 0x69a1, 0x69a2, 0x69a3, 0x69af], + # Vega20 + "gfx906" : [0x66a0, 0x66a1, 0x66a2, 0x66a3, 0x66a4, 0x66a7, 0x66af], + # Arcturus + "gfx908" : [0x738c, 0x7388, 0x738e, 0x7390], + # Aldebaran + "gfx90a" : [0x7408, 0x740c, 0x740f, 0x7410], + # Renoir + "gfx90c" : [0x15e7, 0x1636, 0x1638, 0x164c], + # Navi10 + "gfx1010" : [0x7310, 0x7312, 0x7318, 0x7319, 0x731a, 0x731b, 0x731e, 0x731f], + # Navi12 + "gfx1011" : [0x7360, 0x7362], + # Navi14 + "gfx1012" : [0x7340, 0x7341, 0x7347, 0x734f], + # Cyan_Skillfish + "gfx1013" : [0x13f9, 0x13fa, 0x13fb, 0x13fc, 0x13f3], + # Sienna_Cichlid + "gfx1030" : [0x73a0, 0x73a1, 0x73a2, 0x73a3, 0x73a5, 0x73a8, 0x73a9, 0x73ab, + 0x73ac, 0x73ad, 0x73ae, 0x73af, 0x73bf], + # Navy_Flounder + "gfx1031" : [0x73c0, 0x73c1, 0x73c3, 0x73da, 0x73db, 0x73dc, 0x73dd, 0x73de, + 0x73df], + # Dimgray_Cavefish + "gfx1032" : [0x73e0, 0x73e1, 0x73e2, 0x73e3, 0x73e8, 0x73e9, 0x73ea, 0x73eb, + 0x73ec, 0x73ed, 0x73ef, 0x73ff], + # Van Gogh + "gfx1033" : [0x163f], + # Beige_Goby + "gfx1034" : [0x7420, 0x7421, 0x7422, 0x7423, 0x743f], + # Yellow_Carp + "gfx1035" : [0x164d, 0x1681] } def staticVars(**kwargs): @@ -99,7 +138,7 @@ def readFromLSPCI(): try: # run lspci - lspci_output = subprocess.Popen(["/usr/bin/lspci", "-n", "-d", "1002:"], stdout=subprocess.PIPE).communicate()[0].split('\n') + lspci_output = subprocess.Popen(["/usr/bin/lspci", "-n", "-d", "1002:"], stdout=subprocess.PIPE).communicate()[0].decode("utf-8").split('\n') except: lspci_output = [] From 5fd28711ef43c0aaaa5eded92b6c119680e65156 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 29 Oct 2021 10:34:47 -0500 Subject: [PATCH 2/4] Update cmake to include pciutils dependencies When building packages, add in pciutils as a dependency because rocm_agent_enumerator uses this as a mechanism for looking up what GPUs exist 
on the system. --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 112d1d7..9475068 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,7 +207,7 @@ if (DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE}) else() set(CPACK_DEBIAN_PACKAGE_RELEASE "local") endif() -set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core, pciutils") #RPM package specific variables if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX) @@ -218,7 +218,7 @@ if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE}) else() set(CPACK_RPM_PACKAGE_RELEASE "local") endif() -set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core") +set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core, pciutils") #Set rpm distro if(CPACK_RPM_PACKAGE_RELEASE) @@ -233,8 +233,8 @@ set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") # Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake if(NOT ROCM_DEP_ROCMCORE) - string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) - string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) + string(REGEX REPLACE ",? ?rocm-core, " "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core, " "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) endif() include ( CPack ) From 0385fcdb7bcff542cc206a2a4ac4701b907b7153 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 29 Oct 2021 10:35:35 -0500 Subject: [PATCH 3/4] Switch order of lspci and rocminfo for gfx arch query rocminfo is a very heavyweight mechanism for learning a lot of information about the GPUs that are attached to the system. It opens up the limited /dev/kfd resource to gather lots of information about each device, while rocm_agent_enumerator really only wants the gfx number of AMD devices attached to the system. 
To avoid this heavyweight lookup in most cases, this patch switches the order of tests. Rather than starting with rocminfo and then falling back to a poorly-maintained PCI ID list, this patch changes the agent enumerator to start by checking in the PCI ID list (fast case) and then falling back to rocminfo (slow case) if the PCI ID list is out of date. --- rocm_agent_enumerator | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rocm_agent_enumerator b/rocm_agent_enumerator index b669565..0ab2094 100755 --- a/rocm_agent_enumerator +++ b/rocm_agent_enumerator @@ -168,18 +168,18 @@ def main(): execution of "rocminfo" is not possible. 2. target.lst : user-supplied text file. This is used in a container setting where ROCm stack may usually not available. - 3. rocminfo : a tool shipped with this script to enumerate GPU agents - available on a working ROCm stack. - 4. lspci : enumerate PCI bus and locate supported devices from a hard-coded + 3. lspci : enumerate PCI bus and locate supported devices from a hard-coded lookup table. + 4. rocminfo : a tool shipped with this script to enumerate GPU agents + available on a working ROCm stack. """ target_list = readFromTargetLstFile() if len(target_list) == 0: - target_list = readFromROCMINFO() + target_list = readFromLSPCI() if len(target_list) == 0: - target_list = readFromLSPCI() + target_list = readFromROCMINFO() # workaround to cope with existing rocm_agent_enumerator behavior where gfx000 # would always be returned From 071b41ef21ccea7baab26b8c79897834a6583eb6 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 29 Oct 2021 16:25:10 -0500 Subject: [PATCH 4/4] Add capability to pull gfx architecture from KFD topology New versions of amdkfd include the gfx architecture version number for all GPUs surfaced in the HSA topology. This patch adds this as the preferred way for rocm_agent_enumerator to check for supported gfx architecture numbers. 
Kernels that are missing this feature will not have the value in the topology. rocm_agent_enumerator will fall back to checking against the PCI IDs in this case. If PCI IDs fail, we fall back to the heavyweight rocminfo method. --- rocm_agent_enumerator | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/rocm_agent_enumerator b/rocm_agent_enumerator index 0ab2094..f4051d0 100755 --- a/rocm_agent_enumerator +++ b/rocm_agent_enumerator @@ -156,6 +156,32 @@ def readFromLSPCI(): return target_list +def readFromKFD(): + target_list = [] + + topology_dir = '/sys/class/kfd/kfd/topology/nodes/' + for node in sorted(os.listdir(topology_dir)): + node_path = os.path.join(topology_dir, node) + if os.path.isdir(node_path): + prop_path = node_path + '/properties' + if os.path.isfile(prop_path): + target_search_term = re.compile("gfx_target_version.+") + with open(prop_path) as f: + line = f.readline() + while line != '' : + search_result = target_search_term.search(line) + if search_result is not None: + device_id = int(search_result.group(0).split(' ')[1], 10) + if device_id != 0: + major_ver = int((device_id / 10000) % 100) + minor_ver = int((device_id / 100) % 100) + stepping_ver = int(device_id % 100) + target_list.append("gfx" + format(major_ver, 'x') + format(minor_ver, 'x') + format(stepping_ver, 'x')) + line = f.readline() + + return target_list + + def main(): """Prints the list of available AMD GCN ISA @@ -175,6 +201,9 @@ def main(): """ target_list = readFromTargetLstFile() + if len(target_list) == 0: + target_list = readFromKFD() + if len(target_list) == 0: target_list = readFromLSPCI()