131 commits
513f861
1. Disable third_party_install so flow doesn't automatically build AP…
jithunnair-amd May 5, 2020
ed421e9
Update setup.py to hipify before building extension
jithunnair-amd May 6, 2020
e82fa34
Cooperative groups is not supported by HIP yet, so replace with worka…
jithunnair-amd May 6, 2020
bdb8421
Use ROCm APEX
jithunnair-amd May 8, 2020
7c0e6ac
Update ROCm APEX commit to get FusedLayerNorm and hipification fixes
jithunnair-amd May 8, 2020
23200d4
Update requirements to use tensorflow-rocm package instead of tensorf…
jithunnair-amd May 8, 2020
c10bdcb
Use DeepSpeedExamples fork
jithunnair-amd May 8, 2020
43212b3
Typo
jithunnair-amd May 8, 2020
54ad8a5
Use changes_for_rocm_build branch for jithunnair-amd fork of DeepSpee…
jithunnair-amd May 16, 2020
453d501
Update ROCm APEX commit
jithunnair-amd May 28, 2020
4454bc2
Update DeepSpeedExamples commit
jithunnair-amd May 29, 2020
db28f75
Update ROCm Apex commit
jithunnair-amd Jun 25, 2020
30f661e
Merge branch upstream into master
jithunnair-amd Sep 15, 2020
077638d
Enable cooperative groups for ROCm
jithunnair-amd Sep 15, 2020
66c135e
Update setup.py to build lamb extension for ROCm
jithunnair-amd Sep 15, 2020
9379918
Do not install torch and torchvision for ROCm using pip
jithunnair-amd Sep 15, 2020
b5866a6
Use ROCm fork of DeepSpeedExamples
jithunnair-amd Sep 16, 2020
9c624c2
Update DeepSpeedExamples commit to use ROCm fork master branch
jithunnair-amd Sep 16, 2020
ab6aca1
Update DeepSpeedExamples commit
jithunnair-amd Sep 26, 2020
884f08e
ROCm PyTorch can be installed in the user local area in some cases
jithunnair-amd Sep 26, 2020
17febe5
Remove requirements.txt since upstream moved it to requirements folder
jithunnair-amd Sep 29, 2020
46d64e2
Add Dockerfile for ROCm
jithunnair-amd Sep 30, 2020
c2d4cc0
Add skips for unit tests that fail on ROCm. Current status: 72 passed…
jithunnair-amd Sep 30, 2020
9f0c80d
Enable CPU adam extension for ROCm
jithunnair-amd Oct 19, 2020
cb3f83a
Install requirements as appropriate for ROCm
jithunnair-amd Oct 27, 2020
617027f
Skip additional unit tests that fail on CI (but not locally)
jithunnair-amd Oct 28, 2020
a508e62
Do not skip unit tests which pass with latest PyTorch
jithunnair-amd Nov 3, 2020
3dd5e2d
Modify include files to build CPU Adam extension
jithunnair-amd Nov 3, 2020
77cd5c3
Update setup.py for latest hipify
jithunnair-amd Dec 16, 2020
7f9bbeb
Update CPU Adam header files to remove ifdefing unnecessary with late…
jithunnair-amd Dec 16, 2020
ea71005
Hipified transformer kernel extensions
jithunnair-amd Dec 23, 2020
fbddd93
Cooperative Groups workaround for transformer kernels extension
jithunnair-amd Dec 23, 2020
9091b20
Update apex commit
jithunnair-amd Jan 7, 2021
3edda06
Merge from upstream; resolve conflicts; checkout 'theirs' for tests/u…
jithunnair-amd Mar 26, 2021
5e6bb85
Integrate op_builder from upstream and update for ROCm
jithunnair-amd Mar 26, 2021
67ed124
Update Dockerfile.rocm
jithunnair-amd Mar 27, 2021
c4fe427
Temporary hacks to workaround: 1) setup.py issues on ROCm wrt. absolu…
jithunnair-amd Mar 27, 2021
74ebc97
torch.version.cuda doesn't exist for ROCm PyTorch
jithunnair-amd Mar 27, 2021
1bb74d0
Add hip_version
jithunnair-amd Mar 29, 2021
3d4e19d
Check hip version for ROCm builds
jithunnair-amd Mar 29, 2021
9939bd7
Remove unused dir
jithunnair-amd Mar 31, 2021
99571e5
Skipped the tests with the error,
rraminen Apr 8, 2021
9d8ad53
Updated Dockerfile.rocm
rraminen Apr 9, 2021
e323eab
Merge pull request #5 from ROCmSoftwarePlatform/Dockerfile.rocm_PR
jithunnair-amd Apr 10, 2021
529ebcd
Update skipIfRocm to add customizable reason string (#6)
jithunnair-amd Apr 12, 2021
37651f3
Disable AVX512 for ROCm to enable same build of DeepSpeed to work on …
jithunnair-amd Apr 13, 2021
7be71d3
Update headers and include_dirs to enable transformer extension (#8)
jithunnair-amd Apr 19, 2021
1c69737
Add patched CG headers to rocm install path (#9)
jithunnair-amd Apr 19, 2021
ac4f8d5
Update DeepSpeedExamples commit (#10)
jithunnair-amd Apr 19, 2021
14204ab
Update DeepSpeedExamples commit
jithunnair-amd Apr 21, 2021
827ebfb
Update DeepSpeedExamples commit
jithunnair-amd Apr 22, 2021
2f77a87
v0.3.15 IFU
rraminen Apr 28, 2021
0d06e02
Merge pull request #12 from rraminen/IFU
jeffdaily Apr 28, 2021
3f2657f
Add Github Actions ifu.yml
jithunnair-amd May 11, 2021
9b41aa7
Update ifu.yml to ignore DeepSpeedExamples
jithunnair-amd May 12, 2021
497f5a1
Merge remote-tracking branch 'upstream/master' into IFU-master-2021-0…
jithunnair-amd May 12, 2021
b1563d6
Merge pull request #13 from ROCmSoftwarePlatform/IFU-master-2021-05-12
jithunnair-amd May 12, 2021
2066405
Update DeepSpeedExamples commit
jithunnair-amd May 12, 2021
0a87051
Merge remote-tracking branch 'upstream/master'
invalid-email-address May 17, 2021
e827515
Use branch name in PR title/branch name
jithunnair-amd May 17, 2021
ae10359
Merge pull request #14 from ROCmSoftwarePlatform/IFU-master-2021-05-17
jithunnair-amd May 17, 2021
4c7a252
Add email functionality
jithunnair-amd May 21, 2021
7b900de
IFU-master-2021-05-27
rraminen May 28, 2021
5de081e
Pointed DeepSpeedExamples to latest commit after IFU
rraminen Jun 4, 2021
1850f88
Merge pull request #17 from rraminen/IFU_5_27
jithunnair-amd Jun 4, 2021
a62e1c7
Enabling the tests as installable version of cupy is now available on…
rraminen Jun 4, 2021
150dc1a
Skipped these tests as they fail on ROCm
rraminen Jun 4, 2021
dad3b5f
Merge remote-tracking branch 'upstream/master'
invalid-email-address Jun 7, 2021
c635d79
skipping other failing tests
rraminen Jun 9, 2021
d296665
Revert "Add patched CG headers to rocm install path (#9)"
rraminen Jun 23, 2021
f50fa7b
Revert "Update headers and include_dirs to enable transformer extensi…
rraminen Jun 23, 2021
2585f29
Added back the required code from the commits, 1c69737e1a8a8ae5ed9d29…
rraminen Jun 23, 2021
0be9645
Revert "Cooperative Groups workaround for transformer kernels extension"
rraminen Jun 23, 2021
f428da5
Added defined(__HIP_PLATFORM_HCC__) to kernels code
rraminen Jun 23, 2021
ed2ee34
Revert "Enable cooperative groups for ROCm"
rraminen Jun 23, 2021
742fd64
Enable cooperative groups for ROCm
rraminen Jun 23, 2021
1d20b14
Added CuPy installation from source
rraminen Jun 24, 2021
f6c79ae
Added h5py installation
rraminen Jun 24, 2021
0cf3306
Merge pull request #20 from rraminen/PR_Update_Dockerfile
jithunnair-amd Jun 28, 2021
81b744e
hip cooperative groups functionality for coalesced_group in fused_lam…
rraminen Jun 28, 2021
bf2979b
Merge pull request #21 from rraminen/PR_Revert_HIP_Cooperative_Groups…
jithunnair-amd Jun 29, 2021
5b0fac7
Revert "Merge pull request #21 from rraminen/PR_Revert_HIP_Cooperativ…
jithunnair-amd Jun 29, 2021
d98da5c
Merge pull request #19 from ROCmSoftwarePlatform/IFU-master-2021-06-07
jithunnair-amd Jun 30, 2021
1c9c561
IFU-master-2021-07-02
rraminen Jul 2, 2021
93ed86c
Included code for ROCm in include_paths()
rraminen Jul 2, 2021
e910bf2
Removed commented text
rraminen Jul 2, 2021
440d3bc
Merge pull request #18 from rraminen/rocm_PR_to_skip_failing_tests
jithunnair-amd Jul 6, 2021
a24b8ec
Added torch_available for ROCm specific check
rraminen Jul 6, 2021
536d0bb
Merge pull request #24 from rraminen/IFU-master-2021-07-02
jithunnair-amd Jul 6, 2021
15efc81
Make torch version check numeric
rraminen Jul 19, 2021
7c6bb76
Merge pull request #27 from rraminen/SWDEV_295133
jithunnair-amd Jul 20, 2021
c4ec23f
IFU-master-2021-07-26
rraminen Jul 26, 2021
cab7456
Enabled few tests which work on ROCm
rraminen Aug 4, 2021
32d448c
Merge pull request #30 from rraminen/IFU-master-2021-07-26
jithunnair-amd Aug 12, 2021
1d156bc
Merge pull request #31 from rraminen/PR_enable_megatron_tests
jithunnair-amd Aug 12, 2021
77bb30c
Merge remote-tracking branch 'upstream/master'
invalid-email-address Aug 12, 2021
4305aa7
Trigger Build
okakarpa Aug 16, 2021
d849d4b
Disabled send email
rraminen Aug 17, 2021
826c97b
Added ROCM_VERSION compiler directives to enable the build on ROCm 4.…
rraminen Aug 24, 2021
e58830d
Merge pull request #35 from rraminen/update_ifu_yml
jithunnair-amd Aug 24, 2021
902fabc
Merge pull request #32 from ROCmSoftwarePlatform/IFU-master-2021-08-12
jithunnair-amd Aug 24, 2021
93013ee
Using ROCM_MAJOR and ROCM_MINOR values from builder.py
rraminen Aug 25, 2021
b958aa7
Merge pull request #36 from rraminen/DeepSpeed_build_error_rocm4.4
jithunnair-amd Aug 25, 2021
ee9ef00
Updated DeepSpeedExamples commit (#37)
rraminen Sep 4, 2021
8d13a0c
Updated the DeepSpeedExamples submodule commit (#41)
rraminen Sep 15, 2021
9a1fdd7
Added 4-byte alignment on NCCL/RCCL (Cherry pick from upstream) (#42)
amathews-amd Sep 22, 2021
141ed70
IFU-master-2021-09-29
rraminen Sep 29, 2021
389cb5c
Trigger Build
rraminen Sep 30, 2021
f6d0071
Fix to enable DeepSpeed build on ROCm5.0.0 and above versions (#44)
rraminen Oct 1, 2021
9cf0419
Trigger Build
rraminen Oct 1, 2021
d4f0402
Trigger Build
rraminen Oct 1, 2021
5ab64f3
Trigger Build
rraminen Oct 3, 2021
03d9c9c
Trigger Build
rraminen Oct 4, 2021
7926893
Trigger Build
rraminen Oct 8, 2021
181a6b7
Trigger CI
rraminen Oct 29, 2021
f006d4f
Workaround to avoid ERROR: Package 'cupy' requires a different Python…
rraminen Nov 8, 2021
18940cb
Trigger CI
rraminen Nov 11, 2021
c9be5b8
Trigger CI
rraminen Nov 12, 2021
2db31ab
Trigger CI
rraminen Nov 15, 2021
2bc2f49
Merge pull request #43 from rraminen/IFU-master-2021-09-29
jithunnair-amd Nov 18, 2021
7097800
Updated DeepSpeedExamples commit
rraminen Nov 18, 2021
4e7b672
Merge pull request #47 from rraminen/DeepSpeed_Examples_commit
jithunnair-amd Nov 18, 2021
86b6f99
Resolved conflicts in setup.py, op_builder/__init__.py and pointed De…
rraminen Nov 23, 2021
2280803
Merge pull request #48 from rraminen/IFU-master-2021-11-23
jithunnair-amd Nov 23, 2021
536ee9c
requirements-rocm.txt is not required anymore
rraminen Nov 30, 2021
c7ba7ee
Merge pull request #49 from rraminen/requirements_fix
jithunnair-amd Nov 30, 2021
791ae24
Updated DeepSpeedExamples (#50)
rraminen Dec 7, 2021
bb4d5bf
THCudaCheck is deprecated (#51)
rraminen Dec 17, 2021
6f7c5c2
THCGeneral.h header file is deprecated (#54)
rraminen Feb 8, 2022
09f8f1f
Megatron-LM GPT2 ZeRO2 model on 16 GPUs (#55)
rraminen Mar 1, 2022
1614e98
Install MLNX_OFED to make best use of HPE-MI100-XGMI systems
Mar 29, 2022
63 changes: 63 additions & 0 deletions .github/workflows/ifu.yml
@@ -0,0 +1,63 @@
name: IntegrateFromUpstream
on:
  # schedule:
  #   # verified via crontab.guru website. “At 06:55 on Monday.”
  #   - cron: '55 6 * * 1'
  workflow_dispatch:
    inputs:
      message:
        description: 'Reason for manual trigger'
        required: false
        default: 'refresh branch'
jobs:
  IntegrateFromUpstream:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Get Current Date
        id: date
        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
      - name: Extract branch name
        id: extract_branch
        shell: bash
        run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
      - name: Fetch and Merge
        id: fetch_and_merge
        run: |
          echo "Reason for trigger: ${{ github.event.inputs.message }}"
          echo "Actor for trigger: ${{ github.actor }}"
          git config user.name github-actions
          git config user.email github-actions@github.com
          git remote add upstream https://github.com/microsoft/DeepSpeed
          git fetch upstream master
          git merge upstream/master
          # Since we use our own fork of DeepSpeedExamples, ignore theirs
          git checkout HEAD DeepSpeedExamples
      - name: Create Pull Request
        id: create_pull_request
        uses: jithunnair-amd/create-pull-request@v3
        with:
          # token: ${{ secrets.PAT }}
          branch: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
          title: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
          assignees: rraminen
          reviewers: jithunnair-amd
          delete-branch: true
      # - name: Send email
      #   uses: jithunnair-amd/action-send-mail@v3.1.0
      #   if: always()
      #   with:
      #     server_address: smtp.gmail.com
      #     server_port: 465
      #     secure: true
      #     username: ${{ secrets.GMAIL_USERNAME }}
      #     password: ${{ secrets.GMAIL_PASSWORD }}
      #     subject: IFU to ${{ steps.extract_branch.outputs.branch }} branch of ${{ github.repository }}
      #     to: Jithun.Nair@amd.com, RamyaSai.Ramineni@amd.com
      #     from: ${{ secrets.GMAIL_USERNAME }}
      #     html_body: |
      #       <b>Fetch and Merge</b>: ${{ steps.fetch_and_merge.outcome }} <br/>
      #       <b>Create Pull Request</b>: ${{ steps.create_pull_request.outcome }} <br/>"""
      #       <b>Pull request</b>: <a href="${{ steps.create_pull_request.outputs.pull-request-url }}">${{ steps.create_pull_request.outputs.pull-request-url }}</a> <br/>
3 changes: 1 addition & 2 deletions .gitmodules
@@ -1,4 +1,3 @@
[submodule "DeepSpeedExamples"]
path = DeepSpeedExamples
url = https://github.com/microsoft/DeepSpeedExamples
branch = master
url = https://github.com/ROCmSoftwarePlatform/DeepSpeedExamples.git
2 changes: 1 addition & 1 deletion DeepSpeedExamples
18 changes: 18 additions & 0 deletions csrc/includes/cublas_wrappers.h
@@ -5,7 +5,9 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>

int cublas_gemm_ex(cublasHandle_t handle,
@@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
@@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif

int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
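The guards above let one set of GEMM wrapper declarations compile against either cuBLAS or rocBLAS: the PR keeps the cublas* spellings and relies on hipify to rewrite them at build time, with __HIP_PLATFORM_HCC__ switching the algorithm-selector type and its default value. A minimal standalone sketch of the same idea follows; the blas_handle_t/blas_algo_t aliases and the gemm_ex_sketch name are illustrative only and are not introduced by the PR.

#ifdef __HIP_PLATFORM_HCC__
#include <rocblas.h>
typedef rocblas_handle blas_handle_t;       // illustrative alias, not in the PR
typedef rocblas_gemm_algo blas_algo_t;
#define BLAS_ALGO_DEFAULT rocblas_gemm_algo_standard
#else
#include <cublas_v2.h>
typedef cublasHandle_t blas_handle_t;       // illustrative alias, not in the PR
typedef cublasGemmAlgo_t blas_algo_t;
#define BLAS_ALGO_DEFAULT CUBLAS_GEMM_DEFAULT
#endif

// One declaration then serves both platforms; only the algorithm-selector
// type and its default value differ between CUDA and ROCm builds.
int gemm_ex_sketch(blas_handle_t handle,
                   int m,
                   int n,
                   int k,
                   const float* A,
                   const float* B,
                   float* C,
                   blas_algo_t algo = BLAS_ALGO_DEFAULT);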
4 changes: 4 additions & 0 deletions csrc/includes/custom_cuda_layers.h
@@ -5,7 +5,11 @@
#include <stdio.h>
#include <stdlib.h>

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>

#include "context.h"
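On ROCm the cooperative-groups API is provided by hip/hip_cooperative_groups.h rather than cooperative_groups.h, but the cooperative_groups namespace it exposes covers the basic thread_block operations, so kernels written against that namespace build on both platforms once the include is guarded as above. A small illustrative kernel, not from the PR and assuming a launch with blockDim.x == 256, that sticks to that common subset:

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif

namespace cg = cooperative_groups;

// Per-block sum reduction; the scratch size must match the launch block size.
__global__ void block_sum_sketch(const float* in, float* out)
{
    __shared__ float scratch[256];
    cg::thread_block b = cg::this_thread_block();

    const unsigned tid = b.thread_rank();
    scratch[tid] = in[blockIdx.x * b.size() + tid];
    b.sync();  // same effect as __syncthreads()

    for (unsigned stride = b.size() / 2; stride > 0; stride /= 2) {
        if (tid < stride) scratch[tid] += scratch[tid + stride];
        b.sync();
    }
    if (tid == 0) out[blockIdx.x] = scratch[0];
}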
12 changes: 12 additions & 0 deletions csrc/includes/feed_forward.h
@@ -43,7 +43,11 @@ class FeedForward {
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
@@ -68,7 +72,11 @@ class FeedForward {
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
@@ -81,7 +89,11 @@ class FeedForward {
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
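Each GEMM call in FeedForward now repeats the same four-line #ifdef just to cast the stored integer algorithm id to the platform-specific enum. A hypothetical helper, not part of this PR and assuming the cuBLAS/rocBLAS headers are already included as they are in these files, could confine that branch to one place; the gemm_algo_t and to_gemm_algo names below are invented for illustration.

#ifdef __HIP_PLATFORM_HCC__
using gemm_algo_t = rocblas_gemm_algo;
#else
using gemm_algo_t = cublasGemmAlgo_t;
#endif

// Convert the int stored in config_.gemm_algos[] to the platform's enum type.
static inline gemm_algo_t to_gemm_algo(int algo_index)
{
    return static_cast<gemm_algo_t>(algo_index);
}

With such a helper a call site would read cublas_gemm_ex(_cublasHandle, ..., to_gemm_algo(config_.gemm_algos[0])); and the preprocessor branching would live in a single header.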
36 changes: 36 additions & 0 deletions csrc/includes/gemm_test.h
@@ -2,7 +2,9 @@
#pragma once

#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
@@ -58,7 +60,11 @@ class GemmTest {
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw1 = Run(loops, [=](int algo) {
@@ -73,7 +79,11 @@ class GemmTest {
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw2 = Run(loops, [=](int algo) {
@@ -88,7 +98,11 @@ class GemmTest {
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@@ -100,8 +114,13 @@ class GemmTest {
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard;
algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
@@ -186,7 +205,11 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw1 = Run(loops, [=](int algo) {
@@ -216,7 +239,11 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw2 = Run(loops, [=](int algo) {
@@ -243,7 +270,11 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@@ -255,8 +286,13 @@ class StridedGemmTest {
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard;
algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
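The guards in GemmTest and StridedGemmTest collapse the autotuning range to a single candidate on ROCm, because rocBLAS exposes only rocblas_gemm_algo_standard, while cuBLAS offers CUBLAS_GEMM_DEFAULT_TENSOR_OP through CUBLAS_GEMM_ALGO15_TENSOR_OP. A simplified sketch of that search loop, assuming a caller-supplied functor f(int algo) that launches one synchronized GEMM with the given algorithm id (the PR's actual Run() helper does the timing with its own machinery):

#ifdef __HIP_PLATFORM_HCC__
#include <rocblas.h>
#else
#include <cublas_v2.h>
#endif
#include <chrono>
#include <limits>

template <typename F>
int pick_fastest_algo(F f, int loops)
{
    float fast_latency = (std::numeric_limits<float>::max)();
    int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
    const int first_algo = (int)rocblas_gemm_algo_standard;   // single candidate on ROCm
    const int last_algo = (int)rocblas_gemm_algo_standard;
#else
    const int first_algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
    const int last_algo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif

    for (int algo = first_algo; algo <= last_algo; algo++) {
        for (int i = 0; i < 5; ++i) f(algo);                   // warm-up runs

        auto start = std::chrono::steady_clock::now();
        for (int i = 0; i < loops; ++i) f(algo);
        auto stop = std::chrono::steady_clock::now();

        // Average per-call latency in milliseconds; keep the fastest algorithm.
        float latency =
            std::chrono::duration<float, std::milli>(stop - start).count() / loops;
        if (latency < fast_latency) {
            fast_latency = latency;
            fast_algo = algo;
        }
    }
    return fast_algo;
}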
4 changes: 4 additions & 0 deletions csrc/includes/general_kernels.h
@@ -3,7 +3,11 @@
#include <stdio.h>
#include <stdlib.h>

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>

#include "context.h"