Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9801241
Update the supported tf version to 2.7.0 and allow precise profiling.
Xreki Jan 4, 2022
6702a92
Remove the support of tf under 1.15.0.
Xreki Jan 4, 2022
f4ed56f
Allow controlling the benchmark through environment variables.
Xreki Jan 4, 2022
6ec9b0f
Remove the setting of optimizer options and fix showing bugs when writing…
Xreki Jan 5, 2022
fac27de
Remove argsort from no_backward_ops list.
Xreki Jan 5, 2022
f457a23
Merge branch 'master' into api/enhance_tf
Xreki Jan 5, 2022
aac1074
Change the approval github ids.
Xreki Jan 6, 2022
3dd70d5
Merge branch 'master' into api/enhance_tf
Xreki Jan 7, 2022
cf162b2
Exclude the DtoH time when needs fetch.
Xreki Jan 7, 2022
5e61144
Merge branch 'master' into api/enhance_tf
Xreki Jan 17, 2022
ea9822c
Merge branch 'master' into api/enhance_tf
Xreki Jan 17, 2022
685bf70
Add fused_batch_norm_relu scripts.
Xreki Jan 18, 2022
3fdc3a9
Merge branch 'master' into api/enhance_tf
Xreki Jan 18, 2022
4438014
Does not test the fused_xxx ops in ci.
Xreki Jan 19, 2022
8923bcc
Merge branch 'master' into api/enhance_tf
Xreki Jan 20, 2022
f477069
Add fused_batch_norm_add_relu and fix a bug.
Xreki Jan 21, 2022
02cd833
Merge branch 'master' into api/enhance_tf
Xreki Jan 23, 2022
93506e5
Merge branch 'master' into api/enhance_tf
Xreki Feb 7, 2022
7941213
Merge branch 'master' into api/enhance_tf
Xreki Feb 20, 2022
ec29e72
Update tf version to 2.8.0.
Xreki Feb 21, 2022
968c892
Merge branch 'master' into api/enhance_tf
Xreki Feb 22, 2022
6678cfe
Change copyright.
Xreki Mar 2, 2022
1eaa9c3
Merge branch 'master' into api/enhance_tf
Xreki Mar 2, 2022
3345fe2
Merge branch 'master' into api/enhance_tf
Xreki Mar 11, 2022
af45550
Merge branch 'master' into api/enhance_tf
Xreki Mar 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions api/common/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os


def benchmark_need_feed():
    """Return True when the BENCHMARK_NEED_FEED environment variable is set
    to a truthy value ("1", "true" or "yes", case-insensitive).

    The original implementation returned ``os.environ.get(name, False)``,
    i.e. the raw string value, so even ``BENCHMARK_NEED_FEED=0`` or
    ``BENCHMARK_NEED_FEED=False`` counted as enabled.
    """
    return os.environ.get("BENCHMARK_NEED_FEED", "").lower() in ("1", "true",
                                                                 "yes")


def benchmark_need_fetch():
    """Return True when the BENCHMARK_NEED_FETCH environment variable is set
    to a truthy value ("1", "true" or "yes", case-insensitive).

    The original implementation returned ``os.environ.get(name, False)``,
    i.e. the raw string value, so even ``BENCHMARK_NEED_FETCH=0`` or
    ``BENCHMARK_NEED_FETCH=False`` counted as enabled.
    """
    return os.environ.get("BENCHMARK_NEED_FETCH", "").lower() in ("1", "true",
                                                                  "yes")
140 changes: 105 additions & 35 deletions api/common/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import sys
import argparse

from common import env
from common import system
from common import api_param

Expand All @@ -33,19 +34,76 @@ def is_ampere_gpu():
return False


class TimeUnit(object):
    """Accumulator for GPU time parsed from a profiler summary.

    Time is split into kernel execution time and memory-operation time
    (HtoD / DtoH / DtoD copies and memset). When fetching results is
    enabled (env.benchmark_need_fetch()), DtoH time is excluded from the
    total because that copy is the fetch itself.
    All times are in milliseconds.
    """

    def __init__(self):
        self.kernel_time = 0.0
        self.memory_time = 0.0
        self.memcpy_h2d = 0.0
        self.memcpy_d2h = 0.0
        self.memcpy_d2d = 0.0
        self.memset = 0.0

    def total(self):
        """Return total gpu time (ms); recomputes memory_time from parts."""
        self.memory_time = self.memcpy_h2d + self.memcpy_d2d + self.memset
        if not env.benchmark_need_fetch():
            # Normally DtoH is fetching results.
            self.memory_time += self.memcpy_d2h
        return self.kernel_time + self.memory_time

    def __str__(self):
        total_time = self.total()
        if env.benchmark_need_fetch():
            infostr = "total gpu_time (exclude DtoH): {:.4f} ms ".format(
                total_time)
        else:
            infostr = "total gpu_time: {:.4f} ms ".format(total_time)
        if total_time > 0.0:
            infostr += "(kernel: {:.4f} ms ({:.2f}%); memory: {:.4f} ms ({:.2f}%))".format(
                self.kernel_time, self.kernel_time * 100 / total_time,
                self.memory_time, self.memory_time * 100 / total_time)
        else:
            infostr += "(kernel: {:.4f} ms; memory: {:.4f} ms)".format(
                self.kernel_time, self.memory_time)
        infostr += "\n"
        return infostr

    def add_info(self, time, name):
        """Classify one profiler row by its function name and accumulate."""
        if name == "[CUDA memcpy HtoD]":
            self._update_memory_time("memcpy_h2d", time)
        elif name == "[CUDA memcpy DtoH]":
            self._update_memory_time("memcpy_d2h", time)
        elif name == "[CUDA memcpy DtoD]":
            self._update_memory_time("memcpy_d2d", time)
        elif name == "[CUDA memset]":
            self._update_memory_time("memset", time)
        else:
            self.kernel_time += time

    def _update_memory_time(self, member_name, time):
        assert member_name in [
            "memcpy_h2d", "memcpy_d2h", "memcpy_d2d", "memset"
        ]
        # Accumulate instead of overwriting: the original used
        # setattr(self, member_name, time), which silently dropped earlier
        # entries if the same memcpy kind appeared more than once.
        setattr(self, member_name, getattr(self, member_name) + time)
        if member_name != "memcpy_d2h" or not env.benchmark_need_fetch():
            self.memory_time += time


class NvprofRunner(object):
def run(self, cmd):
stdout, exit_code = self._nvprof(cmd)
def run(self, cmd, profile_from_start=False):
    """Profile cmd with nvprof and return the parsed total gpu time (ms).

    Returns 0.0 and prints the profiler output when the command fails or
    its output cannot be parsed.
    """
    output, status = self._nvprof(cmd, profile_from_start)
    if status == 0:
        ok, gpu_time = self._parse_logs(output.split("\n"))
        if ok:
            return gpu_time
    print("Running Error:\n {}".format(output))
    return 0.0

def _nvprof(self, cmd):
return system.run_command("nvprof --profile-from-start off {}".format(
cmd))
def _nvprof(self, cmd, profile_from_start):
    """Run cmd under nvprof and return (stdout, exit_code).

    When profile_from_start is False, "--profile-from-start off" defers
    data collection until the workload enables the profiler itself.
    """
    prefix = "nvprof" if profile_from_start else "nvprof --profile-from-start off"
    return system.run_command("{} {}".format(prefix, cmd))

def _parse_logs(self, logs):
line_from = None
Expand All @@ -58,51 +116,58 @@ def _parse_logs(self, logs):
line_to = i
break
if line_from is not None and line_to is not None:
time_unit = TimeUnit()
for i in range(line_from, line_to):
print(logs[i])
if i >= line_from + 1:
begin_pos = 2 if i == line_from + 1 else 0
gpu_time, percent, function = self._parse_line(logs[i],
begin_pos)
time_unit.add_info(gpu_time, function)
print("")
return True, self._parse_gpu_time(logs[line_from + 1])
print(time_unit)
return True, time_unit.total()
else:
return False, 0.0

def _parse_gpu_time(self, line):
infos = line.strip().split()
percent = float(infos[2].replace("%", "")) * 0.01
gpu_time = infos[3]
if gpu_time.endswith("us"):
gpu_time = float(gpu_time.replace("us", "")) * 0.001
elif gpu_time.endswith("ms"):
gpu_time = float(gpu_time.replace("ms", ""))
elif gpu_time.endswith("s"):
gpu_time = float(gpu_time.replace("s", "")) * 1000
def _to_millisecond(self, timestr):
if timestr.endswith("us"):
return float(timestr.replace("us", "")) * 0.001
elif timestr.endswith("ms"):
return float(timestr.replace("ms", ""))
elif timestr.endswith("s"):
return float(timestr.replace("s", "")) * 1000
else:
raise ValueError("Invalid time: %s" % gpu_time)
calls = int(infos[4])
function = infos[8]
for i in range(9, len(infos)):
function = function + " " + infos[i]
#print("percent: %.2f; gpu_time: %.4f ms; calls: %d; function: %s" %
# (percent, gpu_time, calls, function))

total_gpu_time = gpu_time / percent
print("total gpu_time: %.4f ms" % total_gpu_time)
print("")
return total_gpu_time
def _parse_line(self, line, begin_pos=0):
infos = line.strip().split()
percent = float(infos[begin_pos].replace("%", "")) * 0.01
gpu_time = self._to_millisecond(infos[begin_pos + 1])
calls = int(infos[begin_pos + 2])
function = infos[begin_pos + 6]
for i in range(begin_pos + 7, len(infos)):
function = function + " " + infos[i]
return gpu_time, percent, function


class NsightRunner(object):
def run(self, cmd):
stdout, exit_code = self._nsight(cmd)
def run(self, cmd, profile_from_start=False):
    """Profile cmd with nsys and return the parsed total gpu time (ms).

    Returns 0.0 and prints the profiler output when the command fails or
    its output cannot be parsed.
    """
    output, status = self._nsight(cmd, profile_from_start)
    if status == 0:
        ok, gpu_time = self._parse_logs(output.split("\n"))
        if ok:
            return gpu_time
    print("Running Error:\n {}".format(output))
    return 0.0

def _nsight(self, cmd):
return system.run_command(
"nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(cmd))
def _nsight(self, cmd, profile_from_start):
    """Run cmd under "nsys nvprof" and return (stdout, exit_code).

    Output is written to tmp.qdrep. When profile_from_start is False,
    "--profile-from-start=off" defers collection until the workload
    enables the profiler itself.
    """
    if profile_from_start:
        prefix = "nsys nvprof"
    else:
        prefix = "nsys nvprof --profile-from-start=off"
    return system.run_command("{} -o tmp.qdrep {}".format(prefix, cmd))

def _parse_logs(self, logs):
kernel_line_from = None
Expand Down Expand Up @@ -362,7 +427,8 @@ def launch(benchmark_script,
task="speed",
repeat=1,
sync_interval=80,
with_nvprof=False):
with_nvprof=False,
profile_from_start=True):
"""
If with_nvprof is True, it will launch the following command firstly to
get the gpu_time:
Expand All @@ -371,7 +437,8 @@ def launch(benchmark_script,
Then the normal testing command will be launched:
python benchmark_script benchmark_script_args
"""
if with_nvprof:

if with_nvprof and not profile_from_start:
if task == "speed":
_set_args(benchmark_script_args, "--profiler", "nvprof")
elif task == "scheduling":
Expand All @@ -384,7 +451,7 @@ def launch(benchmark_script,
runner = NsightRunner()
else:
runner = NvprofRunner()
gpu_time = runner.run(cmd)
gpu_time = runner.run(cmd, profile_from_start)
_set_args(benchmark_script_args, "--profiler", "none")
return gpu_time
elif task == "scheduling":
Expand Down Expand Up @@ -438,6 +505,7 @@ def _set_args(args, arg, value):
args = parser.parse_args()
benchmark_args_dict = _args_list_to_dict(args.benchmark_script_args)
task = benchmark_args_dict.get("task", "speed")
framework = benchmark_args_dict.get("framework", "paddle")
use_gpu = system.str2bool(benchmark_args_dict.get(
"use_gpu", "False")) and os.environ.get("CUDA_VISIBLE_DEVICES",
None) != ""
Expand All @@ -448,13 +516,15 @@ def _set_args(args, arg, value):
system.check_commit()

if use_gpu and task in ["speed", "scheduling"] and profiler == "none":
profile_from_start = False
output_time = launch(
args.benchmark_script,
args.benchmark_script_args,
task,
repeat,
sync_interval,
with_nvprof=True)
with_nvprof=True,
profile_from_start=profile_from_start)
if task == "speed":
args.benchmark_script_args.append(" --gpu_time ")
args.benchmark_script_args.append(str(output_time))
Expand Down
1 change: 0 additions & 1 deletion api/common/special_op_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
"arange",
"argmax",
"argmin",
"argsort",
"assign",
"cast",
"clip_by_norm",
Expand Down
26 changes: 16 additions & 10 deletions api/common/tensorflow_op_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
from common import special_op_list
from common.benchmark import BenchmarkBase

from . import env
from . import utils
from . import api_param
from . import feeder

try:
import tensorflow as tf

from tensorflow.python.profiler import model_analyzer
from tensorflow.python.profiler import option_builder
from tensorflow.core.protobuf import config_pb2
Expand Down Expand Up @@ -55,9 +57,9 @@ def __enter__(self):
import cProfile
self._profiler_handle = cProfile.Profile()
self._profiler_handle.enable()
elif self.profiler != "none":
elif self.profiler == "native":
self._profiler_handle = model_analyzer.Profiler(
graph=self.sess.graph)
graph=self._sess.graph)
self.run_options = tf.compat.v1.RunOptions(
trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
self.run_metadata = tf.compat.v1.RunMetadata()
Expand Down Expand Up @@ -247,8 +249,10 @@ def generate_random_feeder(self,
assert use_feed_fetch, "Argument use_feed_fetch must be True when feeder_adapter is initialized by paddle."

if feeder_adapter is None or feeder_adapter.framework != "tensorflow":
self._need_feed = config.name == "feed"
self._need_fetch = use_feed_fetch or config.name == "fetch"
self._need_feed = env.benchmark_need_feed(
) or config.name == "feed"
self._need_fetch = env.benchmark_need_fetch(
) or use_feed_fetch or config.name == "fetch"
self._feed_spec = feeder.copy_feed_spec(config.feed_spec)
self._feed_dict = {}

Expand Down Expand Up @@ -294,12 +298,14 @@ def run(self, config, args, use_feed_fetch=True, feeder_adapter=None):
self.fetch_list = fetch_list

self.allow_growth = False if args.task == "speed" else True
outputs, stats = self.run_impl(
use_gpu=args.use_gpu,
config=config,
feed=feed,
repeat=args.repeat,
profiler=args.profiler)
device = "GPU:0" if args.use_gpu else "CPU"
with tf.device(device):
outputs, stats = self.run_impl(
use_gpu=args.use_gpu,
config=config,
feed=feed,
repeat=args.repeat,
profiler=args.profiler)
return outputs, stats

def _init_session(self, use_gpu):
Expand Down
14 changes: 11 additions & 3 deletions api/deploy/collect_api_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,16 @@ def collect_subclass_dict(test_cases_dict):


def import_all_tests(test_module_name):
def _is_special_module(api_name):
special_module_list = [
"__init__", "common_import", "test_main", "fused_"
]
for name in special_module_list:
if name in api_name:
return True
return False

test_cases_dict = {}
special_module_list = ["__init__", "common_import", "test_main"]

def _import_api(test_module_name, basename):
try:
Expand All @@ -68,7 +76,7 @@ def _import_api(test_module_name, basename):
for filename in sorted(os.listdir(tests_path)):
api_name = os.path.splitext(filename)[0]
file_extension = os.path.splitext(filename)[1]
if file_extension == '.py' and api_name not in special_module_list:
if file_extension == '.py' and not _is_special_module(api_name):
module = _import_api(test_module_name, api_name)
if module:
test_cases_dict[api_name] = module
Expand Down Expand Up @@ -134,7 +142,7 @@ def main(args):
parser.add_argument(
'--test_module_name',
type=str,
default="tests",
default="tests_v2",
help='The module_name under benchmark/api (tests|tests_v2|dynamic_tests_v2).'
)
parser.add_argument(
Expand Down
Loading