Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9801241
Update the supported tf version to 2.7.0 and allow precise profiling.
Xreki Jan 4, 2022
6702a92
Remove the support of tf under 1.15.0.
Xreki Jan 4, 2022
f4ed56f
Allow controlling the benchmark through environment variables.
Xreki Jan 4, 2022
6ec9b0f
Remove the setting of optimizer options and fix showing bugs when writing…
Xreki Jan 5, 2022
fac27de
Remove argsort from no_backward_ops list.
Xreki Jan 5, 2022
f457a23
Merge branch 'master' into api/enhance_tf
Xreki Jan 5, 2022
aac1074
Change the approval github ids.
Xreki Jan 6, 2022
3dd70d5
Merge branch 'master' into api/enhance_tf
Xreki Jan 7, 2022
cf162b2
Exclude the DtoH time when needs fetch.
Xreki Jan 7, 2022
5e61144
Merge branch 'master' into api/enhance_tf
Xreki Jan 17, 2022
ea9822c
Merge branch 'master' into api/enhance_tf
Xreki Jan 17, 2022
685bf70
Add fused_batch_norm_relu scripts.
Xreki Jan 18, 2022
3fdc3a9
Merge branch 'master' into api/enhance_tf
Xreki Jan 18, 2022
4438014
Does not test the fused_xxx ops in ci.
Xreki Jan 19, 2022
8923bcc
Merge branch 'master' into api/enhance_tf
Xreki Jan 20, 2022
f477069
Add fused_batch_norm_add_relu and fix a bug.
Xreki Jan 21, 2022
02cd833
Merge branch 'master' into api/enhance_tf
Xreki Jan 23, 2022
93506e5
Merge branch 'master' into api/enhance_tf
Xreki Feb 7, 2022
7941213
Merge branch 'master' into api/enhance_tf
Xreki Feb 20, 2022
ec29e72
Update tf version to 2.8.0.
Xreki Feb 21, 2022
968c892
Merge branch 'master' into api/enhance_tf
Xreki Feb 22, 2022
6678cfe
Change copyright.
Xreki Mar 2, 2022
1eaa9c3
Merge branch 'master' into api/enhance_tf
Xreki Mar 2, 2022
3345fe2
Merge branch 'master' into api/enhance_tf
Xreki Mar 11, 2022
af45550
Merge branch 'master' into api/enhance_tf
Xreki Mar 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions api/common/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os


def benchmark_need_feed():
    """Return True when the BENCHMARK_NEED_FEED environment variable is set
    to a truthy value ("1", "true" or "yes", case-insensitive).

    The original implementation returned ``os.environ.get(name, False)``,
    i.e. the raw string value, so even ``BENCHMARK_NEED_FEED=0`` or
    ``BENCHMARK_NEED_FEED=False`` counted as enabled.
    """
    return os.environ.get("BENCHMARK_NEED_FEED", "").lower() in ("1", "true",
                                                                 "yes")


def benchmark_need_fetch():
    """Return True when the BENCHMARK_NEED_FETCH environment variable is set
    to a truthy value ("1", "true" or "yes", case-insensitive).

    The original implementation returned ``os.environ.get(name, False)``,
    i.e. the raw string value, so even ``BENCHMARK_NEED_FETCH=0`` or
    ``BENCHMARK_NEED_FETCH=False`` counted as enabled.
    """
    return os.environ.get("BENCHMARK_NEED_FETCH", "").lower() in ("1", "true",
                                                                  "yes")
140 changes: 105 additions & 35 deletions api/common/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import sys
import argparse

from common import env
from common import system
from common import api_param

Expand All @@ -33,19 +34,76 @@ def is_ampere_gpu():
return False


class TimeUnit(object):
    """Accumulator for GPU time parsed from a profiler summary.

    Time is split into kernel execution time and memory-operation time
    (HtoD / DtoH / DtoD copies and memset). When fetching results is
    enabled (env.benchmark_need_fetch()), DtoH time is excluded from the
    total because that copy is the fetch itself.
    All times are in milliseconds.
    """

    def __init__(self):
        self.kernel_time = 0.0
        self.memory_time = 0.0
        self.memcpy_h2d = 0.0
        self.memcpy_d2h = 0.0
        self.memcpy_d2d = 0.0
        self.memset = 0.0

    def total(self):
        """Return total gpu time (ms); recomputes memory_time from parts."""
        self.memory_time = self.memcpy_h2d + self.memcpy_d2d + self.memset
        if not env.benchmark_need_fetch():
            # Normally DtoH is fetching results.
            self.memory_time += self.memcpy_d2h
        return self.kernel_time + self.memory_time

    def __str__(self):
        total_time = self.total()
        if env.benchmark_need_fetch():
            infostr = "total gpu_time (exclude DtoH): {:.4f} ms ".format(
                total_time)
        else:
            infostr = "total gpu_time: {:.4f} ms ".format(total_time)
        if total_time > 0.0:
            infostr += "(kernel: {:.4f} ms ({:.2f}%); memory: {:.4f} ms ({:.2f}%))".format(
                self.kernel_time, self.kernel_time * 100 / total_time,
                self.memory_time, self.memory_time * 100 / total_time)
        else:
            infostr += "(kernel: {:.4f} ms; memory: {:.4f} ms)".format(
                self.kernel_time, self.memory_time)
        infostr += "\n"
        return infostr

    def add_info(self, time, name):
        """Classify one profiler row by its function name and accumulate."""
        if name == "[CUDA memcpy HtoD]":
            self._update_memory_time("memcpy_h2d", time)
        elif name == "[CUDA memcpy DtoH]":
            self._update_memory_time("memcpy_d2h", time)
        elif name == "[CUDA memcpy DtoD]":
            self._update_memory_time("memcpy_d2d", time)
        elif name == "[CUDA memset]":
            self._update_memory_time("memset", time)
        else:
            self.kernel_time += time

    def _update_memory_time(self, member_name, time):
        assert member_name in [
            "memcpy_h2d", "memcpy_d2h", "memcpy_d2d", "memset"
        ]
        # Accumulate instead of overwriting: the original used
        # setattr(self, member_name, time), which silently dropped earlier
        # entries if the same memcpy kind appeared more than once.
        setattr(self, member_name, getattr(self, member_name) + time)
        if member_name != "memcpy_d2h" or not env.benchmark_need_fetch():
            self.memory_time += time


class NvprofRunner(object):
def run(self, cmd):
stdout, exit_code = self._nvprof(cmd)
def run(self, cmd, profile_from_start=False):
    """Profile cmd with nvprof and return the parsed total gpu time (ms).

    Returns 0.0 and prints the profiler output when the command fails or
    its output cannot be parsed.
    """
    output, status = self._nvprof(cmd, profile_from_start)
    if status == 0:
        ok, gpu_time = self._parse_logs(output.split("\n"))
        if ok:
            return gpu_time
    print("Running Error:\n {}".format(output))
    return 0.0

def _nvprof(self, cmd):
return system.run_command("nvprof --profile-from-start off {}".format(
cmd))
def _nvprof(self, cmd, profile_from_start):
    """Run cmd under nvprof and return (stdout, exit_code).

    When profile_from_start is False, "--profile-from-start off" defers
    data collection until the workload enables the profiler itself.
    """
    prefix = "nvprof" if profile_from_start else "nvprof --profile-from-start off"
    return system.run_command("{} {}".format(prefix, cmd))

def _parse_logs(self, logs):
line_from = None
Expand All @@ -58,51 +116,58 @@ def _parse_logs(self, logs):
line_to = i
break
if line_from is not None and line_to is not None:
time_unit = TimeUnit()
for i in range(line_from, line_to):
print(logs[i])
if i >= line_from + 1:
begin_pos = 2 if i == line_from + 1 else 0
gpu_time, percent, function = self._parse_line(logs[i],
begin_pos)
time_unit.add_info(gpu_time, function)
print("")
return True, self._parse_gpu_time(logs[line_from + 1])
print(time_unit)
return True, time_unit.total()
else:
return False, 0.0

def _parse_gpu_time(self, line):
infos = line.strip().split()
percent = float(infos[2].replace("%", "")) * 0.01
gpu_time = infos[3]
if gpu_time.endswith("us"):
gpu_time = float(gpu_time.replace("us", "")) * 0.001
elif gpu_time.endswith("ms"):
gpu_time = float(gpu_time.replace("ms", ""))
elif gpu_time.endswith("s"):
gpu_time = float(gpu_time.replace("s", "")) * 1000
def _to_millisecond(self, timestr):
if timestr.endswith("us"):
return float(timestr.replace("us", "")) * 0.001
elif timestr.endswith("ms"):
return float(timestr.replace("ms", ""))
elif timestr.endswith("s"):
return float(timestr.replace("s", "")) * 1000
else:
raise ValueError("Invalid time: %s" % gpu_time)
calls = int(infos[4])
function = infos[8]
for i in range(9, len(infos)):
function = function + " " + infos[i]
#print("percent: %.2f; gpu_time: %.4f ms; calls: %d; function: %s" %
# (percent, gpu_time, calls, function))

total_gpu_time = gpu_time / percent
print("total gpu_time: %.4f ms" % total_gpu_time)
print("")
return total_gpu_time
def _parse_line(self, line, begin_pos=0):
infos = line.strip().split()
percent = float(infos[begin_pos].replace("%", "")) * 0.01
gpu_time = self._to_millisecond(infos[begin_pos + 1])
calls = int(infos[begin_pos + 2])
function = infos[begin_pos + 6]
for i in range(begin_pos + 7, len(infos)):
function = function + " " + infos[i]
return gpu_time, percent, function


class NsightRunner(object):
def run(self, cmd):
stdout, exit_code = self._nsight(cmd)
def run(self, cmd, profile_from_start=False):
    """Profile cmd with nsys and return the parsed total gpu time (ms).

    Returns 0.0 and prints the profiler output when the command fails or
    its output cannot be parsed.
    """
    output, status = self._nsight(cmd, profile_from_start)
    if status == 0:
        ok, gpu_time = self._parse_logs(output.split("\n"))
        if ok:
            return gpu_time
    print("Running Error:\n {}".format(output))
    return 0.0

def _nsight(self, cmd):
return system.run_command(
"nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(cmd))
def _nsight(self, cmd, profile_from_start):
    """Run cmd under "nsys nvprof" and return (stdout, exit_code).

    Output is written to tmp.qdrep. When profile_from_start is False,
    "--profile-from-start=off" defers collection until the workload
    enables the profiler itself.
    """
    if profile_from_start:
        prefix = "nsys nvprof"
    else:
        prefix = "nsys nvprof --profile-from-start=off"
    return system.run_command("{} -o tmp.qdrep {}".format(prefix, cmd))

def _parse_logs(self, logs):
kernel_line_from = None
Expand Down Expand Up @@ -362,7 +427,8 @@ def launch(benchmark_script,
task="speed",
repeat=1,
sync_interval=80,
with_nvprof=False):
with_nvprof=False,
profile_from_start=True):
"""
If with_nvprof is True, it will launch the following command firstly to
get the gpu_time:
Expand All @@ -371,7 +437,8 @@ def launch(benchmark_script,
Then the normal testing command will be launched:
python benchmark_script benchmark_script_args
"""
if with_nvprof:

if with_nvprof and not profile_from_start:
if task == "speed":
_set_args(benchmark_script_args, "--profiler", "nvprof")
elif task == "scheduling":
Expand All @@ -384,7 +451,7 @@ def launch(benchmark_script,
runner = NsightRunner()
else:
runner = NvprofRunner()
gpu_time = runner.run(cmd)
gpu_time = runner.run(cmd, profile_from_start)
_set_args(benchmark_script_args, "--profiler", "none")
return gpu_time
elif task == "scheduling":
Expand Down Expand Up @@ -438,6 +505,7 @@ def _set_args(args, arg, value):
args = parser.parse_args()
benchmark_args_dict = _args_list_to_dict(args.benchmark_script_args)
task = benchmark_args_dict.get("task", "speed")
framework = benchmark_args_dict.get("framework", "paddle")
use_gpu = system.str2bool(benchmark_args_dict.get(
"use_gpu", "False")) and os.environ.get("CUDA_VISIBLE_DEVICES",
None) != ""
Expand All @@ -448,13 +516,15 @@ def _set_args(args, arg, value):
system.check_commit()

if use_gpu and task in ["speed", "scheduling"] and profiler == "none":
profile_from_start = False
output_time = launch(
args.benchmark_script,
args.benchmark_script_args,
task,
repeat,
sync_interval,
with_nvprof=True)
with_nvprof=True,
profile_from_start=profile_from_start)
if task == "speed":
args.benchmark_script_args.append(" --gpu_time ")
args.benchmark_script_args.append(str(output_time))
Expand Down
1 change: 0 additions & 1 deletion api/common/special_op_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
"arange",
"argmax",
"argmin",
"argsort",
"assign",
"cast",
"clip_by_norm",
Expand Down
26 changes: 16 additions & 10 deletions api/common/tensorflow_op_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
from common import special_op_list
from common.benchmark import BenchmarkBase

from . import env
from . import utils
from . import api_param
from . import feeder

try:
import tensorflow as tf

from tensorflow.python.profiler import model_analyzer
from tensorflow.python.profiler import option_builder
from tensorflow.core.protobuf import config_pb2
Expand Down Expand Up @@ -55,9 +57,9 @@ def __enter__(self):
import cProfile
self._profiler_handle = cProfile.Profile()
self._profiler_handle.enable()
elif self.profiler != "none":
elif self.profiler == "native":
self._profiler_handle = model_analyzer.Profiler(
graph=self.sess.graph)
graph=self._sess.graph)
self.run_options = tf.compat.v1.RunOptions(
trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
self.run_metadata = tf.compat.v1.RunMetadata()
Expand Down Expand Up @@ -247,8 +249,10 @@ def generate_random_feeder(self,
assert use_feed_fetch, "Argument use_feed_fetch must be True when feeder_adapter is initialized by paddle."

if feeder_adapter is None or feeder_adapter.framework != "tensorflow":
self._need_feed = config.name == "feed"
self._need_fetch = use_feed_fetch or config.name == "fetch"
self._need_feed = env.benchmark_need_feed(
) or config.name == "feed"
self._need_fetch = env.benchmark_need_fetch(
) or use_feed_fetch or config.name == "fetch"
self._feed_spec = feeder.copy_feed_spec(config.feed_spec)
self._feed_dict = {}

Expand Down Expand Up @@ -294,12 +298,14 @@ def run(self, config, args, use_feed_fetch=True, feeder_adapter=None):
self.fetch_list = fetch_list

self.allow_growth = False if args.task == "speed" else True
outputs, stats = self.run_impl(
use_gpu=args.use_gpu,
config=config,
feed=feed,
repeat=args.repeat,
profiler=args.profiler)
device = "GPU:0" if args.use_gpu else "CPU"
with tf.device(device):
outputs, stats = self.run_impl(
use_gpu=args.use_gpu,
config=config,
feed=feed,
repeat=args.repeat,
profiler=args.profiler)
return outputs, stats

def _init_session(self, use_gpu):
Expand Down
14 changes: 11 additions & 3 deletions api/deploy/collect_api_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,16 @@ def collect_subclass_dict(test_cases_dict):


def import_all_tests(test_module_name):
def _is_special_module(api_name):
special_module_list = [
"__init__", "common_import", "test_main", "fused_"
]
for name in special_module_list:
if name in api_name:
return True
return False

test_cases_dict = {}
special_module_list = ["__init__", "common_import", "test_main"]

def _import_api(test_module_name, basename):
try:
Expand All @@ -68,7 +76,7 @@ def _import_api(test_module_name, basename):
for filename in sorted(os.listdir(tests_path)):
api_name = os.path.splitext(filename)[0]
file_extension = os.path.splitext(filename)[1]
if file_extension == '.py' and api_name not in special_module_list:
if file_extension == '.py' and not _is_special_module(api_name):
module = _import_api(test_module_name, api_name)
if module:
test_cases_dict[api_name] = module
Expand Down Expand Up @@ -134,7 +142,7 @@ def main(args):
parser.add_argument(
'--test_module_name',
type=str,
default="tests",
default="tests_v2",
help='The module_name under benchmark/api (tests|tests_v2|dynamic_tests_v2).'
)
parser.add_argument(
Expand Down
Loading