# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
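"""Load a serialized MegEngine network and run inference on random or
user-provided input data.

Example invocation (model path and input shape are hypothetical)::

    python3 load_network_and_run.py ./model.mge --fast-run --iter 10 \
        --input-desc "data:16,3,224,224" --calc-output-rms
"""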
import argparse
import logging
import time
from collections import OrderedDict

import numpy as np

import megengine as mge
from megengine.core.tensor import megbrain_graph as G
from megengine.device import get_device_count, set_default_device
from megengine.functional.debug_param import set_execution_strategy
from megengine.logger import enable_debug_log, get_logger, set_log_file
from megengine.utils import comp_graph_tools as tools

logger = get_logger(__name__)


def make_data_given_desc(args, inputs, shape0_multiply=1):
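    """Prepare one batch of input data for every graph input in ``inputs``.

    If ``--load-input-data`` is given, data is loaded from the pickle file
    and tiled up to ``--batchsize``; otherwise random data is generated from
    each input's shape and dtype, with shapes overridable through
    ``--input-desc``/``--batchsize`` and values through per-input ``--rng``
    expressions. ``shape0_multiply`` scales the batch dimension of generated
    data.
    """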
    if args.load_input_data:
        logger.info("load data from {}".format(args.load_input_data))
        data = mge.load(args.load_input_data)
        data_names = [inp.name for inp in inputs]

        # a bare ndarray is unambiguous only for a single-input graph
        if isinstance(data, np.ndarray):
            assert len(data_names) == 1, (
                "data is given as a single numpy array, so there should be "
                "exactly one input in the graph; got: {}".format(data_names)
            )
            data = {data_names[0]: data}

        assert isinstance(data, dict)
        for v in data.values():
            assert isinstance(
                v, np.ndarray
            ), "data should provide ndarray; got {} instead".format(v)

        # tile the loaded batch along axis 0 up to the requested batch size
        if args.batchsize:
            for k, v in list(data.items()):
                assert (
                    args.batchsize % v.shape[0] == 0
                ), "batch size of loaded data ({}) must divide the requested batch size ({})".format(
                    v.shape[0], args.batchsize
                )
                data[k] = np.repeat(v, args.batchsize // v.shape[0], axis=0)
        return data

    def iter_inpdesc(desc):
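        """Yield ``(name, value)`` pairs from a ``<name>:<value>[;...]``
        descriptor string, warning about names unknown to the graph.
        """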
        if not desc:
            return
        for pair in desc.split(";"):
            name, value = pair.split(":")
            if name not in data_shapes:
                logger.warning("rng name {} not in data provider".format(name))
            yield name, value

    rng = np.random.RandomState(args.seed)

    data_shapes = OrderedDict((inp.name, list(inp.shape)) for inp in inputs)
    data_dtypes = OrderedDict((inp.name, inp.dtype) for inp in inputs)

    for name, shape in iter_inpdesc(args.input_desc):
        data_shapes[name] = list(map(int, shape.split(",")))

    if args.batchsize:
        for i in data_shapes.values():
            i[0] = args.batchsize

    data_rngs = dict(iter_inpdesc(args.rng))

    result = OrderedDict()
    for name, shape in data_shapes.items():
        shape[0] *= shape0_multiply
        rng_expr = data_rngs.get(name)
        if rng_expr:
            # evaluate a user expression like "randint(low=0,high=1000,size={})"
            # against the seeded RandomState, with {} filled by the shape
            value = eval("rng.{}".format(rng_expr).format(shape), {"rng": rng})
        else:
            value = rng.uniform(size=shape)

        value = np.ascontiguousarray(value, dtype=data_dtypes[name])
        assert value.shape == tuple(shape)
        result[name] = value

    return result


def get_execution_strategy(args):
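    """Return the algo strategy name implied by the command line flags.

    ``--fast-run`` selects "PROFILE" (profile conv algorithms at compile
    time) instead of the default "HEURISTIC"; ``--reproducible`` appends
    "_REPRODUCIBLE" to restrict the choice to reproducible kernels.
    """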
    if not args.fast_run:
        logger.warning("--fast-run not enabled; execution may be slow")
        strategy = "HEURISTIC"
    else:
        logger.warning("--fast-run enabled; compile may be slow")
        strategy = "PROFILE"
    if args.reproducible:
        strategy += "_REPRODUCIBLE"
    return strategy


def get_opt_kwargs(args):
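    """Gather the ``--enable-*`` flags that are set into keyword arguments
    for ``G.optimize_for_inference``.
    """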
    args_list = [
        "enable_io16xc32",
        "enable_ioc16",
        "enable_hwcd4",
        "enable_nchw4",
        "enable_nchw88",
        "enable_nchw44",
        "enable_nchw44_dot",
        "enable_nchw32",
        "enable_chwn4",
        "enable_fuse_conv_bias_nonlinearity",
        "enable_fuse_conv_bias_with_z",
    ]
    kwargs = {}
    for k in args_list:
        if getattr(args, k):
            kwargs[k] = True
    return kwargs


def run_model(args, graph, inputs, outputs, data):
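    """Apply the requested optimizations, compile ``outputs`` and time
    ``args.iter`` runs; returns the final cumulative average speed.
    """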
    # must use level0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0

    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info("  {}: {}".format(k, v.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # inputs must be embedded last, to avoid them being const-folded away
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)

    if args.profile:
        profiler = tools.GraphProfiler(graph)

    func = graph.compile(outputs)

    def run():
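        """Feed the input data (unless it is embedded in the graph) and
        execute the compiled function once, returning all outputs as
        numpy arrays.
        """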
        if not args.embed_input:
            # copy host data into the graph's input placeholders
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        logger.info("warming up")
        run()

    total_time = 0

    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time

        avg_speed = (i + 1) / total_time
        if "data" in data:
            # an input named "data" lets us report per-sample throughput
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = []
            for v in retval:
                rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5)))
            msg += " output_rms=[{}]".format(", ".join(rms))
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "no cuda device detected; --focused-nvprof will be ignored"
            )
        else:
            try:
                import pycuda.driver as D

                # profile exactly one extra run, bracketed by the CUDA profiler
                # start/stop calls for `nvprof --profile-from-start off`
                D.start_profiler()
                func.execute()
                func.wait()
                D.stop_profiler()
            except ImportError:
                logger.error("--focused-nvprof requires pycuda", exc_info=True)

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed


def main():
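    """Parse command line arguments, then load and run the network."""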
    parser = argparse.ArgumentParser(
        description="load a network and run inference on random data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("net")
    parser.add_argument(
        "--device", "-d", help="set default device, like 'gpux' or 'cpux'"
    )
    parser.add_argument(
        "--calc-output-rms",
        action="store_true",
        help="compute RMS of outputs; useful for comparing computing results",
    )
    parser.add_argument(
        "--output-name",
        nargs="*",
        help="specify output names; multiple names may be given,"
        " and we will look for the corresponding opr/var in the graph",
    )
    parser.add_argument(
        "--load-input-data",
        help="load input data from pickle file; it should be"
        " a numpy array or a dict of numpy arrays",
    )
    parser.add_argument("--profile", help="profiler output file")
    parser.add_argument(
        "--fast-run",
        action="store_true",
        help="enable fast running by profiling conv algorithms during compiling",
    )
    parser.add_argument(
        "--reproducible", action="store_true", help="use reproducible kernels"
    )
    parser.add_argument(
        "--input-desc",
        help="specify input names and shapes manually, in"
        " the format <name>:<shape>[;<name>:<shape>...], where"
        " name is a string and shape is a comma-separated"
        ' string, e.g. "data:128,1,28,28;label:128";'
        " different input tensors are separated by semicolons",
    )
    parser.add_argument(
        "--batchsize",
        type=int,
        help="change batchsize; the first dimension of each"
        " input is assumed to be batch size",
    )
    parser.add_argument(
        "--warm-up",
        action="store_true",
        help="warm up model before timing for better estimation",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="verbose output, logging in debug mode",
    )
    parser.add_argument(
        "--iter", type=int, default=1, help="number of iters to run the model"
    )
    parser.add_argument("--log", help="give a file path to duplicate log to")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="seed for random number generator for input data",
    )
    parser.add_argument(
        "--rng",
        help="special RNG options to generate input data in"
        " format: <name>:func[;<name>:func...] where name is"
        " a string and func is a python expression containing"
        ' "{}" for the size param, e.g. '
        ' "label:randint(low=0,high=1000,size={})"',
    )
    parser.add_argument(
        "--focused-nvprof",
        action="store_true",
        help="only profile last iter for `nvprof --profile-from-start off`",
    )
    parser.add_argument(
        "--optimize-for-inference",
        action="store_true",
        help="optimize model for inference",
    )
    parser.add_argument(
        "--enable-io16xc32",
        action="store_true",
        help="transform the model to use float16 io and float32 compute",
    )
    parser.add_argument(
        "--enable-ioc16",
        action="store_true",
        help="transform the dtype of the model to float16 io and compute",
    )
    parser.add_argument(
        "--enable-hwcd4",
        action="store_true",
        help="transform the model format from NCHW to NHWCD4 for inference",
    )
    parser.add_argument(
        "--enable-nchw4",
        action="store_true",
        help="transform the model format from NCHW to NCHW4 for inference",
    )
    parser.add_argument(
        "--enable-nchw88",
        action="store_true",
        help="transform the model format from NCHW to NCHW88 for inference",
    )
    parser.add_argument(
        "--enable-nchw44",
        action="store_true",
        help="transform the model format from NCHW to NCHW44 for inference",
    )
    parser.add_argument(
        "--enable-nchw44-dot",
        action="store_true",
        help="transform the model format from NCHW to NCHW44_DOT"
        " for optimizing armv8.2 dot in inference",
    )
    parser.add_argument(
        "--enable-chwn4",
        action="store_true",
        help="transform the model format to CHWN4"
        " for inference, mainly used for nvidia tensorcore",
    )
    parser.add_argument(
        "--enable-nchw32",
        action="store_true",
        help="transform the model format from NCHW4 to NCHW32"
        " for inference on nvidia TensorCore",
    )
    parser.add_argument(
        "--enable-fuse-conv-bias-nonlinearity",
        action="store_true",
        help="fuse convolution, bias and nonlinearity oprs into one"
        " conv_bias opr",
    )
    parser.add_argument(
        "--enable-fuse-conv-bias-with-z",
        action="store_true",
        help="fuse conv_bias with z input for inference on"
        " nvidia GPU (this optimization pass will cause the output"
        " precision of training and inference to mismatch)",
    )
    parser.add_argument(
        "--dump-cpp-model",
        help="write a C++ model that can be loaded by"
        " megbrain/sdk/load-and-run; this implies --embed-input",
    )
    parser.add_argument(
        "--embed-input",
        action="store_true",
        help="embed input data as SharedDeviceTensor in model,"
        " to remove memory copy for inputs",
    )
    args = parser.parse_args()

    if args.verbose:
        enable_debug_log()
    if args.log:
        set_log_file(args.log)

    if args.device:
        set_default_device(args.device)

    if args.dump_cpp_model:
        args.embed_input = True

    logger.info("loading model ...")
    graph, _, output_vars = G.load_graph(args.net)
    input_vars = tools.get_dep_vars(output_vars, "Host2DeviceCopy")

    if args.output_name is not None:
        output_vars = tools.find_vars_by_name(output_vars, args.output_name)

    data = make_data_given_desc(args, input_vars)

    run_model(args, graph, input_vars, output_vars, data)


if __name__ == "__main__":
    main()