
load_network_and_run.py

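"""Load a serialized MegEngine graph and run inference on synthesized or
user-provided input data, with per-iteration timing.

A hypothetical invocation (the model path is illustrative; see the flags
defined in main() below):

    python3 load_network_and_run.py model.mge --fast-run --iter 10
"""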
import argparse
import logging
import time
from collections import OrderedDict

import numpy as np

import megengine as mge
from megengine.core.ops import custom
from megengine.core.tensor import megbrain_graph as G
from megengine.device import get_device_count, set_default_device
from megengine.functional.debug_param import set_execution_strategy
from megengine.logger import enable_debug_log, get_logger, set_log_file
from megengine.utils import comp_graph_tools as tools

logger = get_logger(__name__)


def make_data_given_desc(args, inputs, shape0_multiply=1):
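    """Build a dict mapping each graph input name to a numpy array, either
    loaded via --load-input-data or synthesized from the inputs' shapes and
    dtypes (honoring --input-desc, --batchsize and --rng)."""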
    if args.load_input_data:
        logger.info("load data from {}".format(args.load_input_data))
        data = mge.load(args.load_input_data)
        data_names = [inp.name for inp in inputs]

        if isinstance(data, np.ndarray):
            assert len(data_names) == 1, (
                "data is given as a single numpy array, so there should be "
                "exactly one input in the graph; got: {}".format(data_names)
            )
            data = {data_names[0]: data}

        assert isinstance(data, dict)
        for v in data.values():
            assert isinstance(
                v, np.ndarray
            ), "data should provide ndarray; got {} instead".format(v)

        if args.batchsize:
            for k, v in list(data.items()):
                assert (
                    args.batchsize % v.shape[0] == 0
                ), "current batch size must divide given batch size: {} {}".format(
                    args.batchsize, v.shape[0]
                )
                data[k] = np.repeat(v, args.batchsize // v.shape[0], axis=0)

        return data
    def iter_inpdesc(desc):
        if not desc:
            return
        for pair in desc.split(";"):
            name, value = pair.split(":")
            if name not in data_shapes:
                logger.warning("rng name {} not in data provider".format(name))
            yield name, value

    rng = np.random.RandomState(args.seed)

    data_shapes = OrderedDict((inp.name, list(inp.shape)) for inp in inputs)
    data_dtypes = OrderedDict((inp.name, inp.dtype) for inp in inputs)

    for name, shape in iter_inpdesc(args.input_desc):
        data_shapes[name] = list(map(int, shape.split(",")))

    if args.batchsize:
        for i in data_shapes.values():
            i[0] = args.batchsize

    data_rngs = dict(iter_inpdesc(args.rng))

    result = OrderedDict()
    for name, shape in data_shapes.items():
        shape[0] *= shape0_multiply
        rng_expr = data_rngs.get(name)
        if rng_expr:
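            # e.g. with --rng "label:randint(low=0,high=1000,size={})" (the
            # example from the --rng help below), this evaluates
            # rng.randint(low=0, high=1000, size=shape) for the input "label"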
            value = eval("rng.{}".format(rng_expr).format(shape), {"rng": rng})
        else:
            value = rng.uniform(size=shape)
        value = np.ascontiguousarray(value, dtype=data_dtypes[name])
        assert value.shape == tuple(shape)
        result[name] = value

    return result


def get_execution_strategy(args):
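    """Map --fast-run / --reproducible onto a megbrain algo strategy name:
    HEURISTIC or PROFILE, optionally suffixed with _REPRODUCIBLE."""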
    if not args.fast_run:
        logger.warning("--fast-run not enabled; execution may be slow")
        strategy = "HEURISTIC"
    else:
        logger.warning("--fast-run enabled; compile may be slow")
        strategy = "PROFILE"
    if args.reproducible:
        strategy += "_REPRODUCIBLE"
    return strategy


def get_opt_kwargs(args):
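    """Collect the --enable-* flags that were set into a kwargs dict for
    G.optimize_for_inference."""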
    args_list = [
        "enable_io16xc32",
        "enable_ioc16",
        "enable_hwcd4",
        "enable_nchw4",
        "enable_nchw88",
        "enable_nchw44",
        "enable_nchw44_dot",
        "enable_nchw32",
        "enable_chwn4",
        "enable_fuse_conv_bias_nonlinearity",
        "enable_fuse_conv_bias_with_z",
    ]
    kwargs = {}
    for k in args_list:
        if getattr(args, k):
            kwargs[k] = True
    return kwargs


def run_model(args, graph, inputs, outputs, data):
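    """Compile the loaded graph with the requested optimizations, feed it
    ``data``, time ``args.iter`` executions, and return the average speed."""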
    # must use level 0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0

    if args.weight_preprocess:
        graph.enable_weight_preprocess()

    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info(" {}: {}".format(k, v.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # inputs must be embedded last, to avoid constant folding
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)
    if args.profile:
        profiler = tools.GraphProfiler(graph)

    func = graph.compile(outputs)

    if args.get_static_mem_info:
        func.get_static_memory_alloc_info(args.get_static_mem_info)

    def run():
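        # one full execution: upload the inputs (unless they were embedded as
        # constants), run the compiled function, and fetch outputs as numpy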
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    for i in range(args.warm_up):
        logger.info("warming up {}".format(i))
        run()

    total_time = 0

    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time
        avg_speed = (i + 1) / total_time
        if "data" in data:
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = []
            for v in retval:
                rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5)))
            msg += " output_rms=[{}]".format(", ".join(rms))
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "No cuda device detected. ``focused_nvprof`` will be ignored."
            )
        else:
            try:
                import pycuda.driver as D

                D.start_profiler()
                func.execute()
                func.wait()
                D.stop_profiler()
            except ImportError:
                logger.error("`focused_nvprof` needs pycuda", exc_info=True)

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed


def main():
    parser = argparse.ArgumentParser(
        description="load a network and run inference on random data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("net")
    parser.add_argument(
        "--device", "-d", help="set default device, like 'gpux' or 'cpux'"
    )
    parser.add_argument(
        "--calc-output-rms",
        action="store_true",
        help="compute RMS of outputs; useful for comparing computing results",
    )
    parser.add_argument(
        "--output-name",
        nargs="*",
        help="specify output names; this option can be"
        " given multiple times, and we will look for the opr/var"
        " in the graph",
    )
    parser.add_argument(
        "--load-input-data",
        help="load input data from a pickle file; it should be"
        " a numpy array or a dict of numpy arrays",
    )
    parser.add_argument("--profile", help="profiler output file")
    parser.add_argument(
        "--fast-run",
        action="store_true",
        help="enable fast running by profiling conv algorithms during compiling",
    )
    parser.add_argument(
        "--reproducible", action="store_true", help="use reproducible kernels"
    )
    parser.add_argument(
        "--input-desc",
        help="specify input names and shapes manually in"
        " the format <name>:<shape>[;<name>:<shape>...], where"
        " name is a string and shape is a comma-separated"
        ' string, e.g. "data:128,1,28,28;label:128";'
        " different input tensors are separated by semicolons",
    )
    parser.add_argument(
        "--batchsize",
        type=int,
        help="change batchsize; the first dimension of each"
        " input is assumed to be batch size",
    )
    parser.add_argument(
        "--warm-up",
        type=int,
        default=0,
        help="number of warm-up runs before timing, for better estimation",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="verbose output, logging in debug mode",
    )
    parser.add_argument(
        "--iter", type=int, default=1, help="number of iters to run the model"
    )
    parser.add_argument("--log", help="give a file path to duplicate log to")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="seed for random number generator for input data",
    )
    parser.add_argument(
        "--rng",
        help="special RNG options to generate input data in"
        " format: <name>:func[;<name>:func, ...] where name is"
        " a string and func is a python expression containing"
        ' "{}" for the size param, e.g. '
        ' "label:randint(low=0,high=1000,size={})"',
    )
    parser.add_argument(
        "--focused-nvprof",
        action="store_true",
        help="only profile last iter for `nvprof --profile-from-start off`",
    )
    parser.add_argument(
        "--optimize-for-inference",
        action="store_true",
        help="optimize model for inference",
    )
    parser.add_argument(
        "--enable-io16xc32",
        action="store_true",
        help="transform the model to use float16 io and float32 compute",
    )
    parser.add_argument(
        "--enable-ioc16",
        action="store_true",
        help="transform the dtype of the model to float16 io and compute",
    )
    parser.add_argument(
        "--enable-hwcd4",
        action="store_true",
        help="transform the model format from NCHW to NHWCD4 for inference",
    )
    parser.add_argument(
        "--enable-nchw4",
        action="store_true",
        help="transform the model format from NCHW to NCHW4 for inference",
    )
    parser.add_argument(
        "--enable-nchw88",
        action="store_true",
        help="transform the model format from NCHW to NCHW88 for inference",
    )
    parser.add_argument(
        "--enable-nchw44",
        action="store_true",
        help="transform the model format from NCHW to NCHW44 for inference",
    )
    parser.add_argument(
        "--enable-nchw44-dot",
        action="store_true",
        help="transform the model format from NCHW to NCHW44_DOT "
        "for optimizing armv8.2 dot in inference",
    )
    parser.add_argument(
        "--enable-chwn4",
        action="store_true",
        help="transform the model format to CHWN4 "
        "for inference, mainly used for NVIDIA TensorCore",
    )
    parser.add_argument(
        "--enable-nchw32",
        action="store_true",
        help="transform the model format from NCHW4 to NCHW32 "
        "for inference on NVIDIA TensorCore",
    )
    parser.add_argument(
        "--enable-fuse-conv-bias-nonlinearity",
        action="store_true",
        help="fuse convolution, bias and nonlinearity oprs into a "
        "single conv_bias opr",
    )
    parser.add_argument(
        "--enable-fuse-conv-bias-with-z",
        action="store_true",
        help="fuse conv_bias with z input for inference on "
        "nvidia GPU (this optimization pass will result in a mismatch "
        "between the output precision of training and inference)",
    )
    parser.add_argument(
        "--dump-cpp-model",
        help="write a C++ model that can be loaded by "
        "megbrain/lite/load_and_run; "
        "this implies --embed-input",
    )
    parser.add_argument(
        "--embed-input",
        action="store_true",
        help="embed input data as SharedDeviceTensor in the model, "
        "to remove memory copies for inputs",
    )
    parser.add_argument(
        "--get-static-mem-info",
        type=str,
        help="record the static graph's static memory info",
    )
    parser.add_argument(
        "--custom-op-lib", type=str, help="path of the custom op library",
    )
    parser.add_argument(
        "--weight-preprocess",
        action="store_true",
        help="execute operators with weight preprocessing, which can "
        "optimize operator execution time with algorithms such as Winograd "
        "and im2col, but may consume more memory",
    )

    args = parser.parse_args()

    if args.verbose:
        enable_debug_log()
    if args.log:
        set_log_file(args.log)

    if args.device:
        set_default_device(args.device)

    if args.dump_cpp_model:
        args.embed_input = True

    if args.custom_op_lib is not None:
        custom.load(args.custom_op_lib)

    logger.info("loading model ...")
    ret = G.load_graph(args.net)
    graph, output_vars = ret.graph, ret.output_vars_list

    input_vars = tools.get_dep_vars(output_vars, "Host2DeviceCopy")

    if args.output_name is not None:
        output_vars = tools.find_vars_by_name(output_vars, args.output_name)

    data = make_data_given_desc(args, input_vars)

    run_model(args, graph, input_vars, output_vars, data)


if __name__ == "__main__":
    main()