You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

run_resnet50_perf.py 5.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. # -*- coding: utf-8 -*-
  2. # MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  3. #
  4. # Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  5. #
  6. # Unless required by applicable law or agreed to in writing,
  7. # software distributed under the License is distributed on an
  8. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. import argparse
  10. import os
  11. import pathlib
  12. import subprocess
  13. from megengine.utils.profile_analyze import main as profiler
  14. home = pathlib.Path(__file__).parent.absolute()
  15. script_path = os.path.join(str(home), "resnet50_perf.py")
  16. script_path = "python3 " + script_path
  17. prof_path = "prof.json"
  18. log_path = "log.txt"
  19. def print_log(msg: str, log: str = log_path):
  20. print(msg)
  21. with open(log, "a") as f:
  22. print(msg, file=f)
  23. def run_cmd(cmd: str, log: str = log_path) -> bool:
  24. stdout = subprocess.getoutput(cmd)
  25. token = "Wall time"
  26. gpu_msg = "GPU Usage"
  27. run_finished = False
  28. for line in stdout.split("\n"):
  29. if token in line:
  30. print(line)
  31. print_log("Run status: finished")
  32. run_finished = True
  33. if gpu_msg in line:
  34. print(line)
  35. if not run_finished:
  36. print_log("Run status: failed")
  37. with open(log, "a") as f:
  38. print(stdout, file=f)
  39. return run_finished
  40. if __name__ == "__main__":
  41. parser = argparse.ArgumentParser(description="ResNet50 train performance")
  42. parser.add_argument(
  43. "--run-debug-tool", action="store_true", help="run profiler and valgrind"
  44. )
  45. parser.add_argument(
  46. "--run-parallel", action="store_true", help="run data parallel performance"
  47. )
  48. parser.add_argument("--run-eager", action="store_false", help="run eager graph")
  49. args = parser.parse_args()
  50. f = open(log_path, "w")
  51. f.close()
  52. print_log("**************************************")
  53. print_log("Run ResNet 50 performance test with batch size = 64")
  54. print_log("**************************************")
  55. print_log("Run static graph with default opt level")
  56. cmd = script_path
  57. run_cmd(cmd)
  58. print_log("**************************************")
  59. print_log("Run static graph with conv fastrun")
  60. cmd = script_path + " --conv-fastrun=yes"
  61. run_cmd(cmd)
  62. print_log("**************************************")
  63. print_log("Run static graph with conv fastrun and JIT")
  64. cmd = script_path + " --conv-fastrun=yes --opt-level=3"
  65. run_cmd(cmd)
  66. print_log("**************************************")
  67. print_log("Run static graph with JIT, conv fastrun and without running step")
  68. cmd = script_path + " --conv-fastrun=yes --opt-level=3 --run-step=no"
  69. run_cmd(cmd)
  70. if args.run_eager:
  71. print_log("**************************************")
  72. print_log("Run static graph with default opt level and batch-size=8")
  73. cmd = script_path + " --batch-size=8"
  74. run_cmd(cmd)
  75. print_log("**************************************")
  76. print_log("Run eager graph with default opt level and batch-size=8")
  77. cmd = script_path
  78. run_cmd("MGE_DISABLE_TRACE=1 " + cmd + " --eager=yes")
  79. if args.run_debug_tool:
  80. print_log("**************************************")
  81. print_log("Run with dump_prof")
  82. cmd = script_path + " --dump-prof=" + prof_path
  83. if run_cmd(cmd):
  84. print("Printing profiling result")
  85. profiler([prof_path, "--aggregate-by=type", "--aggregate=sum", "-t 10"])
  86. print_log("**************************************")
  87. print_log("Run with valgrind massif")
  88. massif_out = "massif.out"
  89. # Use 0.01% as valgrind massif threashold
  90. # A smaller value reports more details but it may take longer time to analyze the log
  91. # Change it accordingly.
  92. mem_threshold = 0.01
  93. cmd = (
  94. "valgrind --tool=massif --threshold={} --massif-out-file=".format(
  95. mem_threshold
  96. )
  97. + massif_out
  98. + " "
  99. )
  100. cmd = cmd + script_path + " --warm-up=no --run-iter=20"
  101. run_cmd(cmd)
  102. ms_print_file = "massif.out.ms_print"
  103. cmd = (
  104. "ms_print --threshold={} ".format(mem_threshold)
  105. + massif_out
  106. + " > "
  107. + ms_print_file
  108. )
  109. os.system(cmd)
  110. cmd = "head -n 33 " + ms_print_file
  111. os.system(cmd)
  112. print_log("Read {} for detailed massif output".format(ms_print_file))
  113. if args.run_parallel:
  114. print_log("**************************************")
  115. tmp_out = "/dev/null"
  116. # Change server and port to run at your system
  117. server = "localhost"
  118. port = "2222"
  119. for num_gpu in (2, 4, 8):
  120. print_log("Run with {} GPUs".format(num_gpu))
  121. cmd = script_path + " --num-gpu={} --server={} --port={} ".format(
  122. num_gpu, server, port
  123. )
  124. for i in range(num_gpu - 1):
  125. irank = num_gpu - 1 - i
  126. os.system(
  127. cmd
  128. + " --device={}".format(irank)
  129. + " 1>{} 2>{} &".format(tmp_out, tmp_out)
  130. )
  131. if not run_cmd(cmd):
  132. break
  133. print_log("**************************************")
  134. print_log("**************************************")
  135. print("Finish run, summary:")
  136. cmd = 'grep "Run with\|Wall time\|Run status\|Error\|GPU Usage" ' + log_path
  137. os.system(cmd)

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台

Contributors (1)