|
- # -*- coding: utf-8 -*-
- # MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- #
- # Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- #
- # Unless required by applicable law or agreed to in writing,
- # software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- import argparse
- import os
- import pathlib
- import subprocess
-
- from megengine.utils.profile_analyze import main as profiler
-
- home = pathlib.Path(__file__).parent.absolute()
- script_path = os.path.join(str(home), "resnet50_perf.py")
- script_path = "python3 " + script_path
-
- prof_path = "prof.json"
-
- log_path = "log.txt"
-
-
- def print_log(msg: str, log: str = log_path):
- print(msg)
- with open(log, "a") as f:
- print(msg, file=f)
-
-
- def run_cmd(cmd: str, log: str = log_path) -> bool:
- stdout = subprocess.getoutput(cmd)
- token = "Wall time"
- gpu_msg = "GPU Usage"
- run_finished = False
- for line in stdout.split("\n"):
- if token in line:
- print(line)
- print_log("Run status: finished")
- run_finished = True
- if gpu_msg in line:
- print(line)
- if not run_finished:
- print_log("Run status: failed")
- with open(log, "a") as f:
- print(stdout, file=f)
-
- return run_finished
-
-
- if __name__ == "__main__":
-
- parser = argparse.ArgumentParser(description="ResNet50 train performance")
- parser.add_argument(
- "--run-debug-tool", action="store_true", help="run profiler and valgrind"
- )
- parser.add_argument(
- "--run-parallel", action="store_true", help="run data parallel performance"
- )
- parser.add_argument("--run-eager", action="store_false", help="run eager graph")
- args = parser.parse_args()
-
- f = open(log_path, "w")
- f.close()
-
- print_log("**************************************")
- print_log("Run ResNet 50 performance test with batch size = 64")
-
- print_log("**************************************")
- print_log("Run static graph with default opt level")
- cmd = script_path
- run_cmd(cmd)
-
- print_log("**************************************")
- print_log("Run static graph with conv fastrun")
- cmd = script_path + " --conv-fastrun=yes"
- run_cmd(cmd)
-
- print_log("**************************************")
- print_log("Run static graph with conv fastrun and JIT")
- cmd = script_path + " --conv-fastrun=yes --opt-level=3"
- run_cmd(cmd)
-
- print_log("**************************************")
- print_log("Run static graph with JIT, conv fastrun and without running step")
- cmd = script_path + " --conv-fastrun=yes --opt-level=3 --run-step=no"
- run_cmd(cmd)
-
- if args.run_eager:
- print_log("**************************************")
- print_log("Run static graph with default opt level and batch-size=8")
- cmd = script_path + " --batch-size=8"
- run_cmd(cmd)
- print_log("**************************************")
- print_log("Run eager graph with default opt level and batch-size=8")
- cmd = script_path
- run_cmd("MGE_DISABLE_TRACE=1 " + cmd + " --eager=yes")
-
- if args.run_debug_tool:
-
- print_log("**************************************")
- print_log("Run with dump_prof")
- cmd = script_path + " --dump-prof=" + prof_path
- if run_cmd(cmd):
- print("Printing profiling result")
- profiler([prof_path, "--aggregate-by=type", "--aggregate=sum", "-t 10"])
-
- print_log("**************************************")
- print_log("Run with valgrind massif")
- massif_out = "massif.out"
- # Use 0.01% as valgrind massif threashold
- # A smaller value reports more details but it may take longer time to analyze the log
- # Change it accordingly.
- mem_threshold = 0.01
- cmd = (
- "valgrind --tool=massif --threshold={} --massif-out-file=".format(
- mem_threshold
- )
- + massif_out
- + " "
- )
- cmd = cmd + script_path + " --warm-up=no --run-iter=20"
- run_cmd(cmd)
- ms_print_file = "massif.out.ms_print"
- cmd = (
- "ms_print --threshold={} ".format(mem_threshold)
- + massif_out
- + " > "
- + ms_print_file
- )
- os.system(cmd)
- cmd = "head -n 33 " + ms_print_file
- os.system(cmd)
- print_log("Read {} for detailed massif output".format(ms_print_file))
-
- if args.run_parallel:
- print_log("**************************************")
- tmp_out = "/dev/null"
- # Change server and port to run at your system
- server = "localhost"
- port = "2222"
- for num_gpu in (2, 4, 8):
- print_log("Run with {} GPUs".format(num_gpu))
-
- cmd = script_path + " --num-gpu={} --server={} --port={} ".format(
- num_gpu, server, port
- )
- for i in range(num_gpu - 1):
- irank = num_gpu - 1 - i
- os.system(
- cmd
- + " --device={}".format(irank)
- + " 1>{} 2>{} &".format(tmp_out, tmp_out)
- )
- if not run_cmd(cmd):
- break
-
- print_log("**************************************")
- print_log("**************************************")
- print("Finish run, summary:")
- cmd = 'grep "Run with\|Wall time\|Run status\|Error\|GPU Usage" ' + log_path
- os.system(cmd)
|