|
- #!/bin/bash
- # Example driver for load_and_run: parses flags, optionally prepares models or
- # builds the binaries, then runs the selected group of example commands.
-
- # Word splitting is restricted to newlines while the script runs, so an
- # unquoted expansion of a whole command string stays in one piece. The
- # previous value is kept in OLD_IFS.
- OLD_IFS="${IFS}"
- IFS=$'\n'
-
- # Model and input arguments shared by most of the example commands.
- TYPICAL_MODEL_DATD='model_source/resnet50_b1_float32_without_data.mge --input "data:input_data/resnet50_input.npy"'
-
- # Remote ARM device settings (filled in by -t / -w).
- DEVICE_DESC=""
- WORK_DIR_PATH="."
- RUN_ARM_DEVICE="false"
-
- # Test group to run (changed by -e / -a).
- RUN_TARGET="diff_model"
-
- # Mode and progress flags.
- ONLY_PREPARE_MODEL="false"
- MODEL_PREAPRED="false"
- ONLY_BUILD="false"
- LAR_BUILT="false"
- CLEAN_ALL="false"
-
- # Every value accepted by -e.
- RUN_TARGETS=(
-   "diff_model"
-   "diff_device"
-   "fast_run"
-   "io"
-   "layout"
-   "optimize"
-   "plugin"
-   "all"
- )
-
-
- function usage() {
-   # Print the option summary to stdout, one entry per line, then terminate
-   # the script with `exit -1`.
-   local -a help_text=(
-     "$0 args1 args2 .."
-     "available args detail:"
-     "-p : prepare example model "
-     "-b : build load_and_run for x86/armv7/arm64 cpu and CUDA"
-     "-t : set the ssh arm device "
-     "-w : set the arm device workspace dir"
-     "-c : clean all"
-     "-a : run all test"
-     "-e : set the running target for test (details use \"-e\" to see)"
-     "-h : show usage"
-   )
-   printf '%s\n' "${help_text[@]}"
-   exit -1
- }
-
-
- while getopts "pbcahe:w:t:" arg
- do
- case $arg in
- t)
- DEVICE_DESC=$OPTARG
- RUN_ARM_DEVICE="true"
- echo "config arm device DEVICE_DESC to ${DEVICE_DESC}"
- ;;
- w)
- WORK_DIR_PATH=$OPTARG
- echo "config arm device WORK_DIR_PATH to ${WORK_DIR_PATH}"
- ;;
- e)
- tmp_target=null
- for target in ${RUN_TARGETS[@]}; do
- if [ "$target" = "$OPTARG" ]; then
- echo "CONFIG BUILD RUN_TARGET to : $OPTARG"
- tmp_target=$OPTARG
- RUN_TARGET=$OPTARG
- break
- fi
- done
- if [ "$tmp_target" = "null" ]; then
- echo "ERR args for target (-e)"
- echo "available target usage :"
- for target in ${RUN_TARGETS[@]}; do
- echo " -e $target"
- done
- exit -1
- fi
- ;;
- h)
- echo "show usage"
- usage
- ;;
- a)
- echo "config RUN_TARGET=all"
- RUN_TARGET="all"
- ;;
- c)
- echo "clean all directory generated by script"
- CLEAN_ALL="true"
-
- ;;
- b)
- echo "run build"
- ONLY_BUILD="true"
- ;;
- p)
- echo "prepare model and input"
- ONLY_PREPARE_MODEL="true"
- ;;
- ?)
- echo "unkonw argument"
- usage
- ;;
- esac
- done
-
-
- function prepare_model_and_data(){
-   # Regenerate the model and input files used by the examples, recreating
-   # model_source/, input_data/ and tmpdir/ from scratch along the way.
-   local model_dir="model_source"
-   local input_dir="input_data"
-   local scratch_dir="tmpdir"
-   local demo
-
-   rm -rf "${model_dir}" && mkdir "${model_dir}"
-
-   # dump the mgb model, then run it through dump_with_testcase.py
-   python3 script/resnet50_mgb.py -o "${model_dir}/resnet50.pkl"
-   ../dump_with_testcase.py "${model_dir}/resnet50.pkl" -o "${model_dir}/resnet50_with_data.mgb" -d "#rand(0, 255)" --no-assert
-
-   # prepare the simple add and conv demo models
-   for demo in add_demo conv_demo; do
-     python3 "script/${demo}.py" --dir "${model_dir}"
-   done
-
-   # generate the trt model
-   script/gen_trt_model.sh
-
-   # prepare the mge models
-   python3 script/resnet50_mge.py --dir "${model_dir}"
-   python3 script/resnet50_mge.py --dir "${model_dir}" -d uint8
-   python3 script/resnet50_mge.py --dir "${model_dir}" --inputs "#rand(0,255)"
-
-   # make input_data
-   rm -rf "${input_dir}" && mkdir "${input_dir}"
-   python3 script/mge_input_data.py
-
-   # scratch directory used by the test functions for their output files
-   rm -rf "${scratch_dir}" && mkdir "${scratch_dir}"
- }
-
- function build_lar(){
-   # Build load_and_run with the host and Android cross-build scripts, then
-   # expose the resulting binaries in the current directory as lar_cpu,
-   # lar_cuda (symlinks) and lar_arm64, lar_armv7 (copies).
-   local build_scripts="../../../scripts/cmake-build"
-   local build_root="../../../build_dir"
-   local lar_rel="build/lite/load_and_run/load_and_run"
-   local abi
-
-   # build cpu and cuda version
-   "${build_scripts}/host_build.sh" -r -t -e load_and_run
-   # WARNING: configure the cuda environment before compiling
-   "${build_scripts}/host_build.sh" -c -t -e load_and_run
-
-   # build arm version
-   for abi in arm64-v8a armeabi-v7a; do
-     "${build_scripts}/cross_build_android_arm_inference.sh" -r -a "${abi}" -e load_and_run
-   done
-
-   # Link the host binaries for the tests. -f replaces a link left behind by
-   # an earlier build, so re-running the build does not fail with
-   # "File exists".
-   ln -sf "${build_root}/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/${lar_rel}" lar_cpu
-   ln -sf "${build_root}/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/${lar_rel}" lar_cuda
-
-   # The ARM binaries are copied rather than linked; they are later sent to
-   # the device by rsync.
-   cp "${build_root}/android/arm64-v8a/Release/${lar_rel}" ./lar_arm64
-   cp "${build_root}/android/armeabi-v7a/Release/${lar_rel}" ./lar_armv7
- }
-
- function set_arm_device_and_upload(){
-   # Record the ARM device settings in the globals and rsync the ARM binaries
-   # plus the typical model and its input to the device.
-   # Arguments:
-   #   $1 - ssh destination, stored in DEVICE_DESC
-   #   $2 - work directory on the device, stored in WORK_DIR_PATH
-   #   $3 - value stored in RUN_ARM_DEVICE
-   DEVICE_DESC="${1}"
-   WORK_DIR_PATH="${2}"
-   RUN_ARM_DEVICE="${3}"
-
-   # The command is kept as an argv array and executed directly, so the
-   # destination is never re-parsed by a second shell.
-   local -a upload_cmd=(
-     rsync -aP -zz
-     ./lar_arm64
-     ./lar_armv7
-     model_source/resnet50_b1_float32_without_data.mge
-     input_data/resnet50_input.npy
-     "${DEVICE_DESC}:${WORK_DIR_PATH}/"
-   )
-
-   # "${upload_cmd[*]}" joins on the first IFS character; make that a space
-   # for this function only so the echoed command is on one line.
-   local IFS=' '
-   echo "${upload_cmd[*]}"
-   "${upload_cmd[@]}"
- }
-
- function test_different_model(){
-   # Run the CPU binary against several model files (.mgb, .mge with data,
-   # .mge with a separate input, and the same with --lite).
-   CmdArray=(
-     "./lar_cpu model_source/resnet50_with_data.mgb"
-     "./lar_cpu model_source/resnet50_b1_float32_with_data.mge"
-     "./lar_cpu ${TYPICAL_MODEL_DATD}"
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --lite"
-   )
-
-   # Each entry is a complete command line: print it, then let a child bash
-   # parse and run it.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
- }
-
- function test_different_device(){
-   # Run the typical model with different device / threading options.
-   # The timing figures in the comments are the original author's
-   # measurements.
-   local -a CmdArray
-   local cmd
-
-   CmdArray=(
-     # On dispatch, compute tasks are pushed to a work queue that manages
-     # their execution; mean 131.278 ms, std dev 15.197 ms; m_asyc_exec runs
-     # asynchronously.
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --cpu"
-     # On dispatch, compute tasks are executed directly; mean 131.875 ms,
-     # std dev 7.758 ms; m_asyc_exec runs synchronously.
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --cpu-default"
-     # Multi-threaded run; average run time (ms) for 1~8 threads:
-     # 129.611, 84.266, 76.963, 55.212, 69.283, 77.338, 58.386, 64.585
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --multithread 4"
-     # Main thread pinned to a core, other tasks run on the thread-pool
-     # threads: 132.614, 83.095, 69.792, 54.452, 48.890, 48.206, 46.386, 53.908
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --multithread-default 4"
-     # CPU multi-threading with core binding (binding has little effect on x86)
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --multithread 2 --multi-thread-core-ids 1,5"
-
-     # xpu set to run on the CPU: 132.740 ms, comp_node:cpu
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --cpu"
-     # xpu set to run on CUDA: 6.495 ms, comp_node:gpu
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --cuda"
-   )
-
-   # Each entry is a complete command line: print it, then let a child bash
-   # parse and run it.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
- }
-
- function test_fast_run(){
-   # Run the CUDA binary with the fast-run / full-run related options.
-   CmdArray=(
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --fast-run"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --full-run"
-     # fast run: search and store the result in tmpdir/algo_cache_file
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --fast-run --fast-run-algo-policy tmpdir/algo_cache_file"
-     # fast run: execute with the stored result
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --fast-run-algo-policy tmpdir/algo_cache_file"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --fast-run --reproducible"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --fast-run --fast-run-shared-batch-size 1"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --fast-run --binary-equal-between-batch"
-   )
-
-   # Each entry is a complete command line: print it, then let a child bash
-   # parse and run it.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
- }
-
- function test_io(){
-   # Exercise the input and dump options of load_and_run, then compare the
-   # binary io dumps written by the CPU run and the CUDA run.
-   local -a dump_dirs=(
-     tmpdir/bin_io_info
-     tmpdir/bin_out_info
-     tmpdir/bin_io_info_cuda
-   )
-   # Command fragments shared by several entries below.
-   local add_demo="model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\""
-   local one_shot="--iter 1 --warmup-iter 0"
-   local -a CmdArray
-   local cmd
-
-   # Remove and recreate the same set of dump directories, so no dump from a
-   # previous run survives and mkdir never hits an existing directory.
-   rm -rf "${dump_dirs[@]}" tmpdir/io_info.txt
-   mkdir -p "${dump_dirs[@]}"
-
-   CmdArray=(
-     "./lar_cpu ${TYPICAL_MODEL_DATD}"
-     "./lar_cpu model_source/add_demo_f32_without_data.mge --input \"input_data/add_demo_input.json\""
-     #! the model must support input with nhwc shape
-     "./lar_cpu model_source/resnet50_b1_int8_without_data.mge --input \"data:input_data/cat.ppm\""
-     "./lar_cpu ${add_demo}"
-     "./lar_cpu ${add_demo} --io-dump tmpdir/io_info.txt ${one_shot}"
-     "./lar_cpu ${add_demo} --io-dump-stdout ${one_shot}"
-     "./lar_cpu ${add_demo} --io-dump-stderr ${one_shot}"
-     # different data in the given directory; the file name is the var id,
-     # which is the same as in the txt-dump information
-     "./lar_cpu ${add_demo} --bin-io-dump tmpdir/bin_io_info ${one_shot}"
-     "./lar_cuda ${add_demo} --bin-io-dump tmpdir/bin_io_info_cuda ${one_shot}"
-     "./lar_cpu ${add_demo} --bin-out-dump tmpdir/bin_out_info ${one_shot}"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --copy-to-host"
-   )
-
-   # The commands contain "[2,3,4]", so every expansion is quoted to keep the
-   # shell from treating it as a glob pattern.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
-
-   # compare the binary io information
-   python3 ../../../imperative/python/megengine/tools/compare_binary_iodump.py tmpdir/bin_io_info tmpdir/bin_io_info_cuda
- }
-
- function test_layout_related(){
-   # Run the typical model with the layout options on the host binaries and,
-   # when an ARM device is configured, with the nchw44 options over ssh.
-   CmdArray=(
-     # very little speed up
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable-nchw4"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable-chwn4"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable-nchw32"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable-nchw64"
-
-     # speed up
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --enable-nchw88"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --cuda --layout-transform cuda"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --cuda --layout-transform cuda --layout-transform-dump model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cuda.mge"
-     "./lar_cuda model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cuda.mge"
-
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --cpu --layout-transform cpu"
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --cpu --layout-transform cpu --layout-transform-dump model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cpu.mge"
-     "./lar_cpu model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cpu.mge"
-   )
-
-   # Host part: print each command, then let a child bash parse and run it.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
-
-   # Without an ARM device there is nothing more to run.
-   if [[ "${RUN_ARM_DEVICE}" != "true" ]]; then
-     echo "SET arm device ON : $RUN_ARM_DEVICE"
-     return
-   fi
-
-   # Device part: file names are relative to WORK_DIR_PATH on the device.
-   CmdArray=(
-     # speed up
-     "./lar_arm64 resnet50_b1_float32_without_data.mge --input \"data:resnet50_input.npy\" --cpu --enable-nchw44"
-     # speed up
-     "./lar_arm64 resnet50_b1_float32_without_data.mge --input \"data:resnet50_input.npy\" --cpu --enable-nchw44-dot"
-   )
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     ssh -t "${DEVICE_DESC}" "unset LD_PRELOAD && cd $WORK_DIR_PATH && LD_LIBRARY_PATH=./ $cmd"
-   done
- }
-
- function test_optimize(){
-   # Run the models with the optimization-related options (fusion, TensorRT,
-   # comp-seq recording, memory options, fake-first, jit).
-   CmdArray=(
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --enable-fuse-preprocess"
-     # warm up speed up
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable-fuse-conv-bias-nonlinearity"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable-fuse-conv-bias-with-z"
-
-     "./lar_cuda model_source/trt_conv_demo_with_data.mgb --tensorrt"
-     "./lar_cuda model_source/trt_conv_demo_with_data.mgb --tensorrt --tensorrt-cache tmpdir/TRT_cache"
-     "./lar_cuda model_source/trt_conv_demo_with_data.mgb --tensorrt-cache tmpdir/TRT_cache"
-
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --no-sanity-check --record-comp-seq2"
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --disable_mem_opt"
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --workspace_limit 10000"
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --fake-first"
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --enable_jit "
-   )
-
-   # Each entry is a complete command line: print it, then let a child bash
-   # parse and run it.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
- }
-
- function test_plugin(){
-   # Run the models with the plugin-style options (dispatch / var-value /
-   # range checks, profiling, model info, verbose, static memory info).
-   local add_demo="model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\""
-   local -a CmdArray
-   local cmd
-
-   rm -rf tmpdir/staticMemInfoDir tmpdir/staticMemInfoDirLogs
-   mkdir tmpdir/staticMemInfoDir
-
-   CmdArray=(
-     "./lar_cpu ${add_demo} --check-dispatch"
-     "./lar_cpu ${add_demo} --check-var-value 5:0"
-     "./lar_cpu ${add_demo} --range 2"
-     "./lar_cpu ${add_demo} --profile tmpdir/opr_profile.json"
-     "./lar_cuda ${add_demo} --profile-host tmpdir/opr_profile_host.json"
-
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --model-info"
-     "./lar_cpu ${TYPICAL_MODEL_DATD} --verbose"
-     "./lar_cpu model_source/resnet50_with_data.mgb --disable-assert-throw"
-
-     # wait gdb attach to given PID
-     # "./lar_cpu ${TYPICAL_MODEL_DATD} --wait-gdb"
-
-     "./lar_cuda ${TYPICAL_MODEL_DATD} --get-static-mem-info tmpdir/staticMemInfoDir"
-   )
-
-   # The commands contain "[2,3,4]", so every expansion is quoted to keep the
-   # shell from treating it as a glob pattern.
-   for cmd in "${CmdArray[@]}"; do
-     echo "${cmd}"
-     bash -c "${cmd}"
-   done
-
-   # view the graph with given url (usually: http://localhost:6006/)
-   # mkdir tmpdir/staticMemInfoDirLogs && python3 ../../../imperative/python/megengine/tools/graph_info_analyze.py -i tmpdir/staticMemInfoDir -o tmpdir/staticMemInfoDirLogs
-   # pip3 install tensorboard && tensorboard --logdir tmpdir/staticMemInfoDirLogs
- }
-
- function clean(){
-   # Delete the directories, links and binaries this script generates in the
-   # current directory.
-   local -a generated=(
-     tmpdir
-     model_source
-     input_data
-     lar_cpu
-     lar_cuda
-     lar_arm64
-     lar_armv7
-   )
-   rm -rf "${generated[@]}"
- }
-
- function main(){
-   # Entry point. Performs a one-shot action (-c, -p or -b) and exits, or
-   # otherwise makes sure the models and binaries exist, uploads them to the
-   # ARM device when one is configured, and runs the test group(s) selected
-   # by RUN_TARGET.
-   local found_model found_lar entry
-   # "<RUN_TARGET value>:<function>" pairs, in execution order.
-   local -a suites=(
-     "diff_model:test_different_model"
-     "diff_device:test_different_device"
-     "fast_run:test_fast_run"
-     "io:test_io"
-     "layout:test_layout_related"
-     "optimize:test_optimize"
-     "plugin:test_plugin"
-   )
-
-   if [[ "${CLEAN_ALL}" == "true" ]]; then
-     clean
-     exit 0
-   fi
-
-   if [[ "${ONLY_PREPARE_MODEL}" == "true" ]]; then
-     prepare_model_and_data
-     MODEL_PREAPRED="true"
-     exit 0
-   fi
-
-   if [[ "${ONLY_BUILD}" == "true" ]]; then
-     build_lar
-     LAR_BUILT="true"
-     exit 0
-   fi
-
-   # Prepare the models when none are found. The emptiness check uses
-   # [[ -z ]] because an unquoted empty value inside [ ... == "" ] is a test
-   # error, which would skip the preparation exactly when it is needed.
-   if [[ "${MODEL_PREAPRED}" != "true" ]]; then
-     found_model=$(find . -name add_demo_input.json)
-     if [[ -z "${found_model}" ]]; then
-       prepare_model_and_data
-       MODEL_PREAPRED="true"
-     fi
-   fi
-
-   # Same check for the load_and_run binaries.
-   if [[ "${LAR_BUILT}" != "true" ]]; then
-     found_lar=$(find . -name lar_armv7)
-     if [[ -z "${found_lar}" ]]; then
-       build_lar
-       LAR_BUILT="true"
-     fi
-   fi
-
-   # Upload only after the files above are known to exist; rsync needs the
-   # ARM binaries, the model and the input file.
-   if [[ "${RUN_ARM_DEVICE}" == "true" ]]; then
-     set_arm_device_and_upload "${DEVICE_DESC}" "${WORK_DIR_PATH}" "true"
-   fi
-
-   # Run the selected group, or every group for "all".
-   for entry in "${suites[@]}"; do
-     if [[ "${RUN_TARGET}" == "all" || "${RUN_TARGET}" == "${entry%%:*}" ]]; then
-       "${entry#*:}"
-     fi
-   done
- }
-
- # Run the selected actions, then put back the IFS value saved in OLD_IFS.
- main
- IFS="${OLD_IFS}"
|