|
|
@@ -1,404 +0,0 @@ |
|
|
|
#!/usr/bin/env mdl |
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") |
|
|
|
# |
|
|
|
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. |
|
|
|
# |
|
|
|
# Unless required by applicable law or agreed to in writing, |
|
|
|
# software distributed under the License is distributed on an |
|
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
|
|
from megskull.graph import NodeFilter, FpropEnv |
|
|
|
from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization |
|
|
|
from megskull.utils.logconf import get_logger |
|
|
|
from meghair.utils import io |
|
|
|
import megbrain as mgb |
|
|
|
|
|
|
|
import argparse |
|
|
|
import struct |
|
|
|
import re |
|
|
|
import os |
|
|
|
|
|
|
|
import numpy as np |
|
|
|
import cv2 |
|
|
|
|
|
|
|
# Module-level logger shared by all helpers in this script.
logger = get_logger(__name__)
|
|
|
|
|
|
|
def auto_reformat_image(args, path, data, dst_shape):
    """Reformat a loaded image so it matches the network input shape.

    :param args: parsed CLI args; only ``args.resize_input`` is read here
    :param path: source path of the image, used in log messages only
    :param data: image data as numpy array (HWC layout, as loaded by OpenCV)
    :param dst_shape: target shape, either 3-dim (no batch axis) or 4-dim
    :return: numpy array reformatted to match ``dst_shape``'s layout
    """
    squeeze_batch = False   # required input format does not contain batch
    channels_last = False   # required input format is NHWC

    if len(dst_shape) == 3:
        dst_shape = (1, ) + dst_shape
        squeeze_batch = True

    assert len(dst_shape) == 4, 'bad dst_shape: {}'.format(dst_shape)

    # infer layout from where a 1- or 3-sized channel axis sits
    chl = dst_shape[1]
    if chl in [1, 3]:
        n, c, h, w = dst_shape
        dst_shape = (n, h, w, c)
    else:
        chl = dst_shape[3]
        assert chl in [1, 3], (
            'can not infer input format from shape: {}'.format(dst_shape))
        channels_last = True

    # dst_shape has now been normalized to NHWC format

    if args.resize_input:
        h, w = dst_shape[1:3]
        data = cv2.resize(data, (w, h))
        logger.info('input {} resized to {}'.format(path, data.shape))

    if chl == 1:
        # target wants grayscale: collapse BGR and keep a channel axis
        data = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY)
        data = data[:, :, np.newaxis]

    assert data.ndim == 3
    data = data[np.newaxis]
    # data normalized to NHWC format

    if not channels_last:
        data = np.transpose(data, (0, 3, 1, 2))

    if squeeze_batch:
        data = np.squeeze(data, 0)

    return data
|
|
|
|
|
|
|
def read_input_data(args, dst_shape, dtype, path, repeat):
    """Produce the data for one input var.

    ``path`` may be an image file loadable by OpenCV, a pickled
    numpy.ndarray, or a ``#rand(min,max[,shape...])`` spec that generates
    random data.

    :param args: parsed CLI args
    :param dst_shape: shape of the target input var
    :param dtype: dtype of the target input var
    :param path: data source spec (file path or ``#rand`` expression)
    :param repeat: number of times to repeat the data along the batch axis
    :return: numpy array holding the input data
    """

    def check_shape_equal(dst_shape, data_shape):
        # ranks must agree; a mismatch beyond the batch axis is only a
        # warning so repeated/batched inputs still pass through
        assert len(data_shape) == len(dst_shape) , (
            'input/data shapes mismatch: {} vs {}'.format(
                dst_shape, data_shape))

        if data_shape[1:] != dst_shape[1:]:
            logger.warning('dst_shape is {}; data_shape is {}'.format(
                dst_shape, data_shape))

    if path.startswith('#'):
        # random-data spec: resizing/transforming would be meaningless
        assert not args.resize_input
        assert not args.input_transform
        spec = path
        match = re.match(
            r'^#rand\(([-0-9.]*)\s*,\s*([-0-9.]*)\s*(,[^\)]+)?\)$', spec)
        assert match, 'bad spec {}'.format(spec)

        low = float(match.group(1))
        high = float(match.group(2))
        shape_str = match.group(3)
        if shape_str:
            try:
                dims = shape_str[1:].split(',')
                # a trailing `...` fills the rest from dst_shape
                if dims[-1].strip() == '...':
                    dims = dims[:-1]
                    dims.extend(list(dst_shape[len(dims):]))
                data_shape = tuple(map(int, dims))
            except ValueError as e:
                raise ValueError('bad spec {}: {}'.format(spec, e.args))
        else:
            data_shape = dst_shape

        check_shape_equal(dst_shape, data_shape)
        return np.random.uniform(low, high, data_shape).astype(dtype)

    # not a spec: try to decode the file as an image first
    data = cv2.imread(path, cv2.IMREAD_COLOR)
    if data is None:
        # fall back to a pickled ndarray; --resize-input only applies to images
        assert not args.resize_input
        data = io.load(path)
        assert isinstance(data, np.ndarray)
    else:
        # load image succeeds, so we expect input format is image format
        data = auto_reformat_image(args, path, data, dst_shape)

    data = np.repeat(data, repeat, axis=0)
    if repeat > 1:
        logger.info('repeat input for {} times, data shape is {}'.format(
            repeat, data.shape))

    check_shape_equal(dst_shape, data.shape)

    if args.input_transform:
        # NOTE: eval of a user-supplied CLI expression — trusted operator
        # input only; do not feed untrusted strings here
        data = eval(args.input_transform, {'data': data, 'np': np})

    return data
|
|
|
|
|
|
|
|
|
|
|
def gen_one_testcase(args, inputs, spec):
    """Generate the input data of one testcase from a ``-d`` spec string.

    :param args: parsed CLI args
    :param inputs: dict mapping input var name to its DataProvider node
    :param spec: ``var0:file0;var1:file1...`` style spec; a single
        ``#rand(...)`` entry is broadcast to all inputs, and the var name
        may be omitted when there is only one input
    :return: dict mapping input var name to its numpy data
    """
    paths = spec.split(';')
    if len(paths) != len(inputs):
        if len(paths) == 1 and paths[0].startswith('#'):
            # a single random spec applies to every input var
            paths = ['{}:{}'.format(name, paths[0]) for name in inputs.keys()]
    assert len(paths) == len(inputs), (
        'required inputs: {}; data paths: {}'.format(inputs.keys(), paths))
    if len(paths) == 1 and ':' not in paths[0]:
        # single unnamed entry: attach the only input var's name
        paths[0] = next(iter(inputs.keys())) + ':' + paths[0]

    ret = {}
    for path in paths:
        # split only on the first colon so data paths containing ':'
        # (e.g. URLs or Windows-style paths) do not break unpacking
        var, path = path.split(':', 1)
        if args.repeat:
            repeat = args.repeat
        else:
            repeat = 1
        ret[var] = read_input_data(args, inputs[var].imm_shape,
                                   inputs[var].dtype, path, repeat)
    return ret
|
|
|
|
|
|
|
|
|
|
|
def make_feeds(args):
    """Build the feeds dict from a network file.

    Loads the network at ``args.input``, generates one testcase per ``-d``
    spec, and (unless ``--no-assert``) runs the network on each testcase to
    record groundtruth outputs, wrapping every output in an AssertEqual opr.

    :param args: parsed CLI args
    :return: dict with keys ``outputs`` (output nodes, possibly wrapped in
        AssertEqual) and ``testcases`` (list of dicts mapping var name to
        numpy data, plus ``expect:<name>`` entries when asserting)
    """
    outputs = io.load_network(args.input).outputs
    if not args.no_assert:
        # compile a fprop function to compute groundtruth outputs
        env = FpropEnv(verbose_fprop=False)
        # set flag so ExternCOprPlaceholder produce expected output
        env.flags.user['extern_c_opr_eval'] = True
        func = env.comp_graph.compile(None, [mgb.copy_output(env.get_mgbvar(i))
                                             for i in outputs])

    # key under which the expected value of a var is stored in a testcase
    def expect_name(var): return 'expect:{}'.format(var.name)

    nf = NodeFilter.make_all_deps(*outputs)
    inputs = {i.name: i for i in nf.data_provider()}
    if args.init_bn:
        # give untrained BN oprs usable statistics to avoid NaN/Inf outputs
        for i in nf:
            if isinstance(i, BatchNormalization):
                if i._iter.get_value() == 0:
                    i._iter.set_value(1)
                    i._variance.set_value(np.ones(i._variance.shape))

    testcases = []

    # keep log output of large arrays compact
    np.set_printoptions(precision=2, threshold=4, suppress=True)

    # expand @file entries: each non-empty line of the file is one data spec
    data_list = []
    for item in args.data:
        if item.startswith('@'):
            with open(item[1:], 'r') as f:
                data_list.extend([ line.rstrip() for line in f if line.rstrip() != ''])
        else:
            data_list.append(item)

    for inp_spec in data_list:
        cur_testcase = gen_one_testcase(args, inputs, inp_spec)
        assert len(cur_testcase) == len(inputs), (
            'required inputs: {}; given data: {}'.format(
                inputs.keys(), cur_testcase.keys()))

        if not args.no_assert:
            # run the network to obtain groundtruth for this testcase
            outputs_get = func(**cur_testcase)
            for var, val in zip(outputs, outputs_get):
                cur_testcase[expect_name(var)] = val
                logger.info(
                    'generate test groundtruth: var={} shape={} range=({}, {})'
                    ' mean={} var={}'.format(
                        var, val.shape, val.min(), val.max(),
                        np.mean(val), np.var(val)))
        testcases.append(cur_testcase)
        logger.info('add testcase: \n {}'.format(
            '\n '.join('{}: shape={} dtype={} range=({:.2f},{:.2f}) '
                       'mean={:.2f} sd={:.2f}'.format(
                           k, v.shape, v.dtype, v.min(), v.max(), np.mean(v),
                           np.std(v))
                       for k, v in sorted(cur_testcase.items()))))

    if not args.no_assert:
        def expect_shp(var):
            # prefer the statically determined shape; fall back to the
            # shape of the first testcase's recorded groundtruth
            ret = var.partial_shape.determined_shape
            if ret:
                return ret
            return testcases[0][expect_name(var)].shape

        verbose = not args.silent
        # wrap every output in AssertEqual against a DataProvider that is
        # fed the recorded expected value at runtime
        outputs = [AssertEqual(DataProvider(expect_name(i), expect_shp(i),
                                            dtype=i.dtype,
                                            comp_node=i.comp_node),
                               i, verbose=verbose, maxerr=args.maxerr)
                   for i in outputs]
    return {'outputs': outputs, 'testcases': testcases}
|
|
|
|
|
|
|
def optimize_for_inference(args, outputs):
    """Apply the inference-time graph optimizations selected on the CLI.

    :param args: parsed CLI args carrying the ``enable_*`` switches and
        ``optimize_for_inference``
    :param outputs: list of output mgb vars of the graph
    :return: optimized output vars, or ``outputs`` unchanged when
        ``--optimize-for-inference`` is not given
    """
    # maps each CLI switch to its mgb.optimize_for_inference kwarg
    opt_flags = {
        'enable_io16xc32': 'f16_io_f32_comp',
        'enable_ioc16': 'f16_io_comp',
        'enable_hwcd4': 'use_nhwcd4',
        'enable_nchw4': 'use_nchw4',
        'enable_nchw88': 'use_nchw88',
        'enable_nchw44': 'use_nchw44',
        'enable_nchw44_dot': 'use_nchw44_dot',
        'enable_nchw32': 'use_nchw32',
        'enable_chwn4': 'use_chwn4',
        'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity',
        'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z',
        'enable_nchw64': 'use_nchw64',
        'enable_fuse_preprocess': 'fuse_preprocess',
    }

    kwargs = {}
    for arg_name, opt_name in opt_flags.items():
        if getattr(args, arg_name):
            # individual switches are meaningless without the master flag
            assert args.optimize_for_inference, (
                'optimize_for_inference should be set when {} is given'.format(
                    arg_name))
            kwargs[opt_name] = True

    if not args.optimize_for_inference:
        return outputs

    return mgb.optimize_for_inference(outputs, **kwargs)
|
|
|
|
|
|
|
def main():
    """CLI entry point.

    Parses arguments, builds (or loads) the feeds dict, optionally applies
    inference optimizations, and serializes the computing graph together
    with all testcases into ``--output``.
    """
    parser = argparse.ArgumentParser(
        description='Pack computing graph, input values and expected output '
        'values into one file for checking correctness. README.md gives more '
        'details on the usage',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input file; see README for details')
    parser.add_argument('-o', '--output', help='output file', required=True)
    parser.add_argument('--init-bn', action='store_true',
                        help='initialize untrained batch-normalization, to '
                        'avoid NaN or Inf results')
    parser.add_argument(
        '-d', '--data', default=[], action='append',
        help='Given input test data when input file is a network, '
        'and current network output would be used as groundtruth. '
        'The format is var0:file0;var1:file1... to specify data files for '
        'input vars. It can also be #rand(min,max,shape...) for generating '
        'random input data, for example, #rand(0,255), '
        '#rand(0,255,1,3,224,224) or #rand(0, 255, 1, ...) where `...` means '
        'the remaining part of the original shape. '
        'If the shape is not specified, the shape of '
        'corresponding DataProvider in the network will be used. '
        'If there is only one input var, its name can be omitted. '
        'Each data file can either be an image which can be loaded by opencv, '
        'or a pickled numpy.ndarray. '
        'This option can be given multiple times to add multiple testcases. '
        ' *NOTE* '
        'If you start the data with the letter @, the rest should be a '
        'filename, and each line in the file should be a single datum in '
        'the format described above. '
    )
    parser.add_argument(
        '--repeat', type=int, default=1,
        help='Specify how many times the input image is repeated. '
        'Useful when running benchmark for batch size other than one. '
        'Have no effect on randomly generated input data.')
    parser.add_argument('--silent', action='store_true',
                        help='set verbose to False in AssertEqual opr')
    # typo fix: help text used to read 'enbale'
    parser.add_argument('--optimize-for-inference', action='store_true',
                        help='enable optimization for inference')
    parser.add_argument('--no-assert', action='store_true',
                        help='do not insert AssertEqual opr to check result; '
                        'this option is useful for benchmarking')
    parser.add_argument('--maxerr', type=float, default=AssertEqual.maxerr,
                        help='max error for AssertEqual check during runtime')
    parser.add_argument('--resize-input', action='store_true',
                        help='resize input image to fit input var shape')
    parser.add_argument('--input-transform',
                        help='a python expression to transform the input data. '
                        'Example: data / np.std(data)')
    parser.add_argument('--discard-var-name', action='store_true',
                        help='discard variable and param names in the '
                        'generated output')
    parser.add_argument('--output-strip-info', action='store_true',
                        help='output code strip information')
    parser.add_argument('--enable-io16xc32', action='store_true',
                        help='transform the mode to float16 io float32 compute')
    parser.add_argument('--enable-ioc16', action='store_true',
                        help='transform the dtype of the model to float16 io '
                        'and compute')
    parser.add_argument('--enable-fuse-conv-bias-nonlinearity',
                        action='store_true',
                        help='fuse convolution bias and nonlinearity opr to a '
                        'conv_bias opr and compute')
    parser.add_argument('--enable-hwcd4', action='store_true',
                        help='transform the model format from NCHW to NHWCD4 '
                        'for inference; you may need to disable CUDA and set '
                        'MGB_USE_MEGDNN_DBG=2')
    parser.add_argument('--enable-nchw4', action='store_true',
                        help='transform the model format from NCHW to NCHW4 '
                        'for inference')
    parser.add_argument('--enable-nchw88', action='store_true',
                        help='transform the model format from NCHW to NCHW88 '
                        'for inference')
    parser.add_argument('--enable-nchw44', action='store_true',
                        help='transform the model format from NCHW to NCHW44 '
                        'for inference')
    parser.add_argument('--enable-nchw44-dot', action='store_true',
                        help='transform the model format from NCHW to NCHW44_DOT '
                        'for optimizing armv8.2 dot in inference')
    parser.add_argument('--enable-chwn4', action='store_true',
                        help='transform the model format to CHWN4 '
                        'for inference, mainly used for nvidia tensorcore')
    # typo fix: help text used to read 'TensoCore'
    parser.add_argument('--enable-nchw32', action='store_true',
                        help='transform the model format from NCHW4 to NCHW32 '
                        'for inference on nvidia TensorCore')
    parser.add_argument('--enable-nchw64', action='store_true',
                        help='transform the model format from NCHW to NCHW64 '
                        'for inference on Nvidia GPU')
    parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true',
                        help='fuse conv_bias with z input for inference on '
                        'nvidia GPU (this optimization pass will result in mismatch '
                        'of the precision of output of training and inference)')
    # fix invalid backslash escapes (\p, \d) in the original help string
    parser.add_argument('--enable-fuse-preprocess', action='store_true',
                        help='fuse astype/pad_channel/dimshuffle and etc opr '
                        'from h2d op')
    args = parser.parse_args()

    # with -d the input is a network and feeds are generated here;
    # otherwise the input is a previously pickled feeds dict
    if args.data:
        feeds = make_feeds(args)
    else:
        feeds = io.load(args.input)

    assert isinstance(feeds, dict) and feeds['testcases'], (
        'testcases can not be empty')

    env = FpropEnv(verbose_fprop=False)

    outputs = feeds['outputs']
    output_mgbvars = list(map(env.get_mgbvar, outputs))

    output_mgbvars = optimize_for_inference(args, output_mgbvars)

    # deterministic input ordering so testcase data matches the graph
    inputs = sorted(((i.name, i.dtype) for i in
                     NodeFilter.make_all_deps(*outputs).data_provider()))
    if args.discard_var_name:
        sereg_kwargs = dict(keep_var_name=0, keep_param_name=False)
    else:
        sereg_kwargs = dict(keep_var_name=2, keep_param_name=True)

    # file layout: magic, testcase count, serialized graph, then one
    # serialized graph of shared input values per testcase (all appended)
    with open(args.output, 'wb') as fout:
        fout.write(b'mgbtest0')
        fout.write(struct.pack('I', len(feeds['testcases'])))
    stat = mgb.serialize_comp_graph_to_file(
        args.output, output_mgbvars, append=True,
        output_strip_info=args.output_strip_info,
        **sereg_kwargs)
    logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'.
                format(stat.tot_bytes / 1024,
                       (stat.tot_bytes - stat.tensor_value_bytes) / 1024))

    for testcase in feeds['testcases']:
        assert isinstance(testcase, dict)
        cg = mgb.comp_graph()
        cn = mgb.comp_node('cpux')
        output_mgbvars = []
        for name, dtype in inputs:
            output_mgbvars.append(cg.make_shared(cn, value=testcase.pop(name),
                                                 dtype=dtype))
        assert not testcase, 'extra inputs provided in testcase: {}'.format(
            testcase.keys())

        mgb.serialize_comp_graph_to_file(
            args.output,
            output_mgbvars,
            append=True,
            output_strip_info=args.output_strip_info,
            append_json=True)
|
|
|
|
|
|
|
# Script entry point.
if __name__ == '__main__':
    main()