diff --git a/python_module/megengine/_internal/__init__.py b/python_module/megengine/_internal/__init__.py index 95a68965..9201a5da 100644 --- a/python_module/megengine/_internal/__init__.py +++ b/python_module/megengine/_internal/__init__.py @@ -539,7 +539,7 @@ def optimize_for_inference( f16_io_comp=False, use_nhwcd4=False, fuse_conv_bias_nonlinearity=False, - use_tensor_core=False, + use_nchw32=False, fuse_conv_bias_with_z=False, use_nchw88=False, use_nchw44=False @@ -564,6 +564,8 @@ def optimize_for_inference( times. :param use_nchw44: whether to use NCHW44 tensor format. This maybe faster some times. + :param use_nchw32: whether to use NCHW32 tensor format. Mainly used for + nvidia tensorcore. :return: list of transformed vars corresponding to given output vars @@ -575,15 +577,28 @@ def optimize_for_inference( for i in [ "f16_io_f32_comp", "f16_io_comp", - "use_nhwcd4", "fuse_conv_bias_nonlinearity", - "use_tensor_core", "fuse_conv_bias_with_z", - "use_nchw88", - "use_nchw44", ]: if settings[i]: getattr(opt, "enable_{}".format(i))() + + layout_tranform = None + for k, v in { + "use_nhwcd4": "nchw2nhwcd4", + "use_nchw32": "nchw2nchw32", + "use_nchw88": "nchw2nchw88", + "use_nchw44": "nchw2nchw44", + }.items(): + if settings[k]: + assert ( + not layout_tranform + ), "Only one layout transform supported, both {} and {}".format( + layout_tranform, k + ) + getattr(opt, "enable_{}".format(v))() + layout_tranform = k + vec = _detail._VectorSymbolVar() for i in output_vars: assert isinstance(i, _detail.SymbolVar), "bad var: {}".format(i) diff --git a/python_module/src/swig/misc.i b/python_module/src/swig/misc.i index 0b96f763..7b765949 100644 --- a/python_module/src/swig/misc.i +++ b/python_module/src/swig/misc.i @@ -71,15 +71,19 @@ class _PersistentCache { }; struct _OptimizeForInferenceOptions { -#define SET(n) void enable_##n() - SET(f16_io_f32_comp); - SET(f16_io_comp); - SET(fuse_conv_bias_nonlinearity); - SET(use_nhwcd4); - SET(use_tensor_core); - SET(fuse_conv_bias_with_z); - SET(use_nchw88); - SET(use_nchw44); +#define SET(n) void enable_##n(); + SET(f16_io_f32_comp); + SET(f16_io_comp); + SET(fuse_conv_bias_nonlinearity); + SET(fuse_conv_bias_with_z); +#undef SET +#define SET(_trans, _trans_capital) \ + void enable_##_trans(); \ + + SET(nchw2nhwcd4, NCHW2NHWCD4); + SET(nchw2nchw88, NCHW2NCHW88); + SET(nchw2nchw44, NCHW2NCHW44); + SET(nchw2nchw32, NCHW2NCHW32); #undef SET }; diff --git a/sdk/load-and-run/dump_with_testcase_mge.py b/sdk/load-and-run/dump_with_testcase_mge.py index a87ade52..3d67486b 100755 --- a/sdk/load-and-run/dump_with_testcase_mge.py +++ b/sdk/load-and-run/dump_with_testcase_mge.py @@ -255,7 +255,7 @@ def optimize_for_inference(args, outputs): 'enable_nchw88': 'use_nchw88', 'enable_nchw44': 'use_nchw44', 'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', - 'enable_tensorcore': 'use_tensor_core', + 'enable_nchw32': 'use_nchw32', 'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', } kwargs = {} @@ -393,7 +393,7 @@ def main(): 'for inference' ) parser.add_argument( - '--enable-tensorcore', + '--enable-nchw32', action='store_true', help='transform the model format from NCHW4 to NCHW32 ' 'for inference on nvidia TensoCore' diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp index 2703fffd..a981a84d 100644 --- a/src/gopt/impl/framework.cpp +++ b/src/gopt/impl/framework.cpp @@ -642,21 +642,6 @@ GraphOptimizer& GraphOptimizer::add_preset_passes( add_pass(); add_pass(cv_type); - if (inference_opt) { - if (inference_opt->use_nhwcd4) 
{ - add_pass(ConvertFormatPass::make_nhwcd4_converter()); - } - if (inference_opt->f16_io_f32_comp) { - add_pass(ConvertF32ToF16Pass::make(true)); - } - if (inference_opt->f16_io_comp) { - add_pass(ConvertF32ToF16Pass::make(false)); - } - - // fuse again after reordering - add_pass(); - } - add_pass(); // reorder again because shapes of fused oprs might change add_pass(cv_type); @@ -687,32 +672,7 @@ GraphOptimizer& GraphOptimizer::add_preset_passes( } #endif - if (inference_opt) { - if (inference_opt->fuse_conv_bias_nonlinearity) - add_pass(); - if (inference_opt->fuse_conv_bias_with_z) { - mgb_assert(inference_opt->fuse_conv_bias_nonlinearity, - "fuse conv bias with z input should fuse conv bias " - "activation " - "first"); - add_pass(); - } - if (inference_opt->use_nchw88) { - add_pass(EnableNchwxxPass::make_nchwxx_converter(8)); - } - if (inference_opt->use_nchw44) { - add_pass(EnableNchwxxPass::make_nchwxx_converter(4)); - } - if (inference_opt->use_tensor_core) { - mgb_assert(inference_opt->fuse_conv_bias_nonlinearity, - "enable tensor core should fuse conv bias activation " - "first"); - add_pass(EnableTensorCorePass::make_tensorcore_converter()); - add_pass(); - add_pass(); - } - add_pass(); - } + apply_optimize_options(inference_opt); if (inference_opt) { // merge params to reduce loading time and graph overhead @@ -739,6 +699,42 @@ VarNode* GraphOptimizer::var_replace_lookup(VarNode *var) { } } +void GraphOptimizer::apply_optimize_options( + const OptimizeOptions* options) { + if (!options) return; + if (options->f16_io_comp) { + add_pass(ConvertF32ToF16Pass::make(false)); + } + if (options->f16_io_f32_comp) { + add_pass(ConvertF32ToF16Pass::make(true)); + } + if (options->transform_nchw2nhwcd4()) { + add_pass(ConvertFormatPass::make_nhwcd4_converter()); + add_pass(); + } + if (options->transform_nchw2nchw88()) { + add_pass(EnableNchwxxPass::make_nchwxx_converter(8)); + } + if (options->transform_nchw2nchw44()) { + add_pass(EnableNchwxxPass::make_nchwxx_converter(4)); + } + if (options->transform_nchw2nchw32()) { + add_pass(); + add_pass(EnableTensorCorePass::make_tensorcore_converter()); + add_pass(); + add_pass(); + } + + if (options->fuse_conv_bias_nonlinearity) { + add_pass(); + } + if (options->fuse_conv_bias_with_z) { + add_pass(); + add_pass(); + } + add_pass(); +} + /* ================ ConstVarPropogateBase ================ */ ConstVarPropogateBase::AddOprResult ConstVarPropogateBase::add_opr( diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index 0579eaa7..c797be79 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -1770,7 +1770,7 @@ public: return reformat.node(); }; - + m_reformat[std::make_pair(TensorFormat::CHWN4, TensorFormat::NCHW4)] = [](VarNode* inp) -> VarNode* { megdnn::param::RelayoutFormat param; diff --git a/src/gopt/include/megbrain/gopt/framework.h b/src/gopt/include/megbrain/gopt/framework.h index 19027251..cd0b3015 100644 --- a/src/gopt/include/megbrain/gopt/framework.h +++ b/src/gopt/include/megbrain/gopt/framework.h @@ -377,6 +377,57 @@ namespace gopt { RecursiveSubGraphRewriteHelper(OptState &state); }; + /** + * \brief common optimize options, it both can be used for optimize for + * inference in graph dump but also used in graph optimization in runtime. + */ + struct OptimizeOptions { + //! whether to enable IO in float16 compute in float32 + bool f16_io_f32_comp = false; + //! whether to enable tranform to pure float16 model + bool f16_io_comp = false; + //! 
whether to enable conv bias nonlinearity fusion + bool fuse_conv_bias_nonlinearity = false; + enum LayoutTransform : uint32_t { + DEFAULT, + NCHW2NHWCD4, ///< compute using NHWCD4 tensor format + NCHW2NCHW88, ///< compute using NCHW88 tensor format + NCHW2NCHW44, ///< compute using NCHW44 tensor format + NCHW2NCHW32, ///< compute using NCHW32 tensor format, used for + ///< tensorcore + }; + LayoutTransform layout_transform = LayoutTransform::DEFAULT; + //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b) + //! + z -> conv_bias(x, w, b, z) + bool fuse_conv_bias_with_z = false; + +#define SET(n) \ + OptimizeOptions& enable_##n() { \ + n = true; \ + return *this; \ + } + SET(f16_io_f32_comp); + SET(f16_io_comp); + SET(fuse_conv_bias_nonlinearity); + SET(fuse_conv_bias_with_z); +#undef SET +#define SET(_trans, _trans_capital) \ + OptimizeOptions& enable_##_trans() { \ + layout_transform = LayoutTransform::_trans_capital; \ + return *this; \ + } \ + bool transform_##_trans() const { \ + return layout_transform == LayoutTransform::_trans_capital; \ + } + + SET(nchw2nhwcd4, NCHW2NHWCD4); + SET(nchw2nchw88, NCHW2NCHW88); + SET(nchw2nchw44, NCHW2NCHW44); + SET(nchw2nchw32, NCHW2NCHW32); +#undef SET + }; + + /*! * \brief manage passes and their applying on graphs * @@ -465,6 +516,11 @@ namespace gopt { * var_replace_map(var->owner_graph()) corresponding to var */ static VarNode* var_replace_lookup(VarNode *var); + + /** + * \brief apply optimize options + */ + void apply_optimize_options(const OptimizeOptions* options); }; /*! diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h index af86d61b..773d05ec 100644 --- a/src/gopt/include/megbrain/gopt/inference.h +++ b/src/gopt/include/megbrain/gopt/inference.h @@ -256,40 +256,7 @@ namespace gopt { size_t pack_c_size); }; - struct OptimizeForInferenceOptions { - //! whether to enable IO in float16 compute in float32 - bool f16_io_f32_comp = false; - //! whether to enable tranform to pure float16 model - bool f16_io_comp = false; - //! whether to enable conv bias nonlinearity fusion - bool fuse_conv_bias_nonlinearity = false; - //! whether to compute using NHWCD4 tensor format - bool use_nhwcd4 = false; - //! whether to compute using NCHW88 tensor format - bool use_nchw88 = false; - //! whether to compute using NCHW44 tensor format - bool use_nchw44 = false; - //! whether to enable tensor core - bool use_tensor_core = false; - //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b) - //! + z -> conv_bias(x, w, b, z) - bool fuse_conv_bias_with_z = false; - -#define SET(n) \ - OptimizeForInferenceOptions& enable_##n() { \ - n = true; \ - return *this; \ - } - SET(f16_io_f32_comp); - SET(f16_io_comp); - SET(fuse_conv_bias_nonlinearity); - SET(use_nhwcd4); - SET(use_tensor_core); - SET(fuse_conv_bias_with_z); - SET(use_nchw88); - SET(use_nchw44); -#undef SET - }; + struct OptimizeForInferenceOptions : OptimizeOptions {}; /*! 
* \brief optimize a computing graph for inference diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 7d6456ac..b271b764 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -635,10 +635,9 @@ TEST(TestGoptInference, Float16IOFloat32Compute) { y = opr::Concat::make({y, -y}, 0); y = opr::Reduce::make(y, {}, y.make_scalar(1)); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_f32_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Float32()); HostTensorND host_y, host_y_opt; @@ -683,10 +682,9 @@ TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) { TensorShape out_shp{20, 20}; auto y = opr::WarpPerspective::make(a, mat, out_shp); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_f32_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Float32()); HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y, host_y), @@ -723,10 +721,9 @@ TEST(TestGoptInference, Float16IOFloat32ComputeRemap) { auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map"); auto y = opr::Remap::make(a, map); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_f32_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Float32()); HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y, host_y), @@ -770,10 +767,9 @@ TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) { TensorShape out_shp{20, 20}; auto y = opr::WarpPerspective::make(a, mat, out_shp); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Uint8()); HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y, host_y), @@ -801,10 +797,9 @@ TEST(TestGoptInference, Float32TOFloat16) { y = opr::Reduce::make(y, {}, y.make_scalar(1)); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); return y_opt; }; @@ -857,10 +852,9 @@ TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) { auto y = d0 + b; SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); return y_opt; }; @@ -897,7 +891,7 @@ TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) { TEST(TestGoptInference, Float32TOFloat16Linspace) 
{ CompNode cn = CompNode::load("cpu0"); HostTensorGenerator<> gen(0, 1, 0); - auto host_x = gen({3, 1}, cn); + auto host_x = gen({3, 1}, cn); auto graph = ComputingGraph::make(); auto make_f32_to_f16_graph = [&]() { @@ -916,10 +910,9 @@ TEST(TestGoptInference, Float32TOFloat16Linspace) { auto mm = opr::MatrixMul::make(x, y); SymbolVar mm_opt; - unpack_vector(gopt::optimize_for_inference( - {mm}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - mm_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt); return mm_opt; }; @@ -998,11 +991,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4) { y = opr::Convolution::make(elem, w2, param); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4, find_opr(y_opt).param().format); @@ -1059,11 +1050,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) { y = opr::Convolution::make(group_local, w5, param); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4, find_opr(y_opt).param().format); @@ -1112,11 +1101,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) { y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {}); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::Convolution::Param::Format::NCHW, find_opr(y_opt).param().format); @@ -1159,11 +1146,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) { OperatorNodeConfig{dtype::QuantizedS8(0.2f)}); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::ConvBias::Param::Format::NHWCD4, find_opr(y_opt).param().format); @@ -1213,11 +1198,9 @@ TEST(TestGoptInference, ConvertFormatPadIC) { auto w1 = mkcvar("w1", {12, 12, 3, 3}); auto y = opr::Convolution::make(concat, w1, param); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); HostTensorND host_y_opt, host_y; auto func = graph->compile({make_callback_copy(y, host_y), @@ -1301,11 +1284,9 @@ TEST(TestGoptInference, ConvBiasNonlinearityFusePass) { opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU), y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp; SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - 
.enable_use_nhwcd4() - .enable_fuse_conv_bias_nonlinearity()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4().enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(3u, find_opr(y_opt).input().size()); graph->compile({{y_opt, {}}}) ->to_json() @@ -1533,15 +1514,16 @@ TEST(TestEnableTensorCore, SmallInputShape) { SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nchw32().enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); + } auto nr_dimshuffle = find_opr_num(y_opt); ASSERT_EQ(2u, nr_dimshuffle); HostTensorND host_y, host_y_opt; @@ -1597,15 +1579,16 @@ TEST(TestEnableTensorCore, ConvBiasWithZ) { SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); + } HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y_no_tc, host_y), make_callback_copy(y_opt, host_y_opt)}); @@ -1664,15 +1647,16 @@ TEST(TestGoptInference, EnableTensorCore) { y4 = opr::TypeCvt::make(y4, dtype::Float32()); SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y4}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y4}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc); + } auto nr_dimshuffle = find_opr_num(y_opt); ASSERT_EQ(3u, nr_dimshuffle); graph->compile({{y_opt, {}}}) @@ -1763,15 +1747,17 @@ TEST(FuseConvBiasZPass, BlockFuse) { SymbolVar z_fuse; SymbolVar z_nonfuse; - unpack_vector(gopt::optimize_for_inference( - {z}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_fuse_conv_bias_with_z()), - z_fuse); - unpack_vector(gopt::optimize_for_inference( - {z4}, gopt::OptimizeForInferenceOptions{} - 
.enable_fuse_conv_bias_nonlinearity()), - z_nonfuse); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity() + .enable_fuse_conv_bias_with_z(); + unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({z4}, options), z_nonfuse); + } auto nr_elem_multi_type = find_opr_num(z_fuse); MGB_MARK_USED_VAR(nr_elem_multi_type); ASSERT_EQ(1u, nr_elem_multi_type); @@ -1867,15 +1853,16 @@ TEST(TestEnableTensorCore, ShuffleMerge) { SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); + } auto nr_dimshuffle = find_opr_num(y_opt); ASSERT_EQ(3u, nr_dimshuffle); HostTensorND host_y, host_y_opt; @@ -1932,13 +1919,13 @@ TEST(FuseConvBiasZPass, Basic) { opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) { auto y1 = opr::ElemwiseMultiType::make( {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - unpack_vector( - gopt::optimize_for_inference( - {y1}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_fuse_conv_bias_with_z() - .enable_use_tensor_core()), - y_opt); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity() + .enable_fuse_conv_bias_with_z() + .enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt); + } auto nr_elemwisemultitype = find_opr_num(y_opt); if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) { ASSERT_NE(0u, nr_elemwisemultitype); @@ -1949,13 +1936,14 @@ TEST(FuseConvBiasZPass, Basic) { auto y2 = opr::ElemwiseMultiType::make( {y1, b2}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - unpack_vector( - gopt::optimize_for_inference( - {y2}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_fuse_conv_bias_with_z() - .enable_use_tensor_core()), - y_opt); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity() + .enable_fuse_conv_bias_with_z() + .enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y2}, options), + y_opt); + } auto nr_elemwisemultitype = find_opr_num(y_opt); ASSERT_NE(0u, nr_elemwisemultitype); @@ -2401,11 +2389,11 @@ TEST(TestGoptInference, ConvertFormatNCHW88) { y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nchw88()), - y_opt); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nchw88(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88, find_opr(y_opt).param().format); @@ -2483,11 +2471,9 @@ TEST(TestGoptInference, ConvertFormatNCHW44) { y 
= opr::ConvBias::make(conv5, w6, b6, param_conv_bias); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nchw44()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nchw44(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::ConvBias::Param::Format::NCHW44, find_opr(y_opt).param().format); diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 8eecfaa6..f28509ef 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -495,7 +495,7 @@ TEST(TestOprDNN, ConvolutionBackwardFilter) { Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW}); dest[0] = *out; }; - + #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1) #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \ { TensorShape{N, IC, IH, IW}, \ @@ -1282,9 +1282,10 @@ TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) { *graph, inp[i]); } + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); auto y = gopt::optimize_for_inference({make_graph(inputs)[0]}, - gopt::OptimizeForInferenceOptions{}.enable_fuse_conv_bias_nonlinearity())[0]; - //gopt::OptimizeForInferenceOptions{})[0]; + options)[0]; auto func = graph->compile({make_callback_copy(y, dest[0])}); func->execute(); func->wait(); @@ -1720,7 +1721,7 @@ TEST(TestOprDNN, DeformableConvForward) { } }; //! generate offset to avoid value near integer - /// because bilinear function is not derivable over there + /// because bilinear function is not derivable over there checker.set_input_generator(2, gen_off); checker.set_input_dtype(0, dtype::Float32()); checker.set_input_dtype(1, dtype::Float32()); diff --git a/src/opr/test/io.cpp b/src/opr/test/io.cpp index 08b60cdb..7bff8f77 100644 --- a/src/opr/test/io.cpp +++ b/src/opr/test/io.cpp @@ -500,10 +500,10 @@ TEST(TestOprIO, MultipleDeviceTensorWithFormatHolderCpu) { conv2 = opr::Convolution::make(conv1, w2, param); auto y = opr::Elemwise::make({conv2}, opr::Elemwise::Param::Mode::RELU); - SymbolVar y_opt = gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_use_nhwcd4())[0] - .rename("out"); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + SymbolVar y_opt = + gopt::optimize_for_inference({y}, options)[0].rename("out"); auto dumper = serialization::GraphDumper::make( serialization::OutputFile::make_fs(fname.c_str()));
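
Note (not part of the patch): a minimal C++ usage sketch of the options API introduced above, mirroring the updated tests. The wrapper function name is hypothetical; the calls (`enable_fuse_conv_bias_nonlinearity()`, `enable_nchw2nchw32()`, `gopt::optimize_for_inference()`) are taken from this diff. NCHW32 targets NVIDIA TensorCore and, per the patch, expects conv-bias-nonlinearity fusion to be enabled as well; only one layout transform can be selected at a time, since the enable_nchw2* setters all write the single `layout_transform` enum field.

    // hypothetical helper, assuming the post-patch headers and signatures
    #include "megbrain/gopt/inference.h"

    mgb::SymbolVarArray optimize_for_tensorcore(const mgb::SymbolVarArray& outputs) {
        auto options = mgb::gopt::OptimizeForInferenceOptions{};
        // chainable setters: fuse conv+bias+activation, then pick the
        // NCHW32 layout transform (layout_transform = NCHW2NCHW32)
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
        return mgb::gopt::optimize_for_inference(outputs, options);
    }

The same selection is exposed to Python through the renamed `use_nchw32` keyword of `optimize_for_inference` and the `--enable-nchw32` flag of dump_with_testcase_mge.py, replacing the former `use_tensor_core` / `--enable-tensorcore` names.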