From c49d3070ba478d242eeb2094c34229b839020c30 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Fri, 1 Jul 2022 16:58:22 +0800
Subject: [PATCH] refactor(imperative/ops): extends DnnOprCaller with template

GitOrigin-RevId: 402cba209a3d190285e09c54b7b658988f3bef3d
---
 dnn/include/megdnn/oprs/general.h | 19 +-
 dnn/include/megdnn/oprs/linalg.h | 6 +-
 dnn/include/megdnn/oprs/nn.h | 33 +-
 dnn/src/common/check_non_finite.cpp | 6 +-
 dnn/src/common/cond_take/opr_impl.cpp | 2 +-
 dnn/src/common/lamb.cpp | 6 +-
 dnn/src/cuda/check_non_finite/opr_impl.cpp | 8 +-
 dnn/src/cuda/check_non_finite/opr_impl.h | 2 +-
 dnn/src/cuda/cond_take/opr_impl.cpp | 3 +-
 dnn/src/cuda/cond_take/opr_impl.h | 3 +-
 dnn/src/cuda/param_pack/opr_impl.cpp | 4 +-
 dnn/src/cuda/param_pack/opr_impl.h | 2 +-
 dnn/src/naive/check_non_finite/opr_impl.h | 3 +-
 dnn/src/naive/cond_take/opr_impl.cpp | 3 +-
 dnn/src/naive/cond_take/opr_impl.h | 3 +-
 dnn/src/naive/param_pack/opr_impl.h | 2 +-
 dnn/src/rocm/param_pack/opr_impl.cpp | 4 +-
 dnn/src/rocm/param_pack/opr_impl.h | 2 +-
 dnn/test/common/cond_take.cpp | 2 +-
 dnn/test/common/opr_proxy.h | 7 +-
 dnn/test/cuda/param_pack.cpp | 2 +-
 dnn/test/rocm/param_pack.cpp | 2 +-
 imperative/src/impl/blob_manager_impl.cpp | 59 +--
 imperative/src/impl/blob_manager_impl.h | 12 +-
 imperative/src/impl/dnn_op_helper.h | 342 +++++++++++--
 .../src/impl/interpreter/interpreter_impl.cpp | 17 +-
 imperative/src/impl/ops/adaptive_pooling.cpp | 40 +-
 imperative/src/impl/ops/batch_norm.cpp | 87 +---
 imperative/src/impl/ops/cond_take.cpp | 27 +-
 imperative/src/impl/ops/convolution.cpp | 538 ++++-----------------
 imperative/src/impl/ops/elemwise.cpp | 83 ++--
 imperative/src/impl/ops/indexing.cpp | 41 +-
 imperative/src/impl/ops/io_remote.cpp | 11 +-
 imperative/src/impl/ops/lamb.cpp | 29 +-
 imperative/src/impl/ops/layer_norm.cpp | 43 +-
 imperative/src/impl/ops/matmul.cpp | 46 +-
 imperative/src/impl/ops/misc.cpp | 35 +-
 imperative/src/impl/ops/padding.cpp | 41 +-
 imperative/src/impl/ops/pooling.cpp | 48 +-
 imperative/src/impl/ops/reduce.cpp | 236 +++++----
 imperative/src/impl/ops/tensor_manip.cpp | 32 +-
 imperative/src/impl/ops/vision.cpp | 74 +--
 imperative/src/impl/physical_tensor.cpp | 8 +-
 imperative/src/impl/proxy_graph/mini_graph.h | 13 +-
 .../src/include/megbrain/imperative/blob_manager.h | 13 +-
 .../include/megbrain/imperative/physical_tensor.h | 21 +-
 .../src/include/megbrain/imperative/utils/helper.h | 35 +-
 .../include/megbrain/imperative/utils/platform.h | 6 +
 src/opr/impl/misc.cpp | 7 +-
 src/opr/impl/tensor_manip.cpp | 7 +-
 src/rdnn/impl/algo_chooser.cpp | 3 +-
 tools/format.py | 5 +-
 52 files changed, 912 insertions(+), 1171 deletions(-)

diff --git a/dnn/include/megdnn/oprs/general.h b/dnn/include/megdnn/oprs/general.h
index af414dca..9b556bc3 100644
--- a/dnn/include/megdnn/oprs/general.h
+++ b/dnn/include/megdnn/oprs/general.h
@@ -397,7 +397,8 @@ public:
 
     OutputDType infer_dtype(DType data, DType mask);
 
-    virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0;
+    virtual size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) = 0;
 
     virtual Output exec(
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
@@ -512,7 +513,8 @@ public:
     virtual void exec(
             _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayoutArray& srcs, TensorLayout& dst);
     virtual size_t
get_workspace_in_bytes( const TensorLayoutArray& srcs, const TensorLayout& dst) = 0; @@ -596,7 +598,7 @@ public: _megdnn_workspace workspace) = 0; virtual size_t get_workspace_in_bytes( - const TensorShapeArray& srcs, const TensorShape& offsets, + const TensorShape& srcs, const TensorShape& offsets, const TensorShape& dst) = 0; }; @@ -1145,7 +1147,7 @@ protected: /*! * \return axis on dst used by indexer (i.e. ExecInfo::idx_axis) */ - static size_t deduce_layout_fwd( + MGE_WIN_DECLSPEC_FUC static size_t deduce_layout_fwd( const TensorLayout& data, const IndexDescLayoutOnly& index, TensorLayout& dst); @@ -1362,9 +1364,10 @@ class CheckNonFinite : public OperatorBase { public: virtual size_t get_workspace_in_bytes( - const TensorNDArray& srcs, const TensorLayout& dst) = 0; + const TensorLayoutArray& srcs, const TensorLayout& dst) = 0; - void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst); + MGE_WIN_DECLSPEC_FUC void deduce_layout( + const TensorLayoutArray& srcs, TensorLayout& dst); virtual void exec( _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, @@ -1420,7 +1423,7 @@ public: } virtual size_t get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& dst) = 0; - void deduce_layout(const TensorLayout& src, TensorLayout& dst); + MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst); MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( const TensorLayout& src, TensorLayout& dst, const Param& p); @@ -1464,7 +1467,7 @@ public: const TensorLayout& m_t, const TensorLayout& v_t, const TensorLayout& new_param) = 0; - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& m_t_1, const TensorLayout& v_t_1, const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t, TensorLayout& v_t, TensorLayout& new_param); diff --git a/dnn/include/megdnn/oprs/linalg.h b/dnn/include/megdnn/oprs/linalg.h index b98642d0..65b7175f 100644 --- a/dnn/include/megdnn/oprs/linalg.h +++ b/dnn/include/megdnn/oprs/linalg.h @@ -27,7 +27,8 @@ public: _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, _megdnn_workspace workspace) = 0; MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C); - void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C); + MGE_WIN_DECLSPEC_FUC void deduce_layout( + const TensorLayout& A, const TensorLayout& B, TensorLayout& C); virtual size_t get_workspace_in_bytes( const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0; @@ -64,7 +65,8 @@ public: _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, _megdnn_workspace workspace) = 0; MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C); - void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C); + MGE_WIN_DECLSPEC_FUC void deduce_layout( + const TensorLayout& A, const TensorLayout& B, TensorLayout& C); virtual size_t get_workspace_in_bytes( const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0; diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index b117b7e1..93743087 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -224,9 +224,9 @@ public: const TensorLayout& src_layout, _megdnn_tensor_in filter, const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) = 0; - void deduce_dtype(DType src, DType filter, DType& dst); + MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType src, DType filter, DType& dst); - void 
deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst); /** @@ -300,7 +300,7 @@ public: const TensorLayout& grad) = 0; MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType filter, DType diff, DType& grad); - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); static Algorithm::OprType get_opr_type() { @@ -378,6 +378,12 @@ public: const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) = 0; + MGE_WIN_DECLSPEC_FUC void exec( + _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, + _megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + exec(src, filter, bias, z, dst, nullptr, workspace); + } + /** * \brief execute weight preprocessing, read weights form filter and bias, * write to preprocessed_filter after preprocessed. @@ -390,8 +396,9 @@ public: _megdnn_tensor_in bias, const TensorLayout& z_layout, const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) = 0; - void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst); - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_dtype( + DType src, DType filter, DType bias, DType z, DType& dst); + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, TensorLayout& dst); @@ -775,7 +782,7 @@ protected: void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); public: - MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( + static void deduce_layout_impl( const TensorLayout& src, const Param& param, TensorLayout& dst); }; @@ -791,7 +798,7 @@ public: virtual void exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; - void deduce_layout(const TensorLayout& src, TensorLayout& dst); + MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst); virtual size_t get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& dst) = 0; @@ -1253,7 +1260,7 @@ public: virtual void exec( _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst); virtual size_t get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, @@ -1281,18 +1288,16 @@ public: * \param[in] diff (n, oc, od, oh, ow) * \param[out] grad (n, ic, id, ih, iw) */ - MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( + static void deduce_layout_impl( const TensorLayout& filter, const TensorLayout& diff, const Param& param, TensorLayout& grad); - virtual void exec( _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; virtual size_t get_workspace_in_bytes( const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) = 0; - - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); static Algorithm::OprType get_opr_type() { @@ -1472,7 +1477,7 @@ public: virtual void exec( _megdnn_tensor_in src, _megdnn_tensor_in rois, _megdnn_tensor_out dst, _megdnn_tensor_out index, _megdnn_workspace workspace) = 0; - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& src, const 
TensorLayout& rois, TensorLayout& dst, TensorLayout& index); virtual size_t get_workspace_in_bytes( @@ -1963,7 +1968,7 @@ public: _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, _megdnn_workspace workspace) = 0; - void deduce_layout( + MGE_WIN_DECLSPEC_FUC void deduce_layout( const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, TensorLayout& rstd); diff --git a/dnn/src/common/check_non_finite.cpp b/dnn/src/common/check_non_finite.cpp index 2ad69721..784c3267 100644 --- a/dnn/src/common/check_non_finite.cpp +++ b/dnn/src/common/check_non_finite.cpp @@ -7,7 +7,11 @@ void CheckNonFinite::check_exec( const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes) { megdnn_assert_contiguous(dst.layout); megdnn_assert(srcs.size() > 0); - auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst.layout); + TensorLayoutArray src_layouts; + for (auto&& src : srcs) { + src_layouts.push_back(src.layout); + } + auto required_workspace_in_bytes = get_workspace_in_bytes(src_layouts, dst.layout); megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); } diff --git a/dnn/src/common/cond_take/opr_impl.cpp b/dnn/src/common/cond_take/opr_impl.cpp index 4057c123..d8e95797 100644 --- a/dnn/src/common/cond_take/opr_impl.cpp +++ b/dnn/src/common/cond_take/opr_impl.cpp @@ -11,7 +11,7 @@ size_t CondTake::check_exec_get_size( mask.TensorShape::to_string().c_str()); megdnn_assert(data.is_physical_contiguous() && mask.is_physical_contiguous()); megdnn_assert(m_param.eps > 0, "eps must be non-negative; got: %g", m_param.eps); - megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data)); + megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data, mask)); return data.total_nr_elems(); } diff --git a/dnn/src/common/lamb.cpp b/dnn/src/common/lamb.cpp index f837e800..264a8649 100644 --- a/dnn/src/common/lamb.cpp +++ b/dnn/src/common/lamb.cpp @@ -7,9 +7,9 @@ void LAMBUpdate::deduce_layout( const TensorLayout& m_t_1, const TensorLayout& v_t_1, const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t, TensorLayout& v_t, TensorLayout& new_param) { - m_t = TensorLayout(m_t_1); - v_t = TensorLayout(v_t_1); - new_param = TensorLayout(lamb_param); + m_t = m_t_1; + v_t = v_t_1; + new_param = lamb_param; MEGDNN_MARK_USED_VAR(grad); } diff --git a/dnn/src/cuda/check_non_finite/opr_impl.cpp b/dnn/src/cuda/check_non_finite/opr_impl.cpp index 2724abc9..807705b7 100644 --- a/dnn/src/cuda/check_non_finite/opr_impl.cpp +++ b/dnn/src/cuda/check_non_finite/opr_impl.cpp @@ -26,14 +26,14 @@ size_t CheckNonFiniteImpl::_get_workspace_in_bytes() { } size_t CheckNonFiniteImpl::get_workspace_in_bytes( - const TensorNDArray& srcs, const TensorLayout&) { + const TensorLayoutArray& srcs, const TensorLayout&) { m_size = 0; for (const auto& src : srcs) { - m_size += DIVUP(src.layout.total_nr_elems(), total_nr_elems_max); + m_size += DIVUP(src.total_nr_elems(), total_nr_elems_max); } - if (srcs.begin()->layout.dtype == dtype::Float32()) { + if (srcs.begin()->dtype == dtype::Float32()) { return _get_workspace_in_bytes(); - } else if (srcs.begin()->layout.dtype == dtype::Float16()) { + } else if (srcs.begin()->dtype == dtype::Float16()) { return _get_workspace_in_bytes(); } else { megdnn_log_warn("only support fp16 and fp32, fallback to fp32"); diff --git a/dnn/src/cuda/check_non_finite/opr_impl.h 
b/dnn/src/cuda/check_non_finite/opr_impl.h index 6e44014f..643c9b4e 100644 --- a/dnn/src/cuda/check_non_finite/opr_impl.h +++ b/dnn/src/cuda/check_non_finite/opr_impl.h @@ -19,7 +19,7 @@ public: using CheckNonFinite::CheckNonFinite; size_t get_workspace_in_bytes( - const TensorNDArray& srcs, const TensorLayout& dst) override; + const TensorLayoutArray& srcs, const TensorLayout& dst) override; bool is_thread_safe() const override { return true; } diff --git a/dnn/src/cuda/cond_take/opr_impl.cpp b/dnn/src/cuda/cond_take/opr_impl.cpp index aa375c83..7928c628 100644 --- a/dnn/src/cuda/cond_take/opr_impl.cpp +++ b/dnn/src/cuda/cond_take/opr_impl.cpp @@ -20,7 +20,8 @@ WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) { handle()->alignment_requirement()}; } -size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { +size_t CondTakeImpl::get_workspace_in_bytes( + const TensorLayout& data, const TensorLayout&) { return make_bundle(data.total_nr_elems()).total_size_in_bytes(); } diff --git a/dnn/src/cuda/cond_take/opr_impl.h b/dnn/src/cuda/cond_take/opr_impl.h index 5bc9400b..10c69e56 100644 --- a/dnn/src/cuda/cond_take/opr_impl.h +++ b/dnn/src/cuda/cond_take/opr_impl.h @@ -15,7 +15,8 @@ public: _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, DynOutMallocPolicyCall malloc_policy) override; - size_t get_workspace_in_bytes(const TensorLayout& data) override; + size_t get_workspace_in_bytes( + const TensorLayout& data, const TensorLayout& mask) override; }; } // namespace cuda diff --git a/dnn/src/cuda/param_pack/opr_impl.cpp b/dnn/src/cuda/param_pack/opr_impl.cpp index 12be8279..339f4aa8 100644 --- a/dnn/src/cuda/param_pack/opr_impl.cpp +++ b/dnn/src/cuda/param_pack/opr_impl.cpp @@ -6,8 +6,8 @@ namespace megdnn { namespace cuda { size_t ParamPackConcatImpl::get_workspace_in_bytes( - const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) { - return sizeof(size_t) * srcs.size(); + const TensorShape&, const TensorShape& offsets, const TensorShape&) { + return sizeof(size_t) * (offsets.shape[0] / 2); } template diff --git a/dnn/src/cuda/param_pack/opr_impl.h b/dnn/src/cuda/param_pack/opr_impl.h index 5c5b2c12..689ff502 100644 --- a/dnn/src/cuda/param_pack/opr_impl.h +++ b/dnn/src/cuda/param_pack/opr_impl.h @@ -12,7 +12,7 @@ public: _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( - const TensorShapeArray& srcs, const TensorShape& table, + const TensorShape& srcs, const TensorShape& table, const TensorShape& dst) override; private: diff --git a/dnn/src/naive/check_non_finite/opr_impl.h b/dnn/src/naive/check_non_finite/opr_impl.h index 0f9c782f..b8dd9df7 100644 --- a/dnn/src/naive/check_non_finite/opr_impl.h +++ b/dnn/src/naive/check_non_finite/opr_impl.h @@ -13,7 +13,8 @@ public: bool is_thread_safe() const override { return true; } - size_t get_workspace_in_bytes(const TensorNDArray&, const TensorLayout&) override { + size_t get_workspace_in_bytes( + const TensorLayoutArray&, const TensorLayout&) override { m_size = 0; return _get_workspace_in_bytes(); } diff --git a/dnn/src/naive/cond_take/opr_impl.cpp b/dnn/src/naive/cond_take/opr_impl.cpp index 823b46e3..40b3ecaf 100644 --- a/dnn/src/naive/cond_take/opr_impl.cpp +++ b/dnn/src/naive/cond_take/opr_impl.cpp @@ -38,7 +38,8 @@ void copy_data( } // anonymous namespace -size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { +size_t CondTakeImpl::get_workspace_in_bytes( + const TensorLayout& data, const TensorLayout&) { return (data.total_nr_elems() + 1) * 
sizeof(dt_int32); } diff --git a/dnn/src/naive/cond_take/opr_impl.h b/dnn/src/naive/cond_take/opr_impl.h index 7df8e4c5..7beabb78 100644 --- a/dnn/src/naive/cond_take/opr_impl.h +++ b/dnn/src/naive/cond_take/opr_impl.h @@ -11,7 +11,8 @@ class CondTakeImpl : public CondTake { public: using CondTake::CondTake; - size_t get_workspace_in_bytes(const TensorLayout& data) override; + size_t get_workspace_in_bytes( + const TensorLayout& data, const TensorLayout& mask) override; Output exec( _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, diff --git a/dnn/src/naive/param_pack/opr_impl.h b/dnn/src/naive/param_pack/opr_impl.h index 4068b6f8..432f32f6 100644 --- a/dnn/src/naive/param_pack/opr_impl.h +++ b/dnn/src/naive/param_pack/opr_impl.h @@ -11,7 +11,7 @@ public: _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( - const TensorShapeArray&, const TensorShape&, const TensorShape&) override { + const TensorShape&, const TensorShape&, const TensorShape&) override { return 0; } }; diff --git a/dnn/src/rocm/param_pack/opr_impl.cpp b/dnn/src/rocm/param_pack/opr_impl.cpp index 37cae7c3..b49aaa15 100644 --- a/dnn/src/rocm/param_pack/opr_impl.cpp +++ b/dnn/src/rocm/param_pack/opr_impl.cpp @@ -7,8 +7,8 @@ namespace megdnn { namespace rocm { size_t ParamPackConcatImpl::get_workspace_in_bytes( - const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) { - return sizeof(size_t) * srcs.size(); + const TensorShape&, const TensorShape& offsets, const TensorShape&) { + return sizeof(size_t) * (offsets.shape[0] / 2); } template diff --git a/dnn/src/rocm/param_pack/opr_impl.h b/dnn/src/rocm/param_pack/opr_impl.h index 6e72831c..e833c211 100644 --- a/dnn/src/rocm/param_pack/opr_impl.h +++ b/dnn/src/rocm/param_pack/opr_impl.h @@ -12,7 +12,7 @@ public: _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( - const TensorShapeArray& srcs, const TensorShape& table, + const TensorShape& srcs, const TensorShape& table, const TensorShape& dst) override; private: diff --git a/dnn/test/common/cond_take.cpp b/dnn/test/common/cond_take.cpp index 03890c82..0553490f 100644 --- a/dnn/test/common/cond_take.cpp +++ b/dnn/test/common/cond_take.cpp @@ -71,7 +71,7 @@ CondTakeTestcase::Result CondTakeTestcase::run(CondTake* opr) { opr->param() = m_param; DynOutMallocPolicyImpl malloc_policy(handle); - auto workspace_size = opr->get_workspace_in_bytes(data->layout); + auto workspace_size = opr->get_workspace_in_bytes(data->layout, mask->layout); auto workspace_ptr = malloc_policy.alloc_workspace(workspace_size, nullptr); auto result = opr->exec( *data, *mask, {(dt_byte*)workspace_ptr, workspace_size}, &malloc_policy); diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index 873859c6..baecd215 100644 --- a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -205,9 +205,14 @@ struct OprProxy { auto inps = tensors; inps.pop_back(); + TensorLayoutArray inp_layouts(inps.size()); + std::transform( + inps.begin(), inps.end(), inp_layouts.begin(), + [](const TensorND& tensor) { return tensor.layout; }); + WorkspaceWrapper W( opr->handle(), - opr->get_workspace_in_bytes(inps, tensors.back().layout)); + opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout)); opr->exec(inps, tensors.back(), W.workspace()); } }; diff --git a/dnn/test/cuda/param_pack.cpp b/dnn/test/cuda/param_pack.cpp index a67f2957..d4bb24ce 100644 --- a/dnn/test/cuda/param_pack.cpp +++ b/dnn/test/cuda/param_pack.cpp @@ -95,7 +95,7 @@ void test_param_pack_concat( 
test::WorkspaceWrapper workspace( handle, - concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size})); + concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size})); TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); diff --git a/dnn/test/rocm/param_pack.cpp b/dnn/test/rocm/param_pack.cpp index 8d1ef2a4..0aa51cd9 100644 --- a/dnn/test/rocm/param_pack.cpp +++ b/dnn/test/rocm/param_pack.cpp @@ -97,7 +97,7 @@ void test_param_pack_concat( test::WorkspaceWrapper workspace( handle, - concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size})); + concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size})); TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); diff --git a/imperative/src/impl/blob_manager_impl.cpp b/imperative/src/impl/blob_manager_impl.cpp index 1c5cecfa..4073a261 100644 --- a/imperative/src/impl/blob_manager_impl.cpp +++ b/imperative/src/impl/blob_manager_impl.cpp @@ -9,11 +9,8 @@ BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) { blob = in_blob; DeviceTensorStorage d_storage; d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage); - h_storage = HostTensorStorage(blob->m_comp_node); - h_storage.ensure_size(blob->m_size); - h_storage.copy_from(const_cast(d_storage), blob->m_size); } @@ -30,65 +27,36 @@ void BlobManagerImpl::unregister_blob(OwnedBlob* blob) { } void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) { - if (custom_allocator) { - blob->m_storage = custom_allocator(blob->m_comp_node, size); + if (m_custom_allocator) { + blob->m_storage = m_custom_allocator(blob->m_comp_node, size); return; } // try alloc - MGB_TRY { alloc_direct(blob, size); } // if fail, try defrag, alloc again - MGB_CATCH(MemAllocError&, { + if (!try_alloc_direct(blob, size)) { mgb_log_warn("memory allocation failed for blob; try defragmenting"); defrag(blob->m_comp_node); alloc_direct(blob, size); - }); + } } void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) { - DeviceTensorStorage storage(blob->m_comp_node); mgb_assert(blob->m_comp_node.valid()); + DeviceTensorStorage storage(blob->m_comp_node); storage.ensure_size(size); blob->m_storage = storage.raw_storage(); } -DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag( - CompNode cn, TensorLayout& layout) { - DeviceTensorND dev_tensor; - if (custom_allocator) { - DeviceTensorStorage storage(cn); - size_t sz = layout.dtype.size(layout.total_nr_elems()); - storage.reset(cn, sz, custom_allocator(cn, sz)); - dev_tensor.reset(storage, layout); - return dev_tensor; - } - MGB_TRY { dev_tensor = alloc_workspace(cn, layout); } - MGB_CATCH(MemAllocError&, { - mgb_log_warn("memory allocation failed for workspace; try defragmenting"); - defrag(cn); - dev_tensor = alloc_workspace(cn, layout); - }); - return dev_tensor; -}; - -DeviceTensorND BlobManagerImpl::alloc_workspace(CompNode cn, TensorLayout layout) { - DeviceTensorStorage storage(cn); - storage.ensure_size(layout.dtype.size(layout.total_nr_elems())); - DeviceTensorND dev_tensor; - dev_tensor.reset(storage, layout); - return dev_tensor; -} - void BlobManagerImpl::set_allocator(allocator_t allocator) { - custom_allocator = allocator; + m_custom_allocator = allocator; } void BlobManagerImpl::defrag(const CompNode& cn) { - BlobSetWithMux* blobs_set_ptr; - { + auto& blobs_set_ptr = ([&]() -> 
auto& { MGB_LOCK_GUARD(m_mtx); - blobs_set_ptr = &m_comp2blobs_map[cn]; - } - MGB_LOCK_GUARD(blobs_set_ptr->mtx); + return m_comp2blobs_map[cn]; + })(); + MGB_LOCK_GUARD(blobs_set_ptr.mtx); std::vector blob_data_arrary; std::set storage_set; @@ -96,7 +64,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) { size_t tot_sz = 0; // copy to HostTensorStorage, and release - for (auto i : blobs_set_ptr->blobs_set) { + for (auto i : blobs_set_ptr.blobs_set) { // skip if blob do not have m_storage if (!i->m_storage) continue; @@ -153,9 +121,6 @@ struct BlobManagerStub : BlobManager { void alloc_with_defrag(OwnedBlob* blob, size_t size) { mgb_assert(0, "prohibited after global variable destruction"); }; - DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) { - mgb_assert(0, "prohibited after global variable destruction"); - }; void register_blob(OwnedBlob* blob) { mgb_assert(0, "prohibited after global variable destruction"); }; @@ -163,7 +128,7 @@ struct BlobManagerStub : BlobManager { void defrag(const CompNode& cn) { mgb_assert(0, "prohibited after global variable destruction"); }; - virtual void set_allocator(allocator_t allocator) { + void set_allocator(allocator_t allocator) { mgb_assert(0, "prohibited after global variable destruction"); }; }; diff --git a/imperative/src/impl/blob_manager_impl.h b/imperative/src/impl/blob_manager_impl.h index d20e0ddc..9c8a068f 100644 --- a/imperative/src/impl/blob_manager_impl.h +++ b/imperative/src/impl/blob_manager_impl.h @@ -27,27 +27,21 @@ class BlobManagerImpl final : public BlobManager { std::mutex m_mtx; CompNode::UnorderedMap m_comp2blobs_map; - - void defrag(const CompNode& cn) override; + BlobManager::allocator_t m_custom_allocator; void alloc_direct(OwnedBlob* blob, size_t size) override; - DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout); - - BlobManager::allocator_t custom_allocator; - public: static BlobManager* inst(); void alloc_with_defrag(OwnedBlob* blob, size_t size) override; - DeviceTensorND alloc_workspace_with_defrag( - CompNode cn, TensorLayout& layout) override; - void register_blob(OwnedBlob* blob) override; void unregister_blob(OwnedBlob* blob) override; + void defrag(const CompNode& cn) override; + void set_allocator(allocator_t allocator) override; }; diff --git a/imperative/src/impl/dnn_op_helper.h b/imperative/src/impl/dnn_op_helper.h index 029207a8..4b563ce6 100644 --- a/imperative/src/impl/dnn_op_helper.h +++ b/imperative/src/impl/dnn_op_helper.h @@ -1,79 +1,331 @@ -#pragma once +#include +#include + +#include "algo_chooser.h" #include "megbrain/comp_node.h" #include "megbrain/comp_node_env.h" +#include "megbrain/imperative/blob_manager.h" #include "megbrain/imperative/physical_tensor.h" +#include "megbrain/imperative/utils/helper.h" +#include "megbrain/imperative/utils/platform.h" #include "megbrain/rdnn/management.h" - -using namespace megdnn; +#include "megdnn/basic_types.h" namespace mgb { namespace imperative { /*! 
- * \brief A struct for safely calling DNN oprs - * In some cases, op may be released before the complete of the execution - * This destructor will prevent this + * /brief Helps deduce layout and dtype */ template -struct DnnOprCaller { - CompNode cn; - DeviceTensorND dev_tensor; - Workspace workspace; - mgb::opr::intl::UniqPtrWithCN op; +class DnnOprDeducer { +private: + Opr* m_opr; - DnnOprCaller(CompNode cn) : cn(cn), op(std::move(create_operator(cn))) {} +public: + DnnOprDeducer(Opr* opr) : m_opr(opr) { mgb_assert(opr); } - static mgb::opr::intl::UniqPtrWithCN create_operator(CompNode cn) { - return mgb::opr::intl::create_megdnn_opr(cn); + // FIXME: maybe in-place style deduction works better + template + TensorLayout deduce_layout(TArgs&&... args) { + static_assert((std::is_convertible_v && ...)); + TensorLayout output_layout; + m_opr->deduce_layout(args..., output_layout); + return output_layout; } - Workspace create_workspace(size_t sz) { - if (workspace.raw_ptr) { - mgb_throw(MegBrainError, "workspace should not be applicated many times"); - } - if (sz) { - TensorLayout layout({sz}, dtype::Byte()); - dev_tensor = Tensor::make(layout, cn)->dev_tensor(); - workspace = megdnn::Workspace( - dev_tensor.raw_ptr(), dev_tensor.storage().size()); + template + TensorLayout deduce_layout_fallible(TArgs&&... args) { + static_assert((std::is_convertible_v && ...)); + TensorLayout output_layout; + bool success = (args.ndim * ...) > 0; + if (success) { + m_opr->deduce_layout(args..., output_layout); + } else { + m_opr->deduce_dtype(args.dtype..., output_layout.dtype); } - return workspace; + return output_layout; } - ~DnnOprCaller() { + template + std::array deduce_layouts(TArgs&&... args) { + static_assert((std::is_convertible_v && ...)); + std::array layouts; + std::apply( + [&](auto&&... outputs) { m_opr->deduce_layout(args..., outputs...); }, + layouts); + return layouts; + } +}; + +/*! + * /brief Declare an abstract operator and initialize it's param + */ +template +class DnnOprStub { +private: + // TODO: make opr concrete + std::aligned_storage_t m_storage; + + using Param = typename Opr::Param; + +private: + DnnOprStub() { new (¶m()) Param(); } + +public: + DnnOprStub(const Param& param) { this->param() = param; } + + // undefined behavior + Opr& opr() { return *reinterpret_cast(&m_storage); } + + auto& param() { return opr().param(); } + + auto& param() const { return opr().param(); } + + ~DnnOprStub() { param().~Param(); } +}; + +/*! 
+ * /brief Deduce layout without create concrete opr + */ +template +class DnnOprHelper : public DnnOprStub, public DnnOprDeducer { +private: + using Stub = DnnOprStub; + using Deducer = DnnOprDeducer; + +public: + DnnOprHelper(const typename Opr::Param& param) + : Stub(param), Deducer(&Stub::opr()) {} +}; + +// hold a concrete operator in given comp_node +template +class DnnOprHolder { +private: + CompNode m_comp_node; + opr::intl::UniqPtrWithCN m_opr = + opr::intl::create_megdnn_opr(m_comp_node); + +public: + DnnOprHolder(CompNode comp_node) : m_comp_node(comp_node) {} + + auto& op() { return m_opr; } + + auto comp_node() { return m_comp_node; } + + auto& param() { return m_opr->param(); } + + auto& param() const { return m_opr->param(); } + + ~DnnOprHolder() { using DT = CompNode::DeviceType; - if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) { - CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( - [p = op.release()] { delete p; }); + + if (m_comp_node.device_type() == DT::CPU && + m_comp_node != CompNode::default_cpu()) { + CompNodeEnv::from_comp_node(m_comp_node) + .cpu_env() + .dispatch([p = m_opr.release()] { delete p; }); + } + } +}; + +/*! + * /brief Prevent binary float + */ +class DnnOprCallerBase { +protected: + static auto&& get_layout(const megdnn::TensorND& tensor) { return tensor.layout; } + + static auto get_layout(const megdnn::TensorNDArray& tensors) { + SmallVector layouts; + for (auto&& tensor : tensors) { + layouts.push_back(tensor.layout); } + return layouts; } }; -template -class MegDNNDynOutMallocImpl final : public megdnn::DynOutMallocPolicy { - using Output = std::array; +/*! + * \brief A struct for safely calling DNN oprs + * + * In some cases, op may be released before the complete of the execution + * This destructor will prevent this + */ +template +class DnnOprCaller final : public DnnOprHolder, + public DnnOprDeducer, + public DnnOprCallerBase { +private: + using Holder = DnnOprHolder; + using Deducer = DnnOprDeducer; + using Base = DnnOprCallerBase; + + std::optional m_workspace; + std::optional m_policy; - CompNode m_cn; - Output m_out; + megdnn::Workspace create_workspace(size_t sz) { + mgb_assert( + !m_workspace, "workspace asked more than once by op: %s", + demangled_typename()); + dt_byte* ptr = nullptr; + if (sz) { + TensorLayout layout({sz}, dtype::Byte()); + m_workspace.emplace( + Tensor::make(layout, Holder::comp_node())->dnn_tensor()); + ptr = reinterpret_cast(m_workspace->raw_ptr()); + } + return {ptr, sz}; + } public: - MegDNNDynOutMallocImpl(CompNode cn) : m_cn{cn} {} - - megdnn::TensorND alloc_output( - size_t id, DType dtype, const TensorShape& shape, - void* user_data) override { - TensorLayout m_layout(shape, dtype); - m_out[id] = Tensor::make(m_layout, m_cn); - return m_out[id]->dev_tensor().as_megdnn(); + using Param = typename Opr::Param; + + DnnOprCaller(CompNode cn) : Holder(cn), Deducer(Holder::op().get()) {} + + DnnOprCaller(CompNode cn, const Param& param) : DnnOprCaller(cn) { + Holder::param() = param; + } + + DnnOprCaller(CompNode cn, const Param& param, megdnn::param::ExecutionPolicy policy) + : DnnOprCaller(cn, param) { + m_policy.emplace(policy); } - void* alloc_workspace(size_t sz, void* user_data) override { - return m_cn.alloc_device(sz); + /** + * /brief Convert TensorPtr args to megdnn::TensorND and call f + * + */ + template + auto call_dnn(TFunctor&& f, TArgs&&... args) { + std::optional>> input_ptrs; + // recursive convert: + // 1. TensorPtr to DnnTensorND (subclass of megdnn::TensorND) ; + // 2. 
DeviceTensorND, HostTensorND to megdnn::TensorND ; + // 3. SmallVector of above to SmallVector . + auto to_dnn = [&](auto&& arg, auto&& to_dnn) { + using T = decltype(arg); + if constexpr (std::is_convertible_v) { + return arg->dnn_tensor(); + } else if constexpr ( + std::is_convertible_v || + std::is_convertible_v) { + return arg.as_megdnn(); + } else if constexpr ( + std::is_convertible_v || + std::is_convertible_v>) { + return std::forward(arg); + } else if constexpr (is_small_vector_v>) { + using TItem = std::decay_t; + SmallVector dnn_tensors; + for (auto&& tensor : arg) { + if constexpr (std::is_same_v) { + if (!input_ptrs) { + input_ptrs.emplace(); + } + auto dnn_tensor = to_dnn(tensor, to_dnn); + input_ptrs->push_back(std::move(dnn_tensor.reference)); + dnn_tensors.push_back(std::move(dnn_tensor)); + } else if constexpr (std::is_same_v) { + dnn_tensors.push_back(to_dnn(tensor, to_dnn)); + } else { + static_assert(!std::is_same_v); + } + } + return dnn_tensors; + } else { + static_assert(!std::is_same_v); + } + }; + return f(to_dnn(std::forward(args), to_dnn)...); } - void free_workspace(void* ptr, void* user_data) override { m_cn.free_device(ptr); } + // common execution (opr->exec(inputs..., outputs...)) + template + void exec(TArgs&&... args) { + call_dnn( + [this](auto&&... args) { + Holder::op()->exec(std::forward(args)...); + }, + std::forward(args)...); + } + + // execution fastrun opr + // (opr->exec(inputs..., outputs..., create_ws(setup_algo(...)))) + template + void exec_fastrun(TArgs&&... args) { + call_dnn( + [&](auto&&... args) { + using FixedTensorLayouts = + typename rdnn::AlgoChooser::FixedTensorLayouts; + SmallVector dnn_inputs = {args...}; + mgb_assert(m_policy, "policy not set"); + size_t workspace_size = setup_algo( + FixedTensorLayouts{args.layout...}, Holder::op().get(), 0, + false, false, Holder::comp_node(), *m_policy, false, + &dnn_inputs); + Holder::op()->exec( + std::forward(args)..., + create_workspace(workspace_size)); + }, + std::forward(args)...); + } + + // execute with fixed workspace + // (opr->exec(input..., outputs..., create_ws(get_workspace_in_bytes(...)))) + template + void exec_with_ws(TArgs&&... args) { + call_dnn( + [&](auto&&... args) { + size_t workspace_size = + Holder::op()->get_workspace_in_bytes(get_layout(args)...); + Holder::op()->exec( + std::forward(args)..., + create_workspace(workspace_size)); + }, + std::forward(args)...); + } - TensorPtr at(size_t id) { return m_out[id]; } + // execute dynamic out opr + // (opr->exec(inputs..., outputs... create_ws(get_workspace_in_bytes(...)), alloc)) + template + auto exec_dynout(TArgs&&... args) { + struct Alloc final : public megdnn::DynOutMallocPolicy { + CompNode comp_node; + std::array output_tensors; + std::array, nr_out> output_dnn_tensors; + + public: + Alloc(CompNode comp_node) : comp_node(comp_node) {} + megdnn::TensorND alloc_output( + size_t id, DType dtype, const TensorShape& shape, + void* user_data) override { + TensorLayout layout(shape, dtype); + output_tensors[id] = Tensor::make(layout, comp_node); + output_dnn_tensors[id].emplace( + output_tensors[id]->dnn_tensor()); // pin output + return *output_dnn_tensors[id]; + } + + void* alloc_workspace(size_t sz, void* user_data) override { + mgb_assert(false); + } + + void free_workspace(void* ptr, void* user_data) override { + mgb_assert(false); + } + } alloc{Holder::comp_node()}; + call_dnn( + [&](auto&&... 
args) { + size_t workspace_size = + Holder::op()->get_workspace_in_bytes(get_layout(args)...); + Holder::op()->exec( + std::forward(args)..., + create_workspace(workspace_size), &alloc); + }, + std::forward(args)...); + return alloc.output_tensors; + } }; } // namespace imperative diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index 4be8930f..b871a364 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -605,6 +605,7 @@ TensorInfo* ChannelImpl::alloc() { void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc&& desc) { m_valid_handle.insert(reinterpret_cast(info)); MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name); + mgb_assert(desc.comp_node.valid(), "comp_node invalid"); info->status = TensorInfo::Allocated; info->desc = std::move(desc); } @@ -831,6 +832,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) { output_descs.push_back(i->desc); } } else { + // i may be null validated = false; } // Here std::move is REQUIRED for removing duplicated references. @@ -1064,17 +1066,16 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) { if (in_worker) { reserve_size(x->size()); } - MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); } - MGB_CATCH(MemAllocError&, { + if (!BlobManager::inst()->try_alloc_direct(x, x->size())) { bool suc = false; if (in_worker) { while (!suc) { if (!auto_evict(1)) { break; } - MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); } - MGB_CATCH(MemAllocError&, { continue; }); - suc = true; + if (BlobManager::inst()->try_alloc_direct(x, x->size())) { + suc = true; + } } } if (!suc) { @@ -1086,9 +1087,11 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) { imperative_log_profile_begin("defrag"); BlobManager::inst()->defrag(x->comp_node()); imperative_log_profile_end("defrag"); - BlobManager::inst()->alloc_direct(x, x->size()); + mgb_assert( + BlobManager::inst()->try_alloc_direct(x, x->size()), + "allocation failed after defrag"); } - }); + } set_log_level(pre_level); } diff --git a/imperative/src/impl/ops/adaptive_pooling.cpp b/imperative/src/impl/ops/adaptive_pooling.cpp index 183f1d68..5a02414d 100644 --- a/imperative/src/impl/ops/adaptive_pooling.cpp +++ b/imperative/src/impl/ops/adaptive_pooling.cpp @@ -75,13 +75,12 @@ std::tuple, bool> infer_output_attrs_fallible( SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { - auto&& pool = static_cast(def); + auto&& pooling = def.cast_final_safe(); auto&& cn = inputs[0]->comp_node(); - using TensorND = megdnn::TensorND; auto&& src_layout = inputs[0]->layout(); - TensorLayout dst_layout = output_descs[0].layout; - auto param_format = pool.format; + TensorLayout dst_layout{inputs[0]->dtype()}; + auto param_format = pooling.format; if (!validated) { dst_layout.ndim = src_layout.ndim; const dt_int32* oshp2d = nullptr; @@ -91,7 +90,7 @@ SmallVector apply_on_physical_tensor( tshp1n = inputs[1]->layout().total_nr_elems() == 1; oshp2d = tshp_nd->get_value().proxy_to_default_cpu().ptr(); } else { - oshp2d = pool.shape.data(); + oshp2d = pooling.shape.data(); } if (param_format == opr::AdaptivePooling::Param::Format::NCHW) { dst_layout[0] = src_layout[0]; @@ -108,15 +107,17 @@ SmallVector apply_on_physical_tensor( MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); } dst_layout.init_contiguous_stride(); + } else { + dst_layout = 
output_descs[0].layout; } size_t IH, IW, OH, OW; - if (param_format == param::AdaptivePooling::Format::NCHW) { + if (param_format == megdnn::param::AdaptivePooling::Format::NCHW) { IH = src_layout[2]; IW = src_layout[3]; OH = dst_layout[2]; OW = dst_layout[3]; - } else if (param_format == param::AdaptivePooling::Format::NHWC) { + } else if (param_format == megdnn::param::AdaptivePooling::Format::NHWC) { IH = src_layout[1]; IW = src_layout[2]; OH = dst_layout[1]; @@ -124,26 +125,21 @@ SmallVector apply_on_physical_tensor( } else { mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); } - DnnOprCaller dnn_opr(cn); - auto&& param = dnn_opr.op->param(); - param.mode = pool.mode; - param.format = pool.format; + + // adaptive_pooling param to pooling + auto&& param = megdnn::Pooling::Param(); + param.mode = pooling.mode; + param.format = pooling.format; param.pad_h = param.pad_w = 0; - param.stride_h = floor(IH / OH); - param.stride_w = floor(IW / OW); + param.stride_h = IH / OH; + param.stride_w = IW / OW; param.window_h = IH - (OH - 1) * param.stride_h; param.window_w = IW - (OW - 1) * param.stride_w; - TensorND src = inputs[0]->dnn_tensor(); + DnnOprCaller dnn_opr(cn, param, megdnn::param::ExecutionPolicy{}); + auto src = inputs[0]; auto dst = Tensor::make(dst_layout, cn); - - size_t sz = setup_algo( - {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, - ::megdnn::param::ExecutionPolicy{}, false); - - auto dnn_wk = dnn_opr.create_workspace(sz); - dnn_opr.op->exec(src, dst->dnn_tensor(), dnn_wk); - + dnn_opr.exec_fastrun(inputs[0], dst); return {dst}; } diff --git a/imperative/src/impl/ops/batch_norm.cpp b/imperative/src/impl/ops/batch_norm.cpp index bc4e798c..e6bd117f 100644 --- a/imperative/src/impl/ops/batch_norm.cpp +++ b/imperative/src/impl/ops/batch_norm.cpp @@ -145,79 +145,44 @@ SmallVector apply_on_physical_tensor( auto&& op_def = def.cast_final_safe(); auto&& comp_node = inputs[0]->comp_node(); - using TensorND = megdnn::TensorND; + DnnOprCaller dnn_opr(comp_node, op_def.param()); - SmallVector inp_tensornds(inputs.size()); - for (size_t i = 0; i < inputs.size(); ++i) { - inp_tensornds[i] = inputs[i]->dnn_tensor(); - } - - DnnOprCaller dnn_opr(comp_node); - dnn_opr.op->param() = op_def.param(); - - TensorLayout src_layout = inputs[0]->layout(); - TensorLayout scale_layout = inputs[1]->layout(); + auto src_layout = inputs[0]->layout(); + auto scale_layout = inputs[1]->layout(); bool empty_input = src_layout.is_empty(); size_t nr_inp = inputs.size(); - size_t sz = 0, rsz = 0; - - TensorLayout r_layout({rsz}, dtype::Byte()); - - if (!empty_input) { - sz = dnn_opr.op->get_workspace_in_bytes( - src_layout, src_layout, src_layout, src_layout, src_layout, src_layout, - src_layout, src_layout, src_layout); - rsz = dnn_opr.op->get_reserve_in_bytes(src_layout); - - r_layout = TensorLayout({rsz}, dtype::Byte()); - } - auto dnn_wk = dnn_opr.create_workspace(sz); - auto reserve = Tensor::make(r_layout, comp_node); + // size_t ws_size = 0, reserve_size = 0; + size_t reserve_size = + empty_input ? 
(size_t)0 : dnn_opr.op()->get_reserve_in_bytes(src_layout); - // alloc memory + // alloc outputs auto y = Tensor::make(src_layout, comp_node); - auto save_mean = Tensor::make(scale_layout, comp_node); - auto save_variance = Tensor::make(scale_layout, comp_node); + auto reserve = Tensor::make(TensorLayout{{reserve_size}, dtype::Byte()}, comp_node); if (op_def.fwd_mode == ::megdnn::param::BN::FwdMode::INFERENCE) { - if (!empty_input) - dnn_opr.op->exec( - inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], - inp_tensornds[3], inp_tensornds[4], save_mean->dnn_tensor(), - save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), - dnn_wk); + if (!empty_input) { + dnn_opr.exec_with_ws( + inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], save_mean, + save_variance, reserve, y); + } return {inputs[3], inputs[4], reserve, y}; } else { if (nr_inp == 5) { auto mean = Tensor::make(scale_layout, comp_node); - auto variance = Tensor::make(scale_layout, comp_node); - megdnn::RefPtr src_ptr1( - inp_tensornds[3].get_ref_ptr().get_ptr(), inputs[3]->offset()); - megdnn::RefPtr dst_ptr1( - mean->dev_tensor().storage().get_ref_ptr(), - mean->dev_tensor().storage().offset(), false); - comp_node.peer_copy_to_ref( - comp_node, dst_ptr1, src_ptr1, scale_layout.span().high_byte); - - megdnn::RefPtr src_ptr2( - inp_tensornds[4].get_ref_ptr().get_ptr(), inputs[4]->offset()); - megdnn::RefPtr dst_ptr2( - variance->dev_tensor().storage().get_ref_ptr(), - variance->dev_tensor().storage().offset(), false); - comp_node.peer_copy_to_ref( - comp_node, dst_ptr2, src_ptr2, scale_layout.span().high_byte); - - if (!empty_input) - dnn_opr.op->exec( - inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], - mean->dnn_tensor(), variance->dnn_tensor(), - save_mean->dnn_tensor(), save_variance->dnn_tensor(), - reserve->dnn_tensor(), y->dnn_tensor(), dnn_wk); + // FIXME + mean->dev_tensor().copy_from(inputs[3]->dev_tensor()); + variance->dev_tensor().copy_from(inputs[4]->dev_tensor()); + + if (!empty_input) { + dnn_opr.exec_with_ws( + inputs[0], inputs[1], inputs[2], mean, variance, save_mean, + save_variance, reserve, y); + } return {mean, variance, save_mean, save_variance, reserve, y}; } @@ -227,11 +192,9 @@ SmallVector apply_on_physical_tensor( auto variance = Tensor::make(m_layout, comp_node); if (!empty_input) { - dnn_opr.op->exec( - inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], - mean->dnn_tensor(), variance->dnn_tensor(), save_mean->dnn_tensor(), - save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), - dnn_wk); + dnn_opr.exec_with_ws( + inputs[0], inputs[1], inputs[2], mean, variance, save_mean, + save_variance, reserve, y); } return {save_mean, save_variance, reserve, y}; diff --git a/imperative/src/impl/ops/cond_take.cpp b/imperative/src/impl/ops/cond_take.cpp index 28b44905..2d5bd7e3 100644 --- a/imperative/src/impl/ops/cond_take.cpp +++ b/imperative/src/impl/ops/cond_take.cpp @@ -28,33 +28,26 @@ SmallVector apply_on_physical_tensor( auto&& inp = inputs[0]; auto&& msk = inputs[1]; - SmallVector out; mgb_assert( inp->layout().eq_shape(msk->layout()), "input shape does not match mask shape"); mgb_assert( msk->get_value().dtype().enumv() == DTypeEnum::Bool, "mask dtype must be bool"); - MegDNNDynOutMallocImpl<2> policy{inp->comp_node()}; if (inp->layout().is_empty()) { // empty tensor - policy.alloc_output(0, inp->layout().dtype, {0}, nullptr); - policy.alloc_output(1, dtype::Int32(), {0}, nullptr); + return { + Tensor::make(TensorLayout{{0}, inp->dtype()}, inp->comp_node()), + 
Tensor::make(TensorLayout{{0}, dtype::Int32()}, inp->comp_node()), + }; } else { - DnnOprCaller dnn_op(inp->comp_node()); - dnn_op.op->param().val = 1; - - size_t sz = dnn_op.op->get_workspace_in_bytes(inp->layout()); - - auto dnn_workspace = dnn_op.create_workspace(sz); - - dnn_op.op->exec( - inp->dev_tensor().as_megdnn(), msk->dev_tensor().as_megdnn(), - dnn_workspace, &policy); + // maybe we need to split CondTake + megdnn::CondTake::Param param; + param.val = 1; + DnnOprCaller dnn_op(inp->comp_node(), param); + auto&& [out0, out1] = dnn_op.exec_dynout<2>(inp, msk); + return {out0, out1}; } - out.push_back(policy.at(0)); - out.push_back(policy.at(1)); - return out; } std::tuple, bool> infer_output_attrs_fallible( diff --git a/imperative/src/impl/ops/convolution.cpp b/imperative/src/impl/ops/convolution.cpp index 189a5962..98ca1e20 100644 --- a/imperative/src/impl/ops/convolution.cpp +++ b/imperative/src/impl/ops/convolution.cpp @@ -8,14 +8,7 @@ namespace mgb { namespace imperative { - namespace { - -size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) { - mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt); - return (inp + 2 * pad - flt) / stride + 1; -} - namespace convolution { std::shared_ptr make_from_op_node(cg::OperatorNodeBase* node_) { auto* node = &node_->cast_final_safe(); @@ -29,131 +22,23 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { inputs[0], inputs[1], conv.param(), conv.policy(), config); } -TensorLayout do_shape_infer( - const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { - auto&& conv = static_cast(def); - using Param = ::megdnn::param::Convolution; - - auto img_ndim = src_ndim - 2; - mgb_assert( - img_ndim == 2, - "only 2D convolution is supported, and input should be 4-dim; " - "got input dim = %zu", - src_ndim); - size_t group = 1; - size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; - if (conv.sparse == Param::Sparse::DENSE) { - mgb_assert( - filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, - "bad filter ndim for dense convolution: " - "spatial_ndim=%zu filter_ndim=%zu", - img_ndim, filter.ndim); - group = 1; - flt_start = 0; - } else { // Param::Sparse::GROUP - mgb_assert( - filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, - "bad filter ndim for group convolution: " - "spatial_ndim=%zu filter_ndim=%zu", - img_ndim, filter.ndim); - // grp, oc, ic, dims[] - group = filter[0]; - flt_start = 1; - } - - uint32_t ic_block_size = 1, oc_block_size = 1; - size_t src_or_dst_c_pos = 0; - size_t src_or_dst_spatial_start = 0; - if (conv.format == Param::Format::NCHW) { - // filter should be (oc, ic, fh, fw) - flt_spatial_start = 2; - ocpg_pos = 0; - icpg_pos = 1; - src_or_dst_c_pos = 1; - src_or_dst_spatial_start = 2; - } else { // Param::Format::NHWC - // filter should be (oc, fh, fw, ic) - flt_spatial_start = 1; - ocpg_pos = 0; - icpg_pos = 3; - src_or_dst_c_pos = 3; - src_or_dst_spatial_start = 1; - } - size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; - size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; - uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; - dilation[0] = conv.dilate_h; - dilation[1] = conv.dilate_w; - stride[0] = conv.stride_h; - stride[1] = conv.stride_w; - padding[0] = conv.pad_h; - padding[1] = conv.pad_w; - for (size_t i = 0; i < img_ndim; ++i) { - mgb_assert( - dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, - dilation[i]); - dilated_spatial[i] = - (filter[i + flt_start + 
flt_spatial_start] - 1) * dilation[i] + 1; - } - mgb_assert( - icpg * group == src[src_or_dst_c_pos], - "group conv invalid: input channel of Conv expect %zu, but got %zu\n" - "hint: weight may be changed by mistake\n", - icpg * group, src[src_or_dst_c_pos]); - TensorLayout dst{src.dtype}; - dst.ndim = src_ndim; - dst[0] = src[0]; - dst[src_or_dst_c_pos] = ocpg * group; - for (size_t i = 0; i < img_ndim; ++i) { - dst[i + src_or_dst_spatial_start] = infer_conv_shape( - src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], - padding[i]); - } - dst.init_contiguous_stride(); - return dst; -} - std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { - SmallVector dests(1); - auto&& desc = dests[0]; - desc.comp_node = inputs[0].comp_node; - - TensorLayout src = inputs[0].layout; - TensorLayout filter = inputs[1].layout; - size_t src_ndim = src.ndim; - if (src_ndim == 0 || filter.ndim == 0) { - desc.layout = TensorLayout{{}, src.dtype}; - return {dests, false}; + auto&& conv = def.cast_final_safe(); + DnnOprHelper dnn_opr(conv.param()); + auto&& data = inputs[0].layout; + auto&& filter = inputs[1].layout; + TensorLayout output_layout{data.dtype}; + if (data.ndim && filter.ndim) { + // deduce_layout won't override existing dtype + dnn_opr.opr().deduce_layout(data, filter, output_layout); } - - desc.layout = do_shape_infer(def, src_ndim, src, filter); - return {dests, true}; + return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; } -SmallVector apply_on_physical_tensor( - const OpDef& def, const SmallVector& inputs, - SmallVector& output_descs, const bool& validated) { - // create megdnn opr - auto&& conv = static_cast(def); - CompNode cn = inputs[0]->comp_node(); - - TensorLayout out_layout = output_descs[0].layout; - if (!validated) - out_layout = do_shape_infer( - def, inputs[0]->layout().ndim, inputs[0]->layout(), - inputs[1]->layout()); - - using TensorND = megdnn::TensorND; - SmallVector inp_tensornds(inputs.size() + 2); - TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); - for (unsigned i = 0; i < inputs.size(); ++i) { - inp_tensornds[i] = inputs[i]->dnn_tensor(); - inp_shapes[i] = inputs[i]->layout(); - } - oup_shapes[0] = out_layout; - DnnOprCaller dnn_opr(cn); - auto&& param = dnn_opr.op->param(); +// Convolution::Param -> ConvBias::Param +auto conv_bias_param_from_convolution(const Convolution& conv) { + megdnn::ConvBias::Param param; param.pad_h = conv.pad_h; param.pad_w = conv.pad_w; param.stride_h = conv.stride_h; @@ -163,30 +48,37 @@ SmallVector apply_on_physical_tensor( param.sparse = conv.sparse; param.compute_mode = conv.compute_mode; param.format = conv.format; + return param; +} - // shape infer - TensorLayout empty_shp({0}, inputs[0]->dtype()); - empty_shp.ndim = 0; - - auto empty_bias = Tensor::make(empty_shp, cn); - - inp_tensornds[2] = empty_bias->dnn_tensor(); - inp_tensornds[3] = empty_bias->dnn_tensor(); - - size_t sz = setup_algo( - {inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, - dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false, - &inp_tensornds); +SmallVector apply_on_physical_tensor( + const OpDef& def, const SmallVector& inputs, + SmallVector& output_descs, const bool& validated) { + // create megdnn opr + auto&& conv = def.cast_final_safe(); + CompNode cn = inputs[0]->comp_node(); + auto&& param = conv_bias_param_from_convolution(conv); + DnnOprCaller dnn_opr(cn, param, conv.policy()); + + megdnn::TensorND empty_bias; + 
empty_bias.layout.dtype = inputs[0]->dtype(); + empty_bias.layout.ndim = 0; + + auto out_layout = [&] { + if (validated) { + return output_descs[0].layout; + } else { + TensorLayout out_layout{inputs[0]->dtype()}; + dnn_opr.op()->deduce_layout( + inputs[0]->layout(), inputs[1]->layout(), empty_bias.layout, + empty_bias.layout, out_layout); + return out_layout; + } + }(); // alloc memory auto out = Tensor::make(out_layout, cn); - - auto dnn_wk = dnn_opr.create_workspace(sz); - - // exeucte - dnn_opr.op->exec( - inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], inp_tensornds[3], - out->dnn_tensor(), nullptr, dnn_wk); + dnn_opr.exec_fastrun(inputs[0], inputs[1], empty_bias, empty_bias, out); return {out}; } @@ -243,155 +135,41 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { } } -TensorLayout convbwd_do_shape_infer( - const OpDef& def, size_t diff_ndim, TensorLayout filter, TensorLayout diff, - CompNode cn) { - auto&& bwd_conv = static_cast(def); - DnnOprCaller caller(cn); - auto&& dnn_opr = caller.op; - using Param = ::megdnn::param::Convolution; - // using Param1 = ::megdnn::param::ConvolutionBackwardData; - - auto img_ndim = diff_ndim - 2; - mgb_assert( - img_ndim == 2, - "only 2D convolution is supported, and input should be 4-dim; " - "got input dim = %zu", - diff_ndim); - size_t group = 1; - size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; - if (bwd_conv.sparse == Param::Sparse::DENSE) { - mgb_assert( - filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, - "bad filter ndim for dense convolution: " - "spatial_ndim=%zu filter_ndim=%zu", - img_ndim, filter.ndim); - group = 1; - flt_start = 0; - } else { // Param::Sparse::GROUP - mgb_assert( - filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, - "bad filter ndim for group convolution: " - "spatial_ndim=%zu filter_ndim=%zu", - img_ndim, filter.ndim); - // grp, oc, ic, dims[] - group = filter[0]; - flt_start = 1; - } - - uint32_t ic_block_size = 1, oc_block_size = 1; - size_t src_or_dst_c_pos = 0; - size_t src_or_dst_spatial_start = 0; - if (bwd_conv.format == Param::Format::NCHW) { - // filter should be (oc, ic, fh, fw) - flt_spatial_start = 2; - ocpg_pos = 0; - icpg_pos = 1; - src_or_dst_c_pos = 1; - src_or_dst_spatial_start = 2; - } else { // Param::Format::NHWC - // filter should be (oc, fh, fw, ic) - flt_spatial_start = 1; - ocpg_pos = 0; - icpg_pos = 3; - src_or_dst_c_pos = 3; - src_or_dst_spatial_start = 1; - } - size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; - size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; - uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; - dilation[0] = bwd_conv.dilate_h; - dilation[1] = bwd_conv.dilate_w; - stride[0] = bwd_conv.stride_h; - stride[1] = bwd_conv.stride_w; - padding[0] = bwd_conv.pad_h; - padding[1] = bwd_conv.pad_w; - for (size_t i = 0; i < img_ndim; ++i) { - mgb_assert( - dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, - dilation[i]); - dilated_spatial[i] = - (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; - } - mgb_assert( - ocpg * group == diff[src_or_dst_c_pos], - "group conv invalid: input channel of Conv expect %zu, but got %zu\n" - "hint: weight may be changed by mistake\n", - ocpg * group, diff[src_or_dst_c_pos]); - auto deduce = [](size_t out, size_t filter, size_t stride, size_t pad) { - auto i = (out - 1) * stride + filter; - mgb_assert(i > pad * 2); - return i - pad * 2; - }; - - DType dst_dtype = bwd_conv.dtype; - dnn_opr->deduce_dtype(filter.dtype, 
diff.dtype, dst_dtype); - TensorLayout dst{dst_dtype}; - dst.ndim = diff_ndim; - dst[0] = diff[0]; - dst[src_or_dst_c_pos] = icpg * group; - for (size_t i = 0; i < img_ndim; ++i) { - dst[i + src_or_dst_spatial_start] = - deduce(diff[i + src_or_dst_spatial_start], dilated_spatial[i], - stride[i], padding[i]); - } - dst.init_contiguous_stride(); - return dst; -} - std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { - SmallVector dests(1); - auto&& desc = dests[0]; - desc.comp_node = inputs[0].comp_node; - - TensorLayout filter = inputs[0].layout; - TensorLayout diff = inputs[1].layout; - size_t diff_ndim = diff.ndim; - if (diff_ndim == 0 || filter.ndim == 0) { - desc.layout = TensorLayout{{}, diff.dtype}; - return {dests, false}; + auto&& convbwd = def.cast_final_safe(); + DnnOprHelper dnn_opr(convbwd.param()); + // force set dtype + auto&& filter = inputs[0].layout; + auto&& diff = inputs[1].layout; + TensorLayout output_layout{convbwd.dtype}; + if (filter.ndim && diff.ndim) { + // deduce_layout won't override existing dtype + dnn_opr.opr().deduce_layout(filter, diff, output_layout); } - - desc.layout = - convbwd_do_shape_infer(def, diff_ndim, filter, diff, inputs[0].comp_node); - return {dests, true}; + return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; } SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { // create megdnn opr - auto&& convbwd = static_cast(def); + auto&& convbwd = def.cast_final_safe(); CompNode cn = inputs[0]->comp_node(); - - TensorLayout out_layout = output_descs[0].layout; - if (!validated) - out_layout = convbwd_do_shape_infer( - def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), - cn); - + DnnOprCaller dnn_opr( + cn, convbwd.param(), convbwd.policy()); + auto out_layout = [&] { + if (validated) { + return output_descs[0].layout; + } else { + TensorLayout out_layout{inputs[0]->dtype()}; + dnn_opr.op()->deduce_layout( + inputs[0]->layout(), inputs[1]->layout(), out_layout); + return out_layout; + } + }(); auto out = Tensor::make(out_layout, cn); - - using TensorND = megdnn::TensorND; - SmallVector inp_tensornds(inputs.size()); - TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); - for (unsigned i = 0; i < inputs.size(); ++i) { - inp_tensornds[i] = inputs[i]->dnn_tensor(); - inp_shapes[i] = inputs[i]->layout(); - } - oup_shapes[0] = out_layout; - DnnOprCaller dnn_opr(cn); - dnn_opr.op->param() = convbwd.param(); - - size_t sz = setup_algo( - {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, - false, cn, convbwd.policy(), false, &inp_tensornds); - - auto dnn_wk = dnn_opr.create_workspace(sz); - - // exeucte - dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); + dnn_opr.exec_fastrun(inputs[0], inputs[1], out); return {out}; } @@ -415,149 +193,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); } -TensorLayout do_shape_infer( - const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { - auto&& conv = static_cast(def); - using Param = ::megdnn::param::Convolution3D; - auto img_ndim = src_ndim - 2; - mgb_assert( - img_ndim == 3, - "only 3D convolution is supported, and input should be 5-dim; " - "got input dim = %zu", - src_ndim); - - size_t group = 1; - size_t flt_start, flt_spatial_start, ocpg_pos, 
icpg_pos; - if (conv.sparse == Param::Sparse::DENSE) { - mgb_assert( - filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, - "bad filter ndim for dense convolution: " - "spatial_ndim=%zu filter_ndim=%zu", - img_ndim, filter.ndim); - group = 1; - flt_start = 0; - } else { // Param::Sparse::GROUP - mgb_assert( - filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, - "bad filter ndim for group convolution: " - "spatial_ndim=%zu filter_ndim=%zu", - img_ndim, filter.ndim); - - // grp, oc, ic, dims[] - group = filter[0]; - flt_start = 1; - } - - uint32_t ic_block_size = 1, oc_block_size = 1; - size_t src_or_dst_c_pos = 0; - size_t src_or_dst_spatial_start = 0; - if (conv.format == Param::Format::NCDHW) { - // filter should be (oc, ic, fd, fh, fw) - flt_spatial_start = 2; - ocpg_pos = 0; - icpg_pos = 1; - src_or_dst_c_pos = 1; - src_or_dst_spatial_start = 2; - } else { // Param::Format::NDHWC - // filter should be (oc, fd, fh, fw, ic) - flt_spatial_start = 1; - ocpg_pos = 0; - icpg_pos = 4; - src_or_dst_c_pos = 4; - src_or_dst_spatial_start = 1; - } - size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; - size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; - uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3]; - dilation[0] = conv.dilate_d; - dilation[1] = conv.dilate_h; - dilation[2] = conv.dilate_w; - stride[0] = conv.stride_d; - stride[1] = conv.stride_h; - stride[2] = conv.stride_w; - padding[0] = conv.pad_d; - padding[1] = conv.pad_h; - padding[2] = conv.pad_w; - for (size_t i = 0; i < img_ndim; ++i) { - mgb_assert( - dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, - dilation[i]); - dilated_spatial[i] = - (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; - } - mgb_assert( - icpg * group == src[src_or_dst_c_pos], - "group conv invalid: input channel of Conv expect %zu, but got %zu\n" - "hint: weight may be changed by mistake\n", - icpg * group, src[src_or_dst_c_pos]); - TensorLayout dst{src.dtype}; - dst.ndim = src_ndim; - dst[0] = src[0]; - dst[src_or_dst_c_pos] = ocpg * group; - for (size_t i = 0; i < img_ndim; ++i) { - dst[i + src_or_dst_spatial_start] = infer_conv_shape( - src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], - padding[i]); - } - dst.init_contiguous_stride(); - - return dst; -} - std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { - SmallVector dests(1); - auto&& desc = dests[0]; - desc.comp_node = inputs[0].comp_node; - + auto&& conv = def.cast_final_safe(); TensorLayout src = inputs[0].layout; TensorLayout filter = inputs[1].layout; - size_t src_ndim = src.ndim; - if (src_ndim == 0 || filter.ndim == 0) { - desc.layout = TensorLayout{{}, src.dtype}; - return {dests, false}; + if (src.ndim == 0 || filter.ndim == 0) { + return {{{TensorLayout{src.dtype}, inputs[0].comp_node}}, false}; } - - desc.layout = do_shape_infer(def, src_ndim, src, filter); - return {dests, true}; + DnnOprHelper dnn_opr(conv.param()); + auto output = dnn_opr.deduce_layout(src, filter); + return {{{output, inputs[0].comp_node}}, false}; } SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { // create megdnn opr - auto&& conv = static_cast(def); - - TensorLayout out_layout = output_descs[0].layout; - if (!validated) - out_layout = do_shape_infer( - def, inputs[0]->layout().ndim, inputs[0]->layout(), - inputs[1]->layout()); - - using TensorND = megdnn::TensorND; + auto&& conv = 
def.cast_final_safe(); CompNode cn = inputs[0]->comp_node(); - SmallVector inp_tensornds(inputs.size()); - TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); - for (unsigned i = 0; i < inputs.size(); ++i) { - inp_tensornds[i] = inputs[i]->dnn_tensor(); - inp_shapes[i] = inputs[i]->layout(); - } - oup_shapes[0] = out_layout; - DnnOprCaller dnn_opr(cn); - dnn_opr.op->param() = conv.param(); - - // shape infer - size_t sz = setup_algo( - {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, - false, cn, conv.policy(), false, &inp_tensornds); - + DnnOprCaller dnn_opr(cn, conv.param(), conv.policy()); + auto out_layout = [&] { + if (validated) { + return output_descs[0].layout; + } else { + return dnn_opr.deduce_layout(inputs[0]->layout(), inputs[1]->layout()); + } + }(); // alloc memory auto out = Tensor::make(out_layout, cn); - - auto dnn_wk = dnn_opr.create_workspace(sz); - - // exeucte - dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); + dnn_opr.exec_fastrun(inputs[0], inputs[1], out); return {out}; } @@ -579,51 +244,38 @@ std::tuple, bool> infer_output_attrs_fallible( inputs.size() == 2, "inputs num of conv_transpose3d should be 2 but you give %zu", inputs.size()); - auto&& op_def = def.cast_final_safe(); auto&& weight = inputs[0]; auto&& diff = inputs[1]; - auto& cn = weight.comp_node; - - if (weight.layout.ndim == 0 || diff.layout.ndim == 0) { - return {{{TensorLayout{weight.layout.dtype}, cn, {}}}, false}; + if (!(weight.layout.ndim && diff.layout.ndim)) { + return {{{TensorLayout{weight.layout.dtype}, weight.comp_node}}, false}; } - - TensorLayout oup_layout; - megdnn::Convolution3DBackwardData::deduce_layout_impl( - weight.layout, diff.layout, op_def.param(), oup_layout); - return {{{oup_layout, cn, {}}}, true}; + DnnOprHelper dnn_opr(op_def.param()); + auto oup_layout = dnn_opr.deduce_layout(weight.layout, diff.layout); + return {{{oup_layout, weight.comp_node}}, true}; } SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { - auto&& op_def = def.cast_final_safe(); + auto&& conv = def.cast_final_safe(); auto cn = inputs[0]->comp_node(); auto&& wlayout = inputs[0]->layout(); auto&& dlayout = inputs[1]->layout(); - DnnOprCaller caller(cn); - auto&& dnn_opr = caller.op; - dnn_opr->param() = op_def.param(); + DnnOprCaller dnn_op( + cn, conv.param(), conv.policy()); - TensorLayout& oup_layout = output_descs[0].layout; - if (!validated) { - megdnn::Convolution3DBackwardData::deduce_layout_impl( - wlayout, dlayout, op_def.param(), oup_layout); - } + auto oup_layout = [&] { + if (validated) { + return output_descs[0].layout; + } else { + return dnn_op.deduce_layout(wlayout, dlayout); + } + }(); auto oup = Tensor::make(oup_layout, cn); - - SmallVector inp_tensornds(inputs.size()); - inp_tensornds[0] = inputs[0]->dnn_tensor(); - inp_tensornds[1] = inputs[1]->dnn_tensor(); - size_t wk_size = setup_algo( - {wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn, - op_def.policy(), false, &inp_tensornds); - auto dnn_wk = caller.create_workspace(wk_size); - - dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup->dnn_tensor(), dnn_wk); + dnn_op.exec_fastrun(inputs[0], inputs[1], oup); return {oup}; } diff --git a/imperative/src/impl/ops/elemwise.cpp b/imperative/src/impl/ops/elemwise.cpp index 2394a4c6..c54ee0e5 100644 --- a/imperative/src/impl/ops/elemwise.cpp +++ b/imperative/src/impl/ops/elemwise.cpp @@ -94,52 +94,44 @@ void 
apply_on_device_tensornd( mgb_assert( inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", trait.name, trait.arity, inputs.size()); - DnnOprCaller dnn_opr(inputs[0].comp_node()); - opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op); + DnnOprCaller dnn_opr(inputs[0].comp_node(), {op_def.mode}); + opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op()); } SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { auto comp_node = inputs[0]->comp_node(); + auto dtype = inputs[0]->dtype(); using Mode = Elemwise::Mode; - using TensorND = megdnn::TensorND; auto&& op_def = def.cast_final_safe(); - SmallVector inp_tensornds; - TensorShapeArray inp_shapes(inputs.size()); - inp_tensornds.reserve(inputs.size()); - - TensorLayout layout{inputs[0]->layout().dtype}; - bool is_empty = false; - for (unsigned i = 0; i < inputs.size(); ++i) { - if (inputs[i]->layout().is_empty()) { - is_empty = true; - } - inp_tensornds.push_back(inputs[i]->dnn_tensor()); - inp_shapes[i] = inputs[i]->layout(); + auto mode = op_def.mode; + TensorShapeArray input_shapes; + input_shapes.reserve(inputs.size()); + for (auto&& input : inputs) { + input_shapes.push_back(input->shape()); } - megdnn::Elemwise::deduce_shape(inp_shapes, layout); - layout.init_contiguous_stride(); - - auto out = Tensor::make(layout, comp_node); - - if (is_empty) { - return {out}; + // deduce_shape is static and fast + TensorLayout output_layout{dtype}; + // TODO: deduce_layout directly + megdnn::Elemwise::deduce_shape(input_shapes, output_layout); + output_layout.init_contiguous_stride(); + auto output = Tensor::make(output_layout, comp_node); + if (output_layout.is_empty()) { + return {output}; } - DnnOprCaller dnn_opr(comp_node); - - dnn_opr.op->param() = op_def.param(); - if (dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD3 || - dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD4 || - (inp_tensornds.size() && - inp_tensornds[0].layout.dtype.category() == DTypeCategory::QUANTIZED)) { - opr::Elemwise::perform_dnn( - comp_node, out->dnn_tensor(), inp_tensornds, dnn_opr.op); + DnnOprCaller dnn_opr(comp_node, op_def.param()); + if (mode == Mode::FUSE_MUL_ADD3 || mode == Mode::FUSE_MUL_ADD4 || + dtype.category() == DTypeCategory::QUANTIZED) { + dnn_opr.call_dnn( + [&](auto&& inputs, auto&& output) { + opr::Elemwise::perform_dnn(comp_node, output, inputs, dnn_opr.op()); + }, + inputs, output); } else { - dnn_opr.op->exec(inp_tensornds, out->dnn_tensor()); + dnn_opr.exec(inputs, output); } - - return {out}; + return {output}; } MGB_DEFINE_OPR_CLASS( @@ -179,7 +171,7 @@ protected: return ret; } void create_megdnn_opr() override { - auto opr = DnnOprCaller::create_operator(comp_node()); + auto opr = mgb::opr::intl::create_megdnn_opr(comp_node()); opr->param().mode = m_param.mode; set_megdnn_opr(std::move(opr)); } @@ -243,22 +235,19 @@ SmallVector apply_inplace_add_on_physical_tensor( "This inplace modification may change the elements of other tensors. 
" "Fallback to non-inplace update."); - DeviceTensorStorage storage; - storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage()); - storage = storage.sub(dest->offset()); - DeviceTensorND dv; - dv.reset(storage, dest->layout()); - - DeviceTensorND dv_new; - dv_new.copy_from(dv); - dest = Tensor::make(dv_new); + auto dest_layout = inputs[0]->layout(); + dest_layout.init_contiguous_stride(); + auto new_dest = Tensor::make(dest_layout, inputs[0]->comp_node()); + new_dest->dev_tensor().copy_from(dest->dev_tensor()); + dest = new_dest; } auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { return *tensor->get_value().ptr(); }; - DnnOprCaller caller{dest->comp_node()}; - caller.op->param() = {tensor_to_scalar(alpha), tensor_to_scalar(beta)}; - caller.op->exec(dest->dev_tensor().as_megdnn(), delta->dev_tensor().as_megdnn()); + DnnOprCaller caller{ + dest->comp_node(), {tensor_to_scalar(alpha), tensor_to_scalar(beta)}}; + caller.exec(dest, delta); + // FIXME: inplace update host value return {std::make_shared(dest->blob(), dest->offset(), dest->layout())}; } diff --git a/imperative/src/impl/ops/indexing.cpp b/imperative/src/impl/ops/indexing.cpp index f31d71dd..6187654e 100644 --- a/imperative/src/impl/ops/indexing.cpp +++ b/imperative/src/impl/ops/indexing.cpp @@ -67,10 +67,8 @@ SmallVector apply_on_physical_tensor( auto&& op = def.cast_final_safe(); auto&& inp = inputs[0]; auto&& index = inputs[1]; - TensorLayout layout = inp->layout(); - TensorLayout index_layout = index->layout(); - DnnOprCaller dnn_op(inp->comp_node()); - auto&& indexing_one_hot_param = dnn_op.op->param(); + auto&& layout = inp->layout(); + auto&& index_layout = index->layout(); int real_axis = static_cast(op.axis); if (real_axis < 0) { real_axis += static_cast(layout.ndim); @@ -79,16 +77,10 @@ SmallVector apply_on_physical_tensor( 0 <= real_axis && real_axis < static_cast(layout.ndim), "Dimension out of range (expected to be in range of [%d, %d], but got %d)", 0, static_cast(layout.ndim) - 1, op.axis); - indexing_one_hot_param = real_axis; - TensorLayout tlayout; - dnn_op.op->deduce_layout(layout, index_layout, tlayout); - TensorPtr out = Tensor::make(tlayout, inp->comp_node()); - megdnn::TensorND in = inp->dnn_tensor(); - megdnn::TensorND ind = index->dnn_tensor(); - size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); - - auto dnn_workspace = dnn_op.create_workspace(sz); - dnn_op.op->exec(in, ind, out->dnn_tensor(), dnn_workspace); + DnnOprCaller dnn_op(inp->comp_node(), real_axis); + auto tlayout = dnn_op.deduce_layout(layout, index_layout); + auto out = Tensor::make(tlayout, inp->comp_node()); + dnn_op.exec_with_ws(inp, index, out); return {out}; } @@ -105,15 +97,14 @@ std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& input_descs) { mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); auto comp_node = input_descs[0].comp_node; - TensorLayout src = input_descs[0].layout, index = input_descs[1].layout; - + auto&& src = input_descs[0].layout; + auto&& index = input_descs[1].layout; mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); - if (!src.ndim) { return {{{{{}, src.dtype}, comp_node}}, false}; } mgb_assert(src.is_contiguous(), "src should be contiguous"); - return {{input_descs[0]}, true}; + return {{{src, comp_node}}, true}; } auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { @@ -136,25 +127,15 @@ SmallVector apply_on_physical_tensor( auto&& index = inputs[1]; 
auto&& sub = inputs[2]; TensorLayout layout = inp->layout(); - TensorLayout index_layout = index->layout(); - TensorLayout tlayout = sub->layout(); mgb_assert(layout.is_contiguous()); - DnnOprCaller dnn_op(inp->comp_node()); - auto&& indexing_one_hot_param = dnn_op.op->param(); int real_axis = static_cast(op.axis); if (real_axis < 0) { real_axis += static_cast(layout.ndim); } - indexing_one_hot_param = real_axis; + DnnOprCaller dnn_op(inp->comp_node(), real_axis); TensorPtr out = Tensor::make(layout, inp->comp_node()); out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); - megdnn::TensorND in = inp->dnn_tensor(); - megdnn::TensorND ind = index->dnn_tensor(); - megdnn::TensorND su = sub->dnn_tensor(); - - size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); - auto dnn_workspace = dnn_op.create_workspace(sz); - dnn_op.op->exec(out->dnn_tensor(), ind, su, dnn_workspace); + dnn_op.exec_with_ws(out, index, sub); return {out}; } diff --git a/imperative/src/impl/ops/io_remote.cpp b/imperative/src/impl/ops/io_remote.cpp index 86e0cb35..625e3281 100644 --- a/imperative/src/impl/ops/io_remote.cpp +++ b/imperative/src/impl/ops/io_remote.cpp @@ -54,14 +54,15 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( TensorPtr megray_recv_tensor( std::shared_ptr megray_comm, TensorLayout& layout, CompNode cn, uint32_t rank_from) { - DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, layout); + auto out = Tensor::make(layout, cn); + auto dnn_out = out->dnn_tensor(); auto megray_ctx = mgb::opr::get_megray_context(cn); size_t data_size = layout.total_nr_elems(); auto status = megray_comm->recv( - out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), + dnn_out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), rank_from, megray_ctx); mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); - return Tensor::make(out); + return out; } void megray_send_tensor( @@ -105,9 +106,7 @@ SmallVector apply_on_physical_tensor_remote_send( mgb_assert(megray_comm != nullptr); megray_send_tensor(megray_comm, inputs[0], op.rank_to); TensorLayout layout({0}, inputs[0]->dtype()); - DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag( - inputs[0]->comp_node(), layout); - return {Tensor::make(out)}; + return {Tensor::make(layout, inputs[0]->comp_node())}; } std::tuple, bool> infer_output_attrs_fallible_remote_recv( diff --git a/imperative/src/impl/ops/lamb.cpp b/imperative/src/impl/ops/lamb.cpp index 598563f1..63563e65 100644 --- a/imperative/src/impl/ops/lamb.cpp +++ b/imperative/src/impl/ops/lamb.cpp @@ -21,14 +21,17 @@ SmallVector get_input_layout_constraint( std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& input_descs) { mgb_assert(input_descs.size() == 4, "IndexingOneHot expects 4inputs"); - auto comp_node = input_descs[0].comp_node; auto comp_node1 = input_descs[1].comp_node; auto comp_node2 = input_descs[2].comp_node; - TensorLayout m_t_1 = input_descs[0].layout, v_t_1 = input_descs[1].layout, - lamb_param = input_descs[2].layout, grad = input_descs[3].layout; - - TensorLayout new_param = lamb_param, m_t = m_t_1, v_t = v_t_1; + auto&& m_t_1 = input_descs[0].layout; + auto&& v_t_1 = input_descs[1].layout; + auto&& lamb_param = input_descs[2].layout; + auto&& grad = input_descs[3].layout; + MGB_MARK_USED_VAR(grad); + auto&& new_param = lamb_param; + auto&& m_t = m_t_1; + auto&& v_t = v_t_1; return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; } @@ -46,23 +49,11 
@@ SmallVector apply_on_physical_tensor( TensorLayout lamb_param_layout{lamb_param->layout()}; auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); - auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); - auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); - DnnOprCaller caller{lamb_param->comp_node()}; - size_t sz = caller.op->get_workspace_in_bytes( - m_t_1->layout(), v_t_1->layout(), lamb_param->layout(), grad->layout(), - m_t->layout(), v_t->layout(), new_param->layout()); - - auto dnn_workspace = caller.create_workspace(sz); - caller.op->param() = op.param(); - caller.op->exec( - m_t_1->dev_tensor().as_megdnn(), v_t_1->dev_tensor().as_megdnn(), - lamb_param->dev_tensor().as_megdnn(), grad->dev_tensor().as_megdnn(), - m_t->dnn_tensor(), v_t->dnn_tensor(), new_param->dnn_tensor(), - dnn_workspace); + DnnOprCaller dnn_opr{lamb_param->comp_node(), op.param()}; + dnn_opr.exec_with_ws(m_t_1, v_t_1, lamb_param, grad, m_t, v_t, new_param); return {m_t, v_t, new_param}; } diff --git a/imperative/src/impl/ops/layer_norm.cpp b/imperative/src/impl/ops/layer_norm.cpp index 051d8f06..a718005d 100644 --- a/imperative/src/impl/ops/layer_norm.cpp +++ b/imperative/src/impl/ops/layer_norm.cpp @@ -29,11 +29,11 @@ cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& in std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { - auto&& op_def = def.cast_final_safe(); + auto&& layer_norm = def.cast_final_safe(); size_t nr_inp = inputs.size(); - auto p = op_def.param(); + auto affine = layer_norm.affine; mgb_assert( - (nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine), + (nr_inp == 3 && affine) || (nr_inp == 1 && !affine), "num of inputs of pooling should be 1 or 3 but you give %zu", inputs.size()); @@ -47,9 +47,9 @@ std::tuple, bool> infer_output_attrs_fallible( false}; } - TensorLayout oup_layout, mean_layout, rstd_layout; - megdnn::LayerNorm::deduce_layout_fwd_impl( - inp.layout, p, oup_layout, mean_layout, rstd_layout); + DnnOprHelper dnn_opr(layer_norm.param()); + auto&& [oup_layout, mean_layout, rstd_layout] = + dnn_opr.deduce_layouts<3>(inp.layout, TensorLayout{}, TensorLayout{}); return {{{oup_layout, inp_cn, {}}, {mean_layout, inp_cn, {}}, {rstd_layout, inp_cn, {}}}, @@ -69,32 +69,21 @@ SmallVector apply_on_physical_tensor( inputs.size()); auto cn = inputs[0]->comp_node(); - DnnOprCaller caller(cn); - auto&& dnn_opr = caller.op; - dnn_opr->param() = p; + DnnOprCaller caller(cn, op_def.param()); - TensorLayout oup_layout, mean_layout, rstd_layout; - megdnn::LayerNorm::deduce_layout_fwd_impl( - inputs[0]->dnn_tensor().layout, p, oup_layout, mean_layout, rstd_layout); + auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>( + inputs[0]->layout(), TensorLayout{}, TensorLayout{}); auto out = Tensor::make(oup_layout, cn); - auto mean = Tensor::make(mean_layout, cn); - auto rstd = Tensor::make(rstd_layout, cn); - auto wk_size = caller.op->get_workspace_in_bytes( - inputs[0]->dnn_tensor().layout, - p.affine ? inputs[1]->dnn_tensor().layout : TensorLayout(), - p.affine ? inputs[2]->dnn_tensor().layout : TensorLayout(), oup_layout, - mean_layout, rstd_layout); - auto dnn_wk = caller.create_workspace(wk_size); - - caller.op->exec( - inputs[0]->dnn_tensor(), - p.affine ? inputs[1]->dnn_tensor() : megdnn::TensorND(), - p.affine ? 
inputs[2]->dnn_tensor() : megdnn::TensorND(), out->dnn_tensor(), - mean->dnn_tensor(), rstd->dnn_tensor(), dnn_wk); + if (p.affine) { + caller.exec_with_ws(inputs[0], inputs[1], inputs[2], out, mean, rstd); + } else { + megdnn::TensorND empty_dnn; + caller.exec_with_ws(inputs[0], empty_dnn, empty_dnn, out, mean, rstd); + } return {out, mean, rstd}; } @@ -105,4 +94,4 @@ OP_TRAIT_REG(LayerNorm, LayerNorm) .fallback(); } // namespace layer_norm -} // namespace mgb::imperative \ No newline at end of file +} // namespace mgb::imperative diff --git a/imperative/src/impl/ops/matmul.cpp b/imperative/src/impl/ops/matmul.cpp index 3b1cdb62..5fb4d199 100644 --- a/imperative/src/impl/ops/matmul.cpp +++ b/imperative/src/impl/ops/matmul.cpp @@ -24,7 +24,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { auto dim1 = matmul.dimA, dim2 = matmul.dimB; auto cn = inputs[0]->comp_node(); - using Desc = opr::AxisAddRemove::AxisDesc; using IndexDesc = opr::Subtensor::IndexDesc; OperatorNodeConfig config{matmul.make_name(), cn}; @@ -104,9 +103,8 @@ std::tuple, bool> infer_output_attrs_fallible( dim1 = dim2 = 2; } - DnnOprCaller dnn_opr(inputs[0].comp_node); - dnn_opr.op->param() = matmul.param(); - dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); + DnnOprHelper dnn_opr(matmul.param()); + dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); if (dim1 == 0 || dim2 == 0) { return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; @@ -143,8 +141,7 @@ SmallVector apply_on_physical_tensor( SmallVector inp_tensornds(inputs.size()); TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); - DnnOprCaller dnn_opr(cn); - dnn_opr.op->param() = matmul.param(); + DnnOprCaller dnn_opr(cn, matmul.param(), matmul.policy()); if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward for (size_t i = 1; i + 1 < layout1.ndim; ++i) { @@ -160,7 +157,7 @@ SmallVector apply_on_physical_tensor( } DType dst_dtype; - dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); + dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); // only matters when layout1 has dim 2 if (matmul.transposeA) @@ -229,13 +226,8 @@ SmallVector apply_on_physical_tensor( inp_tensornds[0].layout = layout_a; inp_tensornds[1].layout = layout_b; } - size_t sz = setup_algo( - {layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, - matmul.policy(), false, &inp_tensornds); auto out = Tensor::make(dst_layout, cn); - auto dnn_wk = dnn_opr.create_workspace(sz); - - dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); + dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); return {out->sub(0, real_dst_layout)}; } @@ -266,7 +258,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { auto dim1 = matmul.dimA, dim2 = matmul.dimB; auto cn = inputs[0]->comp_node(); - using Desc = opr::AxisAddRemove::AxisDesc; using IndexDesc = opr::Subtensor::IndexDesc; OperatorNodeConfig config{matmul.make_name(), cn}; @@ -343,9 +334,8 @@ std::tuple, bool> infer_output_attrs_fallible( DType dst_dtype; - DnnOprCaller dnn_opr(inputs[0].comp_node); - dnn_opr.op->param() = matmul.param(); - dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); + DnnOprHelper dnn_opr(matmul.param()); + dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); if (dim1 == 0 || dim2 == 0) { return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; @@ -386,10 +376,9 @@ SmallVector 
apply_on_physical_tensor( TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); size_t dim1 = layout1.ndim, dim2 = layout2.ndim; - DnnOprCaller dnn_opr(cn); - dnn_opr.op->param() = matmul.param(); + DnnOprCaller dnn_opr(cn, matmul.param(), matmul.policy()); DType dst_dtype; - dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); + dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); TensorShape tshp, batch_shp; size_t j = 0; @@ -473,14 +462,9 @@ SmallVector apply_on_physical_tensor( inp_tensornds[1] = inp2->dnn_tensor(); inp_tensornds[1].layout = layout2; - size_t sz = setup_algo( - {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, - matmul.policy(), false, &inp_tensornds); - auto out = Tensor::make(dst_layout, cn); - auto dnn_wk = dnn_opr.create_workspace(sz); - dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); + dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; @@ -533,7 +517,7 @@ SmallVector apply_on_physical_tensor( TensorLayout oup_layout{inputs[0]->dtype()}; auto inp1_tensor = inputs[0]->dnn_tensor(); auto inp2_tensor = inputs[1]->dnn_tensor(); - dnn_opr.op->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout); + oup_layout = dnn_opr.deduce_layout(inp1_tensor.layout, inp2_tensor.layout); if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) { auto out = Tensor::make(oup_layout, comp_node); @@ -543,14 +527,8 @@ SmallVector apply_on_physical_tensor( return {out}; } - auto sz = dnn_opr.op->get_workspace_in_bytes( - inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout); - auto out = Tensor::make(oup_layout, comp_node); - - auto dnn_wk = dnn_opr.create_workspace(sz); - - dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); + dnn_opr.exec_with_ws(inp_tensornds[0], inp_tensornds[1], out); return {out}; } diff --git a/imperative/src/impl/ops/misc.cpp b/imperative/src/impl/ops/misc.cpp index b72f4eb2..2cab95b2 100644 --- a/imperative/src/impl/ops/misc.cpp +++ b/imperative/src/impl/ops/misc.cpp @@ -17,27 +17,18 @@ SymbolVarArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { - size_t size = inputs.size(); auto&& op = def.cast_final_safe(); - SmallVector outputs(size + 1); - outputs[size] = Tensor::make( - TensorLayout(TensorShape({1}), dtype::Int32()), inputs[0]->comp_node()); - - auto dest = outputs[size]; - auto cn = dest->comp_node(); - DnnOprCaller dnn_opr(cn); - SmallVector srcs(size); - // copy an outputs to the dnn for inplace - for (size_t i = 0; i < size; ++i) { - outputs[i] = Tensor::make(inputs[i]->layout(), inputs[0]->comp_node()); - outputs[i]->dev_tensor().copy_from_fixlayout(inputs[i]->dev_tensor()); - srcs[i] = outputs[i]->dev_tensor().as_megdnn(); + auto comp_node = inputs[0]->comp_node(); + auto dest = Tensor::make(TensorLayout({1}, dtype::Int32()), comp_node); + SmallVector outputs; + outputs.reserve(inputs.size() + 1); + for (auto&& input : inputs) { + outputs.push_back(Tensor::make(input->layout(), comp_node)); + outputs.back()->dev_tensor().copy_from_fixlayout(input->dev_tensor()); } - megdnn::CheckNonFinite::Param param({op.scale}); - dnn_opr.op->param() = param; - size_t sz = dnn_opr.op->get_workspace_in_bytes(srcs, 
dest->layout()); - auto dnn_wk = dnn_opr.create_workspace(sz); - dnn_opr.op->exec(srcs, dest->dnn_tensor(), dnn_wk); + DnnOprCaller dnn_opr(comp_node, {op.scale}); + dnn_opr.exec_with_ws(outputs, dest); + outputs.push_back(dest); return outputs; } @@ -45,13 +36,15 @@ std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { size_t size = inputs.size(); SmallVector dests(size + 1); + bool validated = true; for (size_t i = 0; i < size; ++i) { dests[i].comp_node = inputs[i].comp_node; dests[i].layout = inputs[i].layout; + validated &= bool(dests[i].layout.ndim); } dests[size].comp_node = inputs[0].comp_node; - dests[size].layout = TensorLayout(TensorShape({1}), dtype::Int32()); - return {dests, true}; + dests[size].layout = TensorLayout({1}, dtype::Int32()); + return {dests, validated}; } OP_TRAIT_REG(CheckNonFinite, CheckNonFinite) diff --git a/imperative/src/impl/ops/padding.cpp b/imperative/src/impl/ops/padding.cpp index d668f023..b34c3254 100644 --- a/imperative/src/impl/ops/padding.cpp +++ b/imperative/src/impl/ops/padding.cpp @@ -27,40 +27,31 @@ SmallVector apply_on_physical_tensor( SmallVector& output_descs, const bool& validated) { auto comp_node = inputs[0]->comp_node(); auto&& op_def = def.cast_final_safe(); - - DnnOprCaller dnn_op(comp_node); - dnn_op.op->param() = op_def.param(); - - TensorLayout dst = output_descs[0].layout; - if (!validated) { - megdnn::Padding::deduce_layout_impl( - inputs[0]->dnn_tensor().layout, dst, op_def.param()); - } - - DeviceTensorND out = - BlobManager::inst()->alloc_workspace_with_defrag(comp_node, dst); - - dnn_op.op->exec(inputs[0]->dnn_tensor(), out.as_megdnn()); - - return {Tensor::make(out)}; + DnnOprCaller dnn_op(comp_node, op_def.param()); + auto dst = [&] { + if (validated) { + return output_descs[0].layout; + } else { + return dnn_op.deduce_layout(inputs[0]->layout()); + } + }(); + auto out = Tensor::make(dst, comp_node); + dnn_op.exec(inputs[0], out); + return {out}; } std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { auto&& op_def = def.cast_final_safe(); - size_t nr_inp = inputs.size(); - auto p = op_def.param(); - auto&& inp = inputs[0]; - auto& inp_cn = inp.comp_node; if (inp.layout.ndim == 0) { - return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; + return {{{TensorLayout{inp.layout.dtype}, inp.comp_node, {}}}, false}; } - TensorLayout oup_layout; - megdnn::Padding::deduce_layout_impl(inp.layout, oup_layout, p); - return {{{oup_layout, inp_cn, {}}}, true}; + DnnOprHelper dnn_op(op_def.param()); + auto oup_layout = dnn_op.deduce_layout(inp.layout); + return {{{oup_layout, inp.comp_node}}, true}; } OP_TRAIT_REG(Padding, Padding, opr::Padding) @@ -74,4 +65,4 @@ OP_TRAIT_REG(Padding, Padding, opr::Padding) } // namespace imperative } // namespace mgb -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} \ No newline at end of file +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/ops/pooling.cpp b/imperative/src/impl/ops/pooling.cpp index 0c8dc25d..66dfbc21 100644 --- a/imperative/src/impl/ops/pooling.cpp +++ b/imperative/src/impl/ops/pooling.cpp @@ -25,19 +25,13 @@ std::tuple, bool> infer_output_attrs_fallible( mgb_assert( inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", inputs.size()); - auto&& op_def = def.cast_final_safe(); - auto&& inp = inputs[0]; - auto& inp_cn = inp.comp_node; - - if (inp.layout.ndim == 0) { - return {{{TensorLayout{inp.layout.dtype}, 
inp_cn, {}}}, false}; + if (!inputs[0].layout.ndim) { + return {{{inputs[0].layout, inputs[0].comp_node}}, false}; } - - TensorLayout oup_layout; - megdnn::Pooling::deduce_layout_impl(inp.layout, op_def.param(), oup_layout); - - return {{{oup_layout, inp_cn, {}}}, true}; + DnnOprHelper dnn_opr(op_def.param()); + auto oup_layout = dnn_opr.deduce_layout(inputs[0].layout); + return {{{oup_layout, inputs[0].comp_node}}, true}; } SmallVector apply_on_physical_tensor( @@ -47,30 +41,18 @@ SmallVector apply_on_physical_tensor( inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", inputs.size()); - auto&& op_def = def.cast_final_safe(); + auto&& pooling = def.cast_final_safe(); auto cn = inputs[0]->comp_node(); - DnnOprCaller caller(cn); - auto&& dnn_opr = caller.op; - dnn_opr->param() = op_def.param(); - - SmallVector inp_tensornds(inputs.size()); - inp_tensornds[0] = inputs[0]->dnn_tensor(); - - TensorLayout& oup_layout = output_descs[0].layout; - if (!validated) { - megdnn::Pooling::deduce_layout_impl( - inp_tensornds[0].layout, op_def.param(), oup_layout); - } - - size_t wk_size = setup_algo( - {inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, - op_def.policy(), false, &inp_tensornds); - + DnnOprCaller dnn_opr(cn, pooling.param(), pooling.policy()); + auto oup_layout = [&] { + if (validated) { + return output_descs[0].layout; + } else { + return dnn_opr.deduce_layout(inputs[0]->layout()); + } + }(); auto out = Tensor::make(oup_layout, cn); - - auto dnn_wk = caller.create_workspace(wk_size); - - caller.op->exec(inp_tensornds[0], out->dnn_tensor(), dnn_wk); + dnn_opr.exec_fastrun(inputs[0], out); return {out}; } diff --git a/imperative/src/impl/ops/reduce.cpp b/imperative/src/impl/ops/reduce.cpp index 3d4d8143..102f7429 100644 --- a/imperative/src/impl/ops/reduce.cpp +++ b/imperative/src/impl/ops/reduce.cpp @@ -18,33 +18,31 @@ namespace reduce { auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { auto&& reduce = static_cast(def); auto comp_node = inputs[0]->comp_node(); - OperatorNodeConfig config{reduce.make_name(), comp_node, inputs[0]->dtype()}; + auto name = reduce.make_name(); - if (inputs.size() > 1) { - return opr::Reduce::make(inputs[0], reduce.param(), inputs[1], config); - } - - using Param = megdnn::param::Reduce; auto param = reduce.param(); - if (param.axis < 0) { - param.axis = inputs[0]->shape().ndim + param.axis; + auto axis = param.axis; + auto keepdim = reduce.keepdim; + + if (inputs.size() == 2) { + return opr::Reduce::make(inputs[0], param, inputs[1], {name}); } + mgb_assert(inputs.size() == 1); - SymbolVar target_shape = (cg::VarNode*)nullptr; - if (param.axis == INT_MAX) { - DTypeScalar vi{1}; - // auto graph = ComputingGraph::make(); + if (axis == INT_MAX) { + // keepdim could be ignored when ndim == 1 auto graph = inputs[0]->owner_graph(); - target_shape = opr::ImmutableTensor::make(*graph, vi, config); + auto scalar_shape = + opr::ImmutableTensor::make(*graph, DTypeScalar(1), {name, comp_node}); + return opr::Reduce::make(inputs[0], param, scalar_shape, {name}); } - auto res = opr::Reduce::make(inputs[0], param, target_shape, config); - if (!reduce.keepdim && param.axis != INT_MAX) { + // mgb::opr::Reduce supports negative axis + auto res = opr::Reduce::make(inputs[0], param, {}, {name}); + if (!keepdim) { using Desc = opr::AxisAddRemove::AxisDesc; - std::vector remove_param; - remove_param.push_back(Desc::make_remove(param.axis)); - OperatorNodeConfig remove_config{ - def.make_name(), comp_node, 
inputs[0]->dtype()}; - return opr::AxisAddRemove::make(res, remove_param, remove_config); + std::vector remove_axis_param; + remove_axis_param.push_back(Desc::make_remove(axis)); + res = opr::AxisAddRemove::make(res, remove_axis_param, {name}); } return res; } @@ -71,111 +69,104 @@ bool memory_forward_success(const OpDef& def, SmallVector inputs) { SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { + // memory forward if (memory_forward_success(def, inputs)) { + // maybe returns inputs[0] directly return {Tensor::make( inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())}; } - auto size = inputs.size(); - if (size > 1) { + if (inputs.size() == 2) { + // reduce to target shape, fallback to proxy_graph return proxy_graph_detail::apply_on_physical_tensor( def, inputs, output_descs, validated); } + mgb_assert(inputs.size() == 1); auto comp_node = inputs[0]->comp_node(); - using TensorND = megdnn::TensorND; auto&& op_def = def.cast_final_safe(); - SmallVector inp_tensornds; - inp_tensornds.reserve(inputs.size()); - auto src = inputs[0]->layout(); - - DnnOprCaller dnn_op(comp_node); - dnn_op.op->param() = op_def.param(); - auto axis = op_def.param().axis; + DnnOprCaller dnn_op(comp_node, op_def.param()); + auto&& mode = dnn_op.param().mode; + auto& axis = dnn_op.param().axis; auto keepdim = op_def.keepdim; - if (axis < 0) { - axis = inputs[0]->layout().ndim + axis; - } - - dnn_op.op->param().axis = axis == INT_MAX ? 0 : axis; - - if (axis == INT_MAX) { - src.shape[0] = src.total_nr_elems(); - src.ndim = 1; - src.init_contiguous_stride(); - } - TensorLayout layout{src.dtype}; - dnn_op.op->deduce_layout(src, layout); - - if (inputs[0]->layout().is_empty()) { - inputs[0]->dev_tensor().reset(inputs[0]->dev_tensor().storage(), src); - - auto mode = op_def.param().mode; - - if (!keepdim && src.ndim > 1) { - layout.remove_axis_inplace(axis); - layout.init_contiguous_stride(); + DnnTensorND dnn_input = [&] { + if (axis == INT_MAX) { // reduce to scalar + axis = 0; + // flatten input + return inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()}); + } else { + if (axis < 0) { + axis = inputs[0]->layout().ndim + axis; + } + mgb_assert(axis >= 0 && axis < inputs[0]->layout().ndim); + return inputs[0]->dnn_tensor(); } - auto out = Tensor::make(layout, comp_node); + }(); + auto output_layout = dnn_op.deduce_layout(dnn_input.layout); + auto resolve_keepdim = [&] { + if (!keepdim) { + if (output_layout.ndim > 1) { + mgb_assert(output_layout.shape[axis] == 1); + output_layout.remove_axis_inplace(axis); + } + } + }; - std::string err_msg; + TensorPtr output; + if (output_layout.is_empty()) { + // output empty, no computation + resolve_keepdim(); + output = Tensor::make(output_layout, comp_node); + } else if (dnn_input.layout.is_empty()) { + // input empty but output not, do fill + resolve_keepdim(); + output = Tensor::make(output_layout, comp_node); + auto on_bad_empty_reduce = [](const char* name) { + mgb_throw( + MegBrainError, "empty input is not allowed for reduce mode: %s", + name); + }; switch (mode) { case Reduce::Mode::SUM: - if (!out->empty()) { - dev_tensor_memset(out->dev_tensor(), 0); - } + // fill 0 + dev_tensor_memset(output->dev_tensor(), 0); break; - case Reduce::Mode::PRODUCT: - if (!out->empty()) { - DnnOprCaller fill_op(comp_node); - fill_op.op->param() = 1; - fill_op.op->exec(out->dnn_tensor(), {}); - } + case Reduce::Mode::PRODUCT: { + // fill 1 + DnnOprCaller fill_op(comp_node, {1}); + 
fill_op.exec_with_ws(output); break; + } case Reduce::Mode::MEAN: - err_msg = "mean"; + on_bad_empty_reduce("mean"); break; case Reduce::Mode::MIN: - err_msg = "min"; + on_bad_empty_reduce("min"); break; case Reduce::Mode::MAX: - err_msg = "max"; + on_bad_empty_reduce("max"); break; case Reduce::Mode::SUM_SQR: - err_msg = "sum_sqr"; + on_bad_empty_reduce("sum_sqr"); break; default: mgb_throw(MegBrainError, "bad reduce mode"); } - if (!err_msg.empty()) { - mgb_throw( - MegBrainError, "empty input is not allowed for reduce mode: %s", - err_msg.c_str()); + } else { + // common reduction + if (keepdim) { + output = Tensor::make(output_layout, comp_node); + dnn_op.exec_with_ws(dnn_input, output); + } else { + // used by megdnn::exec + auto output_layout_keepdim = output_layout; + resolve_keepdim(); + output = Tensor::make(output_layout, comp_node); + dnn_op.exec_with_ws(dnn_input, output->dnn_tensor(output_layout_keepdim)); } - return {out}; } - - auto dnn_ten = inputs[0]->dnn_tensor(); - dnn_ten.layout = src; - inp_tensornds.push_back(dnn_ten); - - auto wk_size = dnn_op.op->get_workspace_in_bytes(src, layout); - auto dnn_wk = dnn_op.create_workspace(wk_size); - TensorLayout ori_layout = layout; - - if (!keepdim && src.ndim > 1) { - layout.remove_axis_inplace(axis); - layout.init_contiguous_stride(); - } - - auto out = Tensor::make(layout, comp_node); - auto dnn_out = out->dnn_tensor(); - dnn_out.layout = ori_layout; - - dnn_op.op->exec(inp_tensornds[0], dnn_out, dnn_wk); - - return {out}; + return {output}; } std::tuple, bool> infer_output_attrs_fallible( @@ -184,16 +175,12 @@ std::tuple, bool> infer_output_attrs_fallible( auto axis = op_def.param().axis; auto keepdim = op_def.keepdim; - size_t size = inputs.size(); - SmallVector dests(size); + mgb_assert(inputs.size() > 0); + auto&& comp_node = inputs[0].comp_node; + auto&& input_layout = inputs[0].layout; - for (size_t i = 0; i < size; i++) { - if (inputs[i].layout.ndim == 0) { - return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}}, - false}; - } - } - if (size > 1) { + if (inputs.size() == 2) { + // fallback to proxy_graph, matters on backward auto [output_descs, validated] = proxy_graph_detail::infer_output_attrs_fallible(def, inputs); if (!inputs[1].value.empty()) { @@ -203,30 +190,37 @@ std::tuple, bool> infer_output_attrs_fallible( return {output_descs, validated}; } + mgb_assert(inputs.size() == 1); + + if (axis == INT_MAX) { + // reduce to scalar + // ignore keepdim because ndim is 1 + auto&& dtype = input_layout.dtype; + auto&& format = input_layout.format; + auto output_layout = TensorLayout{{1}, dtype, format}; + return {{{output_layout, comp_node}}, true}; + } + + if (input_layout.ndim == 0) { + // shape incomplete + return {{{TensorLayout(input_layout.dtype, input_layout.format), comp_node}}, + false}; + } + if (axis < 0) { - axis = inputs[0].layout.ndim + axis; + axis = input_layout.ndim + axis; } + mgb_assert(axis >= 0 && axis < input_layout.ndim); - if (axis == INT_MAX || inputs[0].layout.ndim == 1) { - TensorLayout layout{inputs[0].layout.dtype}; - layout.shape[0] = 1; - layout.ndim = 1; - dests[0].layout = layout; - dests[0].comp_node = inputs[0].comp_node; + TensorLayout output_layout = input_layout; + bool remove_axis = (!keepdim) && input_layout.ndim > 1; + if (remove_axis) { + output_layout.remove_axis_inplace(axis); } else { - for (size_t i = 0; i < size; ++i) { - dests[i].comp_node = inputs[i].comp_node; - dests[i].layout = inputs[i].layout; - if (!keepdim && dests[i].layout.ndim > 1) { - 
dests[i].layout.remove_axis_inplace(axis); - } else { - dests[i].layout.shape[axis] = 1; - } - dests[i].layout.init_contiguous_stride(); - } + output_layout.shape[axis] = 1; } - - return {dests, true}; + output_layout.init_contiguous_stride(); + return {{{output_layout, comp_node}}, true}; } SmallVector get_input_layout_constraint( diff --git a/imperative/src/impl/ops/tensor_manip.cpp b/imperative/src/impl/ops/tensor_manip.cpp index acb0ad46..7afc86df 100644 --- a/imperative/src/impl/ops/tensor_manip.cpp +++ b/imperative/src/impl/ops/tensor_manip.cpp @@ -230,31 +230,19 @@ SmallVector param_pack_concat_apply_on_physical_tensor( } auto dest_layout = TensorLayout({nr_elems}, dtype); auto output = Tensor::make(dest_layout, comp_node); - auto caller = DnnOprCaller(comp_node); - size_t srcs_size = sizeof(void*) * nr_inputs; - void** srcs_raw_ptr = (void**)comp_node.alloc_host(srcs_size); - std::shared_ptr srcs_ptr = { - (dt_byte*)srcs_raw_ptr, - [comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }}; + // FIXME: add param to ParamPackConcat + DnnOprCaller caller{comp_node}; + HostTensorStorage srcs_storage{comp_node}; + srcs_storage.ensure_size(sizeof(void*) * nr_inputs); TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()}; - size_t ws_size; - { - TensorShapeArray src_shapes; - for (size_t i = 0; i < nr_inputs; ++i) { - src_shapes.push_back(inputs[i]->shape()); - } - ws_size = caller.op->get_workspace_in_bytes( - src_shapes, inputs.back()->shape(), TensorShape{}); - } + HostTensorND srcs_tensornd; + srcs_tensornd.reset(srcs_storage, srcs_layout); + auto* srcs_raw_ptr = reinterpret_cast(srcs_storage.ptr()); for (size_t i = 0; i < nr_inputs; ++i) { - srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr(); + srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr(); } - HostTensorStorage srcs_storage; - srcs_storage.reset(comp_node, srcs_size, srcs_ptr); - caller.op->exec( - {srcs_raw_ptr, srcs_layout}, inputs.back()->dnn_tensor(), - output->dnn_tensor(), caller.create_workspace(ws_size)); - async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage)); + caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output); + async_release(srcs_tensornd); return {output}; } diff --git a/imperative/src/impl/ops/vision.cpp b/imperative/src/impl/ops/vision.cpp index 33dbd039..fd56ba12 100644 --- a/imperative/src/impl/ops/vision.cpp +++ b/imperative/src/impl/ops/vision.cpp @@ -33,69 +33,39 @@ VarNodeArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { std::tuple, bool> infer_output_attrs_fallible( const OpDef& def, const SmallVector& inputs) { - auto&& op = static_cast(def); - if (inputs[0].layout.is_empty() || inputs[1].layout.is_empty()) { - return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}, - {TensorLayout(dtype::Int32()), inputs[1].comp_node}}, - false}; - } - - SmallVector descs(2u); - size_t n = inputs[1].layout[0]; - size_t c = inputs[0].layout[1]; - descs[0].layout = TensorLayout( - {n, c, op.pooled_height, op.pooled_width}, inputs[0].layout.dtype); - descs[0].layout.init_contiguous_stride(); - descs[0].comp_node = inputs[0].comp_node; - - descs[1].layout = - TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); - descs[1].layout.init_contiguous_stride(); - descs[1].comp_node = descs[0].comp_node; - - return {descs, true}; + auto&& op = def.cast_final_safe(); + DnnOprHelper dnn_opr(op.param()); + auto cn = inputs[0].comp_node; + auto&& [out_layout, ind_layout] = + 
dnn_opr.deduce_layouts<2>(inputs[0].layout, inputs[1].layout); + bool validated = out_layout.ndim == 0 && ind_layout.ndim == 0; + return {{{out_layout, cn}, {ind_layout, cn}}, validated}; } SmallVector apply_on_physical_tensor( const OpDef& def, const SmallVector& inputs, SmallVector& output_descs, const bool& validated) { - auto&& op = static_cast(def); - CompNode cn = inputs[0]->comp_node(); + auto&& op = def.cast_final_safe(); + auto cn = inputs[0]->comp_node(); - TensorLayout out_layout = output_descs[0].layout; - TensorLayout ind_layout = output_descs[1].layout; - if (!validated) { - size_t n = inputs[1]->layout()[0]; - size_t c = inputs[0]->layout()[1]; - out_layout = TensorLayout( - {n, c, op.pooled_height, op.pooled_width}, inputs[0]->layout().dtype); - out_layout.init_contiguous_stride(); - ind_layout = - TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); - ind_layout.init_contiguous_stride(); - } + DnnOprCaller dnn_opr(cn, op.param()); + auto&& [out_layout, ind_layout] = [&]() -> std::array { + if (validated) { + return {output_descs[0].layout, output_descs[1].layout}; + } else { + return dnn_opr.deduce_layouts<2>(inputs[0]->layout(), inputs[1]->layout()); + } + }(); - DeviceTensorND out = - BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); - DeviceTensorND inds = - BlobManager::inst()->alloc_workspace_with_defrag(cn, ind_layout); + auto out = Tensor::make(out_layout, cn); + auto ind = Tensor::make(ind_layout, cn); if (out_layout.is_empty() || ind_layout.is_empty()) { - return {Tensor::make(out), Tensor::make(inds)}; + return {out, ind}; } - DnnOprCaller dnn_opr(cn); - dnn_opr.op->param() = op.param(); - - size_t sz = dnn_opr.op->get_workspace_in_bytes( - inputs[0]->layout(), inputs[1]->layout(), out_layout, ind_layout); - - auto dnn_wk = dnn_opr.create_workspace(sz); - - dnn_opr.op->exec( - inputs[0]->dnn_tensor(), inputs[1]->dnn_tensor(), out.as_megdnn(), - inds.as_megdnn(), dnn_wk); - return {Tensor::make(out), Tensor::make(inds)}; + dnn_opr.exec_with_ws(inputs[0], inputs[1], out, ind); + return {out, ind}; } SmallVector get_input_layout_constraint( diff --git a/imperative/src/impl/physical_tensor.cpp b/imperative/src/impl/physical_tensor.cpp index 4a492aac..d17965fb 100644 --- a/imperative/src/impl/physical_tensor.cpp +++ b/imperative/src/impl/physical_tensor.cpp @@ -570,11 +570,17 @@ bool Tensor::empty() { return !m_blob->size(); } -megdnn::TensorND Tensor::dnn_tensor() { +DnnTensorND Tensor::dnn_tensor() { mgb_assert(m_blob, "uninitialized tensor."); + mgb_assert(m_layout.ndim, "dnn don't support scalar"); return DnnTensorND{m_layout, m_blob->storage(), m_offset}; } +DnnTensorND Tensor::dnn_tensor(TensorShape new_shape) { + mgb_assert(m_blob, "uninitialized tensor."); + return DnnTensorND{m_layout.reshape(new_shape), m_blob->storage(), m_offset}; +} + void Tensor::fetch_value() { MGB_LOCK_GUARD(m_value_mtx); if (m_value.empty()) { diff --git a/imperative/src/impl/proxy_graph/mini_graph.h b/imperative/src/impl/proxy_graph/mini_graph.h index 423bd74a..32415c3e 100644 --- a/imperative/src/impl/proxy_graph/mini_graph.h +++ b/imperative/src/impl/proxy_graph/mini_graph.h @@ -334,9 +334,16 @@ public: size_t j = 0; for (auto&& var : m_opr->output()) { if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { - TensorLayout layout{var->shape(), var->dtype(), var->format()}; - var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag( - var->comp_node(), layout); + auto comp_node = var->comp_node(); + auto dtype = var->dtype(); + 
auto&& shape = var->shape(); + size_t size = dtype.size(shape.total_nr_elems()); + mgb_assert( + var->format().is_default(), "non default format for workspace"); + auto raw_storage = Blob::make(comp_node, size)->storage(); + DeviceTensorStorage storage; + storage.reset(comp_node, size, raw_storage); + var->m_dev_tensor.reset(storage, {shape, dtype}); } else { mgb_assert(j < outputs.size()); auto&& tensor = outputs[j]; diff --git a/imperative/src/include/megbrain/imperative/blob_manager.h b/imperative/src/include/megbrain/imperative/blob_manager.h index b3a1c911..7ca87c92 100644 --- a/imperative/src/include/megbrain/imperative/blob_manager.h +++ b/imperative/src/include/megbrain/imperative/blob_manager.h @@ -1,6 +1,7 @@ #pragma once #include "megbrain/imperative/physical_tensor.h" +#include "megbrain/imperative/utils/helper.h" namespace mgb { namespace imperative { @@ -15,13 +16,19 @@ public: virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0; + virtual bool try_alloc_direct(OwnedBlob* blob, size_t size) { + try { + alloc_direct(blob, size); + return true; + } catch (MemAllocError&) { + return false; + } + } + virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0; virtual void set_allocator(allocator_t allocator) = 0; - virtual DeviceTensorND alloc_workspace_with_defrag( - CompNode cn, TensorLayout& layout) = 0; - virtual void register_blob(OwnedBlob* blob) = 0; virtual void unregister_blob(OwnedBlob* blob) = 0; diff --git a/imperative/src/include/megbrain/imperative/physical_tensor.h b/imperative/src/include/megbrain/imperative/physical_tensor.h index 2085e723..ebc8fed1 100644 --- a/imperative/src/include/megbrain/imperative/physical_tensor.h +++ b/imperative/src/include/megbrain/imperative/physical_tensor.h @@ -89,24 +89,19 @@ using EventPtr = std::unique_ptr; class Tensor; using TensorPtr = std::shared_ptr; -/* - using DnnTensorND to save the reference count of workspace - allocted by blobmanager to prevent invalidation -*/ struct DnnTensorND : megdnn::TensorND { -private: - std::shared_ptr m_reference; + // hold extra reference to prevent defrag while in use + std::shared_ptr reference; -public: - DnnTensorND(TensorLayout& layout_, std::shared_ptr ref_ptr, size_t offset) - : megdnn::TensorND(layout_, {ref_ptr.get(), offset}) { - m_reference = ref_ptr; + DnnTensorND( + const TensorLayout& layout_, std::shared_ptr ptr, size_t offset) + : megdnn::TensorND(layout_, {ptr.get(), offset}) { + reference = std::move(ptr); } }; class Tensor : public NonCopyableObj { public: - Tensor() = default; Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0, const HostTensorND& hv = {}); Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {}) @@ -154,7 +149,9 @@ public: void assign_from_dev_tensor(DeviceTensorND); - megdnn::TensorND dnn_tensor(); + DnnTensorND dnn_tensor(); + + DnnTensorND dnn_tensor(TensorShape new_shape); static TensorPtr make_scalar(DTypeScalar value, CompNode cn); diff --git a/imperative/src/include/megbrain/imperative/utils/helper.h b/imperative/src/include/megbrain/imperative/utils/helper.h index c646816b..8fbefcc5 100644 --- a/imperative/src/include/megbrain/imperative/utils/helper.h +++ b/imperative/src/include/megbrain/imperative/utils/helper.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "megbrain/utils/metahelper.h" @@ -14,11 +15,28 @@ namespace imperative { template > class CleanupGuard : public NonCopyableObj { private: - T m_callback; + std::optional m_callback; public: + CleanupGuard() = default; 
explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {} - ~CleanupGuard() { m_callback(); } + ~CleanupGuard() { reset(); } + CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) { + rhs.m_callback.reset(); + } + CleanupGuard& operator=(CleanupGuard&& rhs) { + swap(m_callback, rhs.m_callback); + rhs.reset(); + return *this; + } + +public: + void reset() { + if (m_callback) { + (*m_callback)(); + m_callback.reset(); + } + } }; inline std::string quoted(std::string str) { @@ -33,6 +51,19 @@ inline std::string quoted(std::string str) { std::call_once(_once_flag, [&] { __VA_ARGS__; }); \ } while (false) +template +struct is_small_vector { + static constexpr bool value = false; +}; + +template +struct is_small_vector> { + static constexpr bool value = true; +}; + +template +static constexpr bool is_small_vector_v = is_small_vector::value; + } // namespace imperative } // namespace mgb diff --git a/imperative/src/include/megbrain/imperative/utils/platform.h b/imperative/src/include/megbrain/imperative/utils/platform.h index 89685b5b..843e229d 100644 --- a/imperative/src/include/megbrain/imperative/utils/platform.h +++ b/imperative/src/include/megbrain/imperative/utils/platform.h @@ -6,4 +6,10 @@ namespace mgb::imperative { std::string demangle(std::string mangled); +template +const char* demangled_typename() { + static auto name = demangle(typeid(T).name()); + return name.c_str(); } + +} // namespace mgb::imperative diff --git a/src/opr/impl/misc.cpp b/src/opr/impl/misc.cpp index 59ba67b1..3b45f822 100644 --- a/src/opr/impl/misc.cpp +++ b/src/opr/impl/misc.cpp @@ -314,7 +314,8 @@ void CondTake::init_output_static_infer_desc() { auto dtype = input(0)->dtype(); TensorLayout ily(iv.val[0].shape(), dtype); dest.ndim = 1; - dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily); + TensorLayout mly(iv.val[0].shape(), dtype::Int32()); + dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily, mly); return true; }; owner_graph()->static_infer_manager().register_shape_infer( @@ -548,9 +549,9 @@ void CheckNonFinite::init_output_static_infer_desc() { auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { dest.ndim = 1; - megdnn::TensorNDArray inp_arr(input().size()); + SmallVector inp_arr(input().size()); for (size_t i = 0; i < input().size(); ++i) { - inp_arr[i] = {NULL, {inp.val.at(i).shape(), input(0)->dtype()}}; + inp_arr[i] = {inp.val.at(i).shape(), input(0)->dtype()}; } dest.shape[0] = megdnn_opr()->get_workspace_in_bytes( inp_arr, {output(input().size() + 1)->shape(), diff --git a/src/opr/impl/tensor_manip.cpp b/src/opr/impl/tensor_manip.cpp index 45796a69..8932104f 100644 --- a/src/opr/impl/tensor_manip.cpp +++ b/src/opr/impl/tensor_manip.cpp @@ -1447,11 +1447,8 @@ void ParamPackConcat::init_output_static_infer_desc() { auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { TensorShapeArray shapes; auto vals = inp.val; - shapes.reserve(vals.size() - 1); - for (size_t i = 0; i < vals.size() - 1; i++) { - shapes.push_back(vals[i].shape()); - } - dest = {m_opr->get_workspace_in_bytes(shapes, vals.back().shape(), dest)}; + size_t nr_params = vals.size() - 1; + dest = {m_opr->get_workspace_in_bytes({nr_params}, vals.back().shape(), dest)}; return true; }; mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out}); diff --git a/src/rdnn/impl/algo_chooser.cpp b/src/rdnn/impl/algo_chooser.cpp index 9630b421..8bc26fb4 100644 --- a/src/rdnn/impl/algo_chooser.cpp +++ b/src/rdnn/impl/algo_chooser.cpp @@ -970,8 +970,9 @@ void 
AlgoChooser::AlgoChooserHelper::profile( if (!policy.algo.valid()) continue; size_t workspace_needed = get_workspace_size_bytes(policy); - if (m_inputs != nullptr) + if (m_inputs == nullptr) { workspace_needed += data_size; + } if (workspace_needed > m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { continue; diff --git a/tools/format.py b/tools/format.py index e2d6921f..0e6ce625 100755 --- a/tools/format.py +++ b/tools/format.py @@ -18,7 +18,8 @@ failed_files = Manager().list() def process_file(file, clang_format, write): - source = open(file, "r").read() + original_source = open(file, "r").read() + source = original_source source = re.sub(r"MGB_DEFINE(?P([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g{", source) source, count = re.subn(r"(?