GitOrigin-RevId: 402cba209a
HuaHua404-patch-4
@@ -397,7 +397,8 @@ public:
     OutputDType infer_dtype(DType data, DType mask);
-    virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0;
+    virtual size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) = 0;
     virtual Output exec(
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
@@ -512,7 +513,8 @@ public:
     virtual void exec(
             _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayoutArray& srcs, TensorLayout& dst);
     virtual size_t get_workspace_in_bytes(
             const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;
@@ -596,7 +598,7 @@ public:
             _megdnn_workspace workspace) = 0;
     virtual size_t get_workspace_in_bytes(
-            const TensorShapeArray& srcs, const TensorShape& offsets,
+            const TensorShape& srcs, const TensorShape& offsets,
             const TensorShape& dst) = 0;
 };
@@ -1145,7 +1147,7 @@ protected:
     /*!
     * \return axis on dst used by indexer (i.e. ExecInfo::idx_axis)
     */
-    static size_t deduce_layout_fwd(
+    MGE_WIN_DECLSPEC_FUC static size_t deduce_layout_fwd(
             const TensorLayout& data, const IndexDescLayoutOnly& index,
             TensorLayout& dst);
@@ -1362,9 +1364,10 @@ class CheckNonFinite : public OperatorBase {
 public:
     virtual size_t get_workspace_in_bytes(
-            const TensorNDArray& srcs, const TensorLayout& dst) = 0;
-    void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
+            const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayoutArray& srcs, TensorLayout& dst);
     virtual void exec(
             _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
@@ -1420,7 +1423,7 @@ public:
     }
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& src, const TensorLayout& dst) = 0;
-    void deduce_layout(const TensorLayout& src, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);
     MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
             const TensorLayout& src, TensorLayout& dst, const Param& p);
@@ -1464,7 +1467,7 @@ public:
             const TensorLayout& m_t, const TensorLayout& v_t,
             const TensorLayout& new_param) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& m_t_1, const TensorLayout& v_t_1,
             const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t,
             TensorLayout& v_t, TensorLayout& new_param);
@@ -27,7 +27,8 @@ public:
             _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
             _megdnn_workspace workspace) = 0;
     MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C);
-    void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0;
@@ -64,7 +65,8 @@ public:
             _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
             _megdnn_workspace workspace) = 0;
     MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C);
-    void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0;
@@ -224,9 +224,9 @@ public:
             const TensorLayout& src_layout, _megdnn_tensor_in filter,
             const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
             _megdnn_workspace workspace) = 0;
-    void deduce_dtype(DType src, DType filter, DType& dst);
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType src, DType filter, DType& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst);
     /**
@@ -300,7 +300,7 @@ public:
             const TensorLayout& grad) = 0;
     MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType filter, DType diff, DType& grad);
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad);
     static Algorithm::OprType get_opr_type() {
@@ -378,6 +378,12 @@ public:
             const PreprocessedFilter* preprocessed_filter,
             _megdnn_workspace workspace) = 0;
+    MGE_WIN_DECLSPEC_FUC void exec(
+            _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
+            _megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
+        exec(src, filter, bias, z, dst, nullptr, workspace);
+    }
+
     /**
      * \brief execute weight preprocessing, read weights form filter and bias,
      * write to preprocessed_filter after preprocessed.
@@ -390,8 +396,9 @@ public:
             _megdnn_tensor_in bias, const TensorLayout& z_layout,
             const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
             _megdnn_workspace workspace) = 0;
-    void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst);
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_dtype(
+            DType src, DType filter, DType bias, DType z, DType& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& filter,
             const TensorLayout& bias, const TensorLayout& z, TensorLayout& dst);
@@ -775,7 +782,7 @@ protected:
     void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
 
 public:
-    MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
+    static void deduce_layout_impl(
             const TensorLayout& src, const Param& param, TensorLayout& dst);
 };
@@ -791,7 +798,7 @@ public:
     virtual void exec(
             _megdnn_tensor_in src, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(const TensorLayout& src, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& src, const TensorLayout& dst) = 0;
@@ -1253,7 +1260,7 @@ public:
     virtual void exec(
             _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& src, const TensorLayout& filter,
@@ -1281,18 +1288,16 @@ public:
      * \param[in] diff (n, oc, od, oh, ow)
      * \param[out] grad (n, ic, id, ih, iw)
      */
-    MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
+    static void deduce_layout_impl(
             const TensorLayout& filter, const TensorLayout& diff, const Param& param,
             TensorLayout& grad);
     virtual void exec(
             _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad,
             _megdnn_workspace workspace) = 0;
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& filter, const TensorLayout& diff,
             const TensorLayout& grad) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad);
     static Algorithm::OprType get_opr_type() {
@@ -1472,7 +1477,7 @@ public:
     virtual void exec(
             _megdnn_tensor_in src, _megdnn_tensor_in rois, _megdnn_tensor_out dst,
             _megdnn_tensor_out index, _megdnn_workspace workspace) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& rois, TensorLayout& dst,
             TensorLayout& index);
     virtual size_t get_workspace_in_bytes(
@@ -1963,7 +1968,7 @@ public:
             _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias,
             _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& data, const TensorLayout& weight,
             const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean,
             TensorLayout& rstd);
@@ -7,7 +7,11 @@ void CheckNonFinite::check_exec(
         const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes) {
     megdnn_assert_contiguous(dst.layout);
     megdnn_assert(srcs.size() > 0);
-    auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst.layout);
+    TensorLayoutArray src_layouts;
+    for (auto&& src : srcs) {
+        src_layouts.push_back(src.layout);
+    }
+    auto required_workspace_in_bytes = get_workspace_in_bytes(src_layouts, dst.layout);
     megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
 }
@@ -11,7 +11,7 @@ size_t CondTake::check_exec_get_size(
             mask.TensorShape::to_string().c_str());
     megdnn_assert(data.is_physical_contiguous() && mask.is_physical_contiguous());
     megdnn_assert(m_param.eps > 0, "eps must be non-negative; got: %g", m_param.eps);
-    megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data));
+    megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data, mask));
     return data.total_nr_elems();
 }
@@ -7,9 +7,9 @@ void LAMBUpdate::deduce_layout(
         const TensorLayout& m_t_1, const TensorLayout& v_t_1,
         const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t,
         TensorLayout& v_t, TensorLayout& new_param) {
-    m_t = TensorLayout(m_t_1);
-    v_t = TensorLayout(v_t_1);
-    new_param = TensorLayout(lamb_param);
+    m_t = m_t_1;
+    v_t = v_t_1;
+    new_param = lamb_param;
     MEGDNN_MARK_USED_VAR(grad);
 }
@@ -26,14 +26,14 @@ size_t CheckNonFiniteImpl::_get_workspace_in_bytes() {
 }
 size_t CheckNonFiniteImpl::get_workspace_in_bytes(
-        const TensorNDArray& srcs, const TensorLayout&) {
+        const TensorLayoutArray& srcs, const TensorLayout&) {
     m_size = 0;
     for (const auto& src : srcs) {
-        m_size += DIVUP(src.layout.total_nr_elems(), total_nr_elems_max);
+        m_size += DIVUP(src.total_nr_elems(), total_nr_elems_max);
     }
-    if (srcs.begin()->layout.dtype == dtype::Float32()) {
+    if (srcs.begin()->dtype == dtype::Float32()) {
         return _get_workspace_in_bytes<dt_float32>();
-    } else if (srcs.begin()->layout.dtype == dtype::Float16()) {
+    } else if (srcs.begin()->dtype == dtype::Float16()) {
         return _get_workspace_in_bytes<dt_float16>();
     } else {
         megdnn_log_warn("only support fp16 and fp32, fallback to fp32");
@@ -19,7 +19,7 @@ public:
     using CheckNonFinite::CheckNonFinite;
     size_t get_workspace_in_bytes(
-            const TensorNDArray& srcs, const TensorLayout& dst) override;
+            const TensorLayoutArray& srcs, const TensorLayout& dst) override;
     bool is_thread_safe() const override { return true; }
@@ -20,7 +20,8 @@ WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) {
             handle()->alignment_requirement()};
 }
-size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) {
+size_t CondTakeImpl::get_workspace_in_bytes(
+        const TensorLayout& data, const TensorLayout&) {
     return make_bundle(data.total_nr_elems()).total_size_in_bytes();
 }
@@ -15,7 +15,8 @@ public:
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
             DynOutMallocPolicyCall malloc_policy) override;
-    size_t get_workspace_in_bytes(const TensorLayout& data) override;
+    size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) override;
 };
 }  // namespace cuda
@@ -6,8 +6,8 @@ namespace megdnn {
 namespace cuda {
 size_t ParamPackConcatImpl::get_workspace_in_bytes(
-        const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) {
-    return sizeof(size_t) * srcs.size();
+        const TensorShape&, const TensorShape& offsets, const TensorShape&) {
+    return sizeof(size_t) * (offsets.shape[0] / 2);
 }
 template <typename T>
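The CUDA hunk above (and the matching ROCm hunk below) now sizes this workspace from the offsets shape instead of srcs.size(), because srcs has collapsed to a single TensorShape. The relationship, as a hedged sketch (variable names are illustrative):

    // offsets stores a (begin, end) pair per packed parameter, so
    // offsets.shape[0] / 2 recovers the parameter count and the kernel still
    // gets one size_t table entry per parameter, as before.
    size_t nr_params = offsets.shape[0] / 2;
    size_t workspace_bytes = sizeof(size_t) * nr_params;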
@@ -12,7 +12,7 @@ public:
             _megdnn_workspace workspace) override;
     size_t get_workspace_in_bytes(
-            const TensorShapeArray& srcs, const TensorShape& table,
+            const TensorShape& srcs, const TensorShape& table,
             const TensorShape& dst) override;
 private:
@@ -13,7 +13,8 @@ public:
     bool is_thread_safe() const override { return true; }
-    size_t get_workspace_in_bytes(const TensorNDArray&, const TensorLayout&) override {
+    size_t get_workspace_in_bytes(
+            const TensorLayoutArray&, const TensorLayout&) override {
         m_size = 0;
         return _get_workspace_in_bytes();
     }
@@ -38,7 +38,8 @@ void copy_data(
 }  // anonymous namespace
-size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) {
+size_t CondTakeImpl::get_workspace_in_bytes(
+        const TensorLayout& data, const TensorLayout&) {
     return (data.total_nr_elems() + 1) * sizeof(dt_int32);
 }
@@ -11,7 +11,8 @@ class CondTakeImpl : public CondTake {
 public:
     using CondTake::CondTake;
-    size_t get_workspace_in_bytes(const TensorLayout& data) override;
+    size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) override;
     Output exec(
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
@@ -11,7 +11,7 @@ public:
             _megdnn_workspace workspace) override;
     size_t get_workspace_in_bytes(
-            const TensorShapeArray&, const TensorShape&, const TensorShape&) override {
+            const TensorShape&, const TensorShape&, const TensorShape&) override {
         return 0;
     }
 };
@@ -7,8 +7,8 @@ namespace megdnn {
 namespace rocm {
 size_t ParamPackConcatImpl::get_workspace_in_bytes(
-        const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) {
-    return sizeof(size_t) * srcs.size();
+        const TensorShape&, const TensorShape& offsets, const TensorShape&) {
+    return sizeof(size_t) * (offsets.shape[0] / 2);
 }
 template <typename T>
@@ -12,7 +12,7 @@ public:
             _megdnn_workspace workspace) override;
     size_t get_workspace_in_bytes(
-            const TensorShapeArray& srcs, const TensorShape& table,
+            const TensorShape& srcs, const TensorShape& table,
             const TensorShape& dst) override;
 private:
@@ -71,7 +71,7 @@ CondTakeTestcase::Result CondTakeTestcase::run(CondTake* opr) {
     opr->param() = m_param;
     DynOutMallocPolicyImpl malloc_policy(handle);
-    auto workspace_size = opr->get_workspace_in_bytes(data->layout);
+    auto workspace_size = opr->get_workspace_in_bytes(data->layout, mask->layout);
     auto workspace_ptr = malloc_policy.alloc_workspace(workspace_size, nullptr);
     auto result = opr->exec(
             *data, *mask, {(dt_byte*)workspace_ptr, workspace_size}, &malloc_policy);
@@ -205,9 +205,14 @@ struct OprProxy<CheckNonFinite> {
         auto inps = tensors;
         inps.pop_back();
+        TensorLayoutArray inp_layouts(inps.size());
+        std::transform(
+                inps.begin(), inps.end(), inp_layouts.begin(),
+                [](const TensorND& tensor) { return tensor.layout; });
         WorkspaceWrapper W(
                 opr->handle(),
-                opr->get_workspace_in_bytes(inps, tensors.back().layout));
+                opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout));
         opr->exec(inps, tensors.back(), W.workspace());
     }
 };
@@ -95,7 +95,7 @@ void test_param_pack_concat(
     test::WorkspaceWrapper workspace(
             handle,
-            concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size}));
+            concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size}));
     TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32()));
     concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace());
@@ -97,7 +97,7 @@ void test_param_pack_concat(
     test::WorkspaceWrapper workspace(
             handle,
-            concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size}));
+            concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size}));
     TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32()));
     concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace());
@@ -9,11 +9,8 @@ BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) {
     blob = in_blob;
     DeviceTensorStorage d_storage;
     d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage);
-
     h_storage = HostTensorStorage(blob->m_comp_node);
-
     h_storage.ensure_size(blob->m_size);
-
     h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size);
 }
@@ -30,65 +27,36 @@ void BlobManagerImpl::unregister_blob(OwnedBlob* blob) {
 }
 
 void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) {
-    if (custom_allocator) {
-        blob->m_storage = custom_allocator(blob->m_comp_node, size);
+    if (m_custom_allocator) {
+        blob->m_storage = m_custom_allocator(blob->m_comp_node, size);
         return;
     }
     // try alloc
-    MGB_TRY { alloc_direct(blob, size); }
     // if fail, try defrag, alloc again
-    MGB_CATCH(MemAllocError&, {
+    if (!try_alloc_direct(blob, size)) {
        mgb_log_warn("memory allocation failed for blob; try defragmenting");
        defrag(blob->m_comp_node);
        alloc_direct(blob, size);
-    });
+    }
 }
 
 void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) {
-    DeviceTensorStorage storage(blob->m_comp_node);
     mgb_assert(blob->m_comp_node.valid());
+    DeviceTensorStorage storage(blob->m_comp_node);
     storage.ensure_size(size);
     blob->m_storage = storage.raw_storage();
 }
 
-DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag(
-        CompNode cn, TensorLayout& layout) {
-    DeviceTensorND dev_tensor;
-    if (custom_allocator) {
-        DeviceTensorStorage storage(cn);
-        size_t sz = layout.dtype.size(layout.total_nr_elems());
-        storage.reset(cn, sz, custom_allocator(cn, sz));
-        dev_tensor.reset(storage, layout);
-        return dev_tensor;
-    }
-    MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
-    MGB_CATCH(MemAllocError&, {
-        mgb_log_warn("memory allocation failed for workspace; try defragmenting");
-        defrag(cn);
-        dev_tensor = alloc_workspace(cn, layout);
-    });
-    return dev_tensor;
-};
-
-DeviceTensorND BlobManagerImpl::alloc_workspace(CompNode cn, TensorLayout layout) {
-    DeviceTensorStorage storage(cn);
-    storage.ensure_size(layout.dtype.size(layout.total_nr_elems()));
-    DeviceTensorND dev_tensor;
-    dev_tensor.reset(storage, layout);
-    return dev_tensor;
-}
-
 void BlobManagerImpl::set_allocator(allocator_t allocator) {
-    custom_allocator = allocator;
+    m_custom_allocator = allocator;
 }
 
 void BlobManagerImpl::defrag(const CompNode& cn) {
-    BlobSetWithMux* blobs_set_ptr;
-    {
+    auto& blobs_set_ptr = ([&]() -> auto& {
         MGB_LOCK_GUARD(m_mtx);
-        blobs_set_ptr = &m_comp2blobs_map[cn];
-    }
-    MGB_LOCK_GUARD(blobs_set_ptr->mtx);
+        return m_comp2blobs_map[cn];
+    })();
+    MGB_LOCK_GUARD(blobs_set_ptr.mtx);
     std::vector<BlobData> blob_data_arrary;
     std::set<Blob::RawStorage> storage_set;
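alloc_with_defrag (and, further down, the interpreter's alloc_tensor_with_evict) now relies on a try_alloc_direct helper whose definition is not part of this excerpt; presumably it wraps alloc_direct and converts an allocation failure into a bool, roughly:

    // assumed shape of the helper (not shown in this diff)
    bool BlobManager::try_alloc_direct(OwnedBlob* blob, size_t size) {
        MGB_TRY { alloc_direct(blob, size); }
        MGB_CATCH(MemAllocError&, { return false; });
        return true;
    }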
@@ -96,7 +64,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
     size_t tot_sz = 0;
 
     // copy to HostTensorStorage, and release
-    for (auto i : blobs_set_ptr->blobs_set) {
+    for (auto i : blobs_set_ptr.blobs_set) {
         // skip if blob do not have m_storage
         if (!i->m_storage)
             continue;
@@ -153,9 +121,6 @@ struct BlobManagerStub : BlobManager {
     void alloc_with_defrag(OwnedBlob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) {
-        mgb_assert(0, "prohibited after global variable destruction");
-    };
     void register_blob(OwnedBlob* blob) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
@@ -163,7 +128,7 @@ struct BlobManagerStub : BlobManager {
     void defrag(const CompNode& cn) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    virtual void set_allocator(allocator_t allocator) {
+    void set_allocator(allocator_t allocator) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
 };
@@ -27,27 +27,21 @@ class BlobManagerImpl final : public BlobManager {
     std::mutex m_mtx;
     CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map;
-    void defrag(const CompNode& cn) override;
+    BlobManager::allocator_t m_custom_allocator;
 
     void alloc_direct(OwnedBlob* blob, size_t size) override;
-    DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout);
-    BlobManager::allocator_t custom_allocator;
 
 public:
     static BlobManager* inst();
     void alloc_with_defrag(OwnedBlob* blob, size_t size) override;
-    DeviceTensorND alloc_workspace_with_defrag(
-            CompNode cn, TensorLayout& layout) override;
     void register_blob(OwnedBlob* blob) override;
     void unregister_blob(OwnedBlob* blob) override;
+    void defrag(const CompNode& cn) override;
     void set_allocator(allocator_t allocator) override;
 };
@@ -1,79 +1,331 @@
+#pragma once
+#include <optional>
+#include <type_traits>
+#include "algo_chooser.h"
 #include "megbrain/comp_node.h"
 #include "megbrain/comp_node_env.h"
+#include "megbrain/imperative/blob_manager.h"
 #include "megbrain/imperative/physical_tensor.h"
+#include "megbrain/imperative/utils/helper.h"
+#include "megbrain/imperative/utils/platform.h"
 #include "megbrain/rdnn/management.h"
-using namespace megdnn;
+#include "megdnn/basic_types.h"
 
 namespace mgb {
 namespace imperative {
 
 /*!
- * \brief A struct for safely calling DNN oprs
- * In some cases, op may be released before the complete of the execution
- * This destructor will prevent this
+ * /brief Helps deduce layout and dtype
  */
 template <typename Opr>
-struct DnnOprCaller {
-    CompNode cn;
-    DeviceTensorND dev_tensor;
-    Workspace workspace;
-    mgb::opr::intl::UniqPtrWithCN<Opr> op;
-
-    DnnOprCaller(CompNode cn) : cn(cn), op(std::move(create_operator(cn))) {}
-
-    static mgb::opr::intl::UniqPtrWithCN<Opr> create_operator(CompNode cn) {
-        return mgb::opr::intl::create_megdnn_opr<Opr>(cn);
+class DnnOprDeducer {
+private:
+    Opr* m_opr;
+
+public:
+    DnnOprDeducer(Opr* opr) : m_opr(opr) { mgb_assert(opr); }
+
+    // FIXME: maybe in-place style deduction works better
+    template <typename... TArgs>
+    TensorLayout deduce_layout(TArgs&&... args) {
+        static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
+        TensorLayout output_layout;
+        m_opr->deduce_layout(args..., output_layout);
+        return output_layout;
     }
 
-    Workspace create_workspace(size_t sz) {
-        if (workspace.raw_ptr) {
-            mgb_throw(MegBrainError, "workspace should not be applicated many times");
-        }
-        if (sz) {
-            TensorLayout layout({sz}, dtype::Byte());
-            dev_tensor = Tensor::make(layout, cn)->dev_tensor();
-            workspace = megdnn::Workspace(
-                    dev_tensor.raw_ptr(), dev_tensor.storage().size());
+    template <typename... TArgs>
+    TensorLayout deduce_layout_fallible(TArgs&&... args) {
+        static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
+        TensorLayout output_layout;
+        bool success = (args.ndim * ...) > 0;
+        if (success) {
+            m_opr->deduce_layout(args..., output_layout);
+        } else {
+            m_opr->deduce_dtype(args.dtype..., output_layout.dtype);
         }
-        return workspace;
+        return output_layout;
     }
 
-    ~DnnOprCaller() {
+    template <size_t nr_outputs, typename... TArgs>
+    std::array<TensorLayout, nr_outputs> deduce_layouts(TArgs&&... args) {
+        static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
+        std::array<TensorLayout, nr_outputs> layouts;
+        std::apply(
+                [&](auto&&... outputs) { m_opr->deduce_layout(args..., outputs...); },
+                layouts);
+        return layouts;
+    }
+};
+
+/*!
+ * /brief Declare an abstract operator and initialize it's param
+ */
+template <typename Opr>
+class DnnOprStub {
+private:
+    // TODO: make opr concrete
+    std::aligned_storage_t<sizeof(Opr), alignof(Opr)> m_storage;
+
+    using Param = typename Opr::Param;
+
+private:
+    DnnOprStub() { new (&param()) Param(); }
+
+public:
+    DnnOprStub(const Param& param) { this->param() = param; }
+
+    // undefined behavior
+    Opr& opr() { return *reinterpret_cast<Opr*>(&m_storage); }
+
+    auto& param() { return opr().param(); }
+    auto& param() const { return opr().param(); }
+
+    ~DnnOprStub() { param().~Param(); }
+};
+
+/*!
+ * /brief Deduce layout without create concrete opr
+ */
+template <typename Opr>
+class DnnOprHelper : public DnnOprStub<Opr>, public DnnOprDeducer<Opr> {
+private:
+    using Stub = DnnOprStub<Opr>;
+    using Deducer = DnnOprDeducer<Opr>;
+
+public:
+    DnnOprHelper(const typename Opr::Param& param)
+            : Stub(param), Deducer(&Stub::opr()) {}
+};
+
+// hold a concrete operator in given comp_node
+template <typename Opr>
+class DnnOprHolder {
+private:
+    CompNode m_comp_node;
+    opr::intl::UniqPtrWithCN<Opr> m_opr =
+            opr::intl::create_megdnn_opr<Opr>(m_comp_node);
+
+public:
+    DnnOprHolder(CompNode comp_node) : m_comp_node(comp_node) {}
+
+    auto& op() { return m_opr; }
+    auto comp_node() { return m_comp_node; }
+    auto& param() { return m_opr->param(); }
+    auto& param() const { return m_opr->param(); }
+
+    ~DnnOprHolder() {
         using DT = CompNode::DeviceType;
-        if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) {
-            CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
-                    [p = op.release()] { delete p; });
+        if (m_comp_node.device_type() == DT::CPU &&
+            m_comp_node != CompNode::default_cpu()) {
+            CompNodeEnv::from_comp_node(m_comp_node)
+                    .cpu_env()
+                    .dispatch([p = m_opr.release()] { delete p; });
+        }
+    }
+};
+
+/*!
+ * /brief Prevent binary float
+ */
+class DnnOprCallerBase {
+protected:
+    static auto&& get_layout(const megdnn::TensorND& tensor) { return tensor.layout; }
+
+    static auto get_layout(const megdnn::TensorNDArray& tensors) {
+        SmallVector<TensorLayout> layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
         }
+        return layouts;
     }
 };
-template <size_t OSize>
-class MegDNNDynOutMallocImpl final : public megdnn::DynOutMallocPolicy {
-    using Output = std::array<TensorPtr, OSize>;
+/*!
+ * \brief A struct for safely calling DNN oprs
+ *
+ * In some cases, op may be released before the complete of the execution
+ * This destructor will prevent this
+ */
+template <typename Opr>
+class DnnOprCaller final : public DnnOprHolder<Opr>,
+                           public DnnOprDeducer<Opr>,
+                           public DnnOprCallerBase {
+private:
+    using Holder = DnnOprHolder<Opr>;
+    using Deducer = DnnOprDeducer<Opr>;
+    using Base = DnnOprCallerBase;
 
-    CompNode m_cn;
-    Output m_out;
+    std::optional<DnnTensorND> m_workspace;
+    std::optional<megdnn::param::ExecutionPolicy> m_policy;
 
+    megdnn::Workspace create_workspace(size_t sz) {
+        mgb_assert(
+                !m_workspace, "workspace asked more than once by op: %s",
+                demangled_typename<Opr>());
+        dt_byte* ptr = nullptr;
+        if (sz) {
+            TensorLayout layout({sz}, dtype::Byte());
+            m_workspace.emplace(
+                    Tensor::make(layout, Holder::comp_node())->dnn_tensor());
+            ptr = reinterpret_cast<dt_byte*>(m_workspace->raw_ptr());
+        }
+        return {ptr, sz};
+    }
+
 public:
-    MegDNNDynOutMallocImpl(CompNode cn) : m_cn{cn} {}
-
-    megdnn::TensorND alloc_output(
-            size_t id, DType dtype, const TensorShape& shape,
-            void* user_data) override {
-        TensorLayout m_layout(shape, dtype);
-        m_out[id] = Tensor::make(m_layout, m_cn);
-        return m_out[id]->dev_tensor().as_megdnn();
+    using Param = typename Opr::Param;
+
+    DnnOprCaller(CompNode cn) : Holder(cn), Deducer(Holder::op().get()) {}
+
+    DnnOprCaller(CompNode cn, const Param& param) : DnnOprCaller(cn) {
+        Holder::param() = param;
+    }
+
+    DnnOprCaller(CompNode cn, const Param& param, megdnn::param::ExecutionPolicy policy)
+            : DnnOprCaller(cn, param) {
+        m_policy.emplace(policy);
     }
 
-    void* alloc_workspace(size_t sz, void* user_data) override {
-        return m_cn.alloc_device(sz);
+    /**
+     * /brief Convert TensorPtr args to megdnn::TensorND and call f
+     *
+     */
+    template <typename TFunctor, typename... TArgs>
+    auto call_dnn(TFunctor&& f, TArgs&&... args) {
+        std::optional<SmallVector<std::shared_ptr<dt_byte>>> input_ptrs;
+        // recursive convert:
+        // 1. TensorPtr to DnnTensorND (subclass of megdnn::TensorND) ;
+        // 2. DeviceTensorND, HostTensorND to megdnn::TensorND ;
+        // 3. SmallVector of above to SmallVector<megdnn::TensorND> .
+        auto to_dnn = [&](auto&& arg, auto&& to_dnn) {
+            using T = decltype(arg);
+            if constexpr (std::is_convertible_v<T, TensorPtr>) {
+                return arg->dnn_tensor();
+            } else if constexpr (
+                    std::is_convertible_v<T, DeviceTensorND> ||
+                    std::is_convertible_v<T, HostTensorND>) {
+                return arg.as_megdnn();
+            } else if constexpr (
+                    std::is_convertible_v<T, megdnn::TensorND> ||
+                    std::is_convertible_v<T, SmallVector<megdnn::TensorND>>) {
+                return std::forward<T>(arg);
+            } else if constexpr (is_small_vector_v<std::decay_t<T>>) {
+                using TItem = std::decay_t<decltype(to_dnn(arg[0], to_dnn))>;
+                SmallVector<megdnn::TensorND> dnn_tensors;
+                for (auto&& tensor : arg) {
+                    if constexpr (std::is_same_v<TItem, DnnTensorND>) {
+                        if (!input_ptrs) {
+                            input_ptrs.emplace();
+                        }
+                        auto dnn_tensor = to_dnn(tensor, to_dnn);
+                        input_ptrs->push_back(std::move(dnn_tensor.reference));
+                        dnn_tensors.push_back(std::move(dnn_tensor));
+                    } else if constexpr (std::is_same_v<TItem, megdnn::TensorND>) {
+                        dnn_tensors.push_back(to_dnn(tensor, to_dnn));
+                    } else {
+                        static_assert(!std::is_same_v<TItem, TItem>);
+                    }
+                }
+                return dnn_tensors;
+            } else {
+                static_assert(!std::is_same_v<T, T>);
+            }
+        };
+        return f(to_dnn(std::forward<TArgs>(args), to_dnn)...);
     }
 
-    void free_workspace(void* ptr, void* user_data) override { m_cn.free_device(ptr); }
+    // common execution (opr->exec(inputs..., outputs...))
+    template <typename... TArgs>
+    void exec(TArgs&&... args) {
+        call_dnn(
+                [this](auto&&... args) {
+                    Holder::op()->exec(std::forward<decltype(args)>(args)...);
+                },
+                std::forward<TArgs>(args)...);
+    }
+
+    // execution fastrun opr
+    // (opr->exec(inputs..., outputs..., create_ws(setup_algo(...))))
+    template <typename... TArgs>
+    void exec_fastrun(TArgs&&... args) {
+        call_dnn(
+                [&](auto&&... args) {
+                    using FixedTensorLayouts =
+                            typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts;
+                    SmallVector<megdnn::TensorND> dnn_inputs = {args...};
+                    mgb_assert(m_policy, "policy not set");
+                    size_t workspace_size = setup_algo<Opr>(
+                            FixedTensorLayouts{args.layout...}, Holder::op().get(), 0,
+                            false, false, Holder::comp_node(), *m_policy, false,
+                            &dnn_inputs);
+                    Holder::op()->exec(
+                            std::forward<decltype(args)>(args)...,
+                            create_workspace(workspace_size));
+                },
+                std::forward<TArgs>(args)...);
+    }
+
+    // execute with fixed workspace
+    // (opr->exec(input..., outputs..., create_ws(get_workspace_in_bytes(...))))
+    template <typename... TArgs>
+    void exec_with_ws(TArgs&&... args) {
+        call_dnn(
+                [&](auto&&... args) {
+                    size_t workspace_size =
+                            Holder::op()->get_workspace_in_bytes(get_layout(args)...);
+                    Holder::op()->exec(
+                            std::forward<decltype(args)>(args)...,
+                            create_workspace(workspace_size));
+                },
+                std::forward<TArgs>(args)...);
+    }
 
-    TensorPtr at(size_t id) { return m_out[id]; }
+    // execute dynamic out opr
+    // (opr->exec(inputs..., outputs... create_ws(get_workspace_in_bytes(...)), alloc))
+    template <size_t nr_out, typename... TArgs>
+    auto exec_dynout(TArgs&&... args) {
+        struct Alloc final : public megdnn::DynOutMallocPolicy {
+            CompNode comp_node;
+            std::array<TensorPtr, nr_out> output_tensors;
+            std::array<std::optional<DnnTensorND>, nr_out> output_dnn_tensors;
+
+        public:
+            Alloc(CompNode comp_node) : comp_node(comp_node) {}
+
+            megdnn::TensorND alloc_output(
+                    size_t id, DType dtype, const TensorShape& shape,
+                    void* user_data) override {
+                TensorLayout layout(shape, dtype);
+                output_tensors[id] = Tensor::make(layout, comp_node);
+                output_dnn_tensors[id].emplace(
+                        output_tensors[id]->dnn_tensor());  // pin output
+                return *output_dnn_tensors[id];
+            }
+
+            void* alloc_workspace(size_t sz, void* user_data) override {
+                mgb_assert(false);
+            }
+
+            void free_workspace(void* ptr, void* user_data) override {
+                mgb_assert(false);
+            }
+        } alloc{Holder::comp_node()};
+        call_dnn(
+                [&](auto&&... args) {
+                    size_t workspace_size =
+                            Holder::op()->get_workspace_in_bytes(get_layout(args)...);
+                    Holder::op()->exec(
+                            std::forward<decltype(args)>(args)...,
+                            create_workspace(workspace_size), &alloc);
+                },
+                std::forward<TArgs>(args)...);
+        return alloc.output_tensors;
+    }
 };
 
 }  // namespace imperative
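For orientation, the new helpers are used by the call sites rewritten later in this patch roughly as follows (a condensed sketch drawn from those hunks, not additional patch content; the tensor and param variables are assumed to exist):

    // deduce an output layout without allocating a device opr
    DnnOprHelper<megdnn::Pooling> pool_helper(pooling_param);
    TensorLayout out_layout = pool_helper.deduce_layout(src_layout);

    // run an opr through fastrun algo selection with an auto-sized workspace
    DnnOprCaller<megdnn::Pooling> pooling(cn, pooling_param, megdnn::param::ExecutionPolicy{});
    pooling.exec_fastrun(src, dst);  // src/dst are TensorPtr

    // run an opr whose workspace comes from get_workspace_in_bytes
    DnnOprCaller<megdnn::BN> bn(cn, bn_param);
    bn.exec_with_ws(x, scale, bias, mean, variance, save_mean, save_variance, reserve, y);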
@@ -605,6 +605,7 @@ TensorInfo* ChannelImpl::alloc() {
 void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc&& desc) {
     m_valid_handle.insert(reinterpret_cast<Handle>(info));
     MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name);
+    mgb_assert(desc.comp_node.valid(), "comp_node invalid");
     info->status = TensorInfo::Allocated;
     info->desc = std::move(desc);
 }
@@ -831,6 +832,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
             output_descs.push_back(i->desc);
         }
     } else {
+        // i may be null
         validated = false;
     }
     // Here std::move is REQUIRED for removing duplicated references.
@@ -1064,17 +1066,16 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
     if (in_worker) {
         reserve_size(x->size());
     }
-    MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); }
-    MGB_CATCH(MemAllocError&, {
+    if (!BlobManager::inst()->try_alloc_direct(x, x->size())) {
         bool suc = false;
         if (in_worker) {
             while (!suc) {
                 if (!auto_evict(1)) {
                     break;
                 }
-                MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); }
-                MGB_CATCH(MemAllocError&, { continue; });
-                suc = true;
+                if (BlobManager::inst()->try_alloc_direct(x, x->size())) {
+                    suc = true;
+                }
             }
         }
         if (!suc) {
@@ -1086,9 +1087,11 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
             imperative_log_profile_begin("defrag");
             BlobManager::inst()->defrag(x->comp_node());
             imperative_log_profile_end("defrag");
-            BlobManager::inst()->alloc_direct(x, x->size());
+            mgb_assert(
+                    BlobManager::inst()->try_alloc_direct(x, x->size()),
+                    "allocation failed after defrag");
         }
-    });
+    }
     set_log_level(pre_level);
 }
@@ -75,13 +75,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
 SmallVector<TensorPtr> apply_on_physical_tensor(
         const OpDef& def, const SmallVector<TensorPtr>& inputs,
         SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
-    auto&& pool = static_cast<const AdaptivePooling&>(def);
+    auto&& pooling = def.cast_final_safe<AdaptivePooling>();
     auto&& cn = inputs[0]->comp_node();
-    using TensorND = megdnn::TensorND;
     auto&& src_layout = inputs[0]->layout();
-    TensorLayout dst_layout = output_descs[0].layout;
-    auto param_format = pool.format;
+    TensorLayout dst_layout{inputs[0]->dtype()};
+    auto param_format = pooling.format;
     if (!validated) {
         dst_layout.ndim = src_layout.ndim;
         const dt_int32* oshp2d = nullptr;
@@ -91,7 +90,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
             tshp1n = inputs[1]->layout().total_nr_elems() == 1;
             oshp2d = tshp_nd->get_value().proxy_to_default_cpu().ptr<dt_int32>();
         } else {
-            oshp2d = pool.shape.data();
+            oshp2d = pooling.shape.data();
         }
         if (param_format == opr::AdaptivePooling::Param::Format::NCHW) {
             dst_layout[0] = src_layout[0];
@@ -108,15 +107,17 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
                     MegBrainError, "AdaptivePooling only support NCHW or NHWC format");
         }
         dst_layout.init_contiguous_stride();
+    } else {
+        dst_layout = output_descs[0].layout;
     }
 
     size_t IH, IW, OH, OW;
-    if (param_format == param::AdaptivePooling::Format::NCHW) {
+    if (param_format == megdnn::param::AdaptivePooling::Format::NCHW) {
         IH = src_layout[2];
         IW = src_layout[3];
         OH = dst_layout[2];
         OW = dst_layout[3];
-    } else if (param_format == param::AdaptivePooling::Format::NHWC) {
+    } else if (param_format == megdnn::param::AdaptivePooling::Format::NHWC) {
         IH = src_layout[1];
         IW = src_layout[2];
         OH = dst_layout[1];
@@ -124,26 +125,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     } else {
         mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format");
     }
-    DnnOprCaller<megdnn::Pooling> dnn_opr(cn);
-    auto&& param = dnn_opr.op->param();
-    param.mode = pool.mode;
-    param.format = pool.format;
+    // adaptive_pooling param to pooling
+    auto&& param = megdnn::Pooling::Param();
+    param.mode = pooling.mode;
+    param.format = pooling.format;
     param.pad_h = param.pad_w = 0;
-    param.stride_h = floor(IH / OH);
-    param.stride_w = floor(IW / OW);
+    param.stride_h = IH / OH;
+    param.stride_w = IW / OW;
     param.window_h = IH - (OH - 1) * param.stride_h;
     param.window_w = IW - (OW - 1) * param.stride_w;
-    TensorND src = inputs[0]->dnn_tensor();
+    DnnOprCaller<megdnn::Pooling> dnn_opr(cn, param, megdnn::param::ExecutionPolicy{});
+    auto src = inputs[0];
     auto dst = Tensor::make(dst_layout, cn);
-    size_t sz = setup_algo<megdnn::Pooling>(
-            {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
-            ::megdnn::param::ExecutionPolicy{}, false);
-    auto dnn_wk = dnn_opr.create_workspace(sz);
-    dnn_opr.op->exec(src, dst->dnn_tensor(), dnn_wk);
+    dnn_opr.exec_fastrun(inputs[0], dst);
     return {dst};
 }
@@ -145,79 +145,44 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     auto&& op_def = def.cast_final_safe<BatchNorm>();
     auto&& comp_node = inputs[0]->comp_node();
-    using TensorND = megdnn::TensorND;
-    SmallVector<TensorND> inp_tensornds(inputs.size());
-    for (size_t i = 0; i < inputs.size(); ++i) {
-        inp_tensornds[i] = inputs[i]->dnn_tensor();
-    }
-    DnnOprCaller<megdnn::BN> dnn_opr(comp_node);
-    dnn_opr.op->param() = op_def.param();
-    TensorLayout src_layout = inputs[0]->layout();
-    TensorLayout scale_layout = inputs[1]->layout();
+    DnnOprCaller<megdnn::BN> dnn_opr(comp_node, op_def.param());
+    auto src_layout = inputs[0]->layout();
+    auto scale_layout = inputs[1]->layout();
     bool empty_input = src_layout.is_empty();
     size_t nr_inp = inputs.size();
-    size_t sz = 0, rsz = 0;
-    TensorLayout r_layout({rsz}, dtype::Byte());
-    if (!empty_input) {
-        sz = dnn_opr.op->get_workspace_in_bytes(
-                src_layout, src_layout, src_layout, src_layout, src_layout, src_layout,
-                src_layout, src_layout, src_layout);
-        rsz = dnn_opr.op->get_reserve_in_bytes(src_layout);
-        r_layout = TensorLayout({rsz}, dtype::Byte());
-    }
-    auto dnn_wk = dnn_opr.create_workspace(sz);
-    auto reserve = Tensor::make(r_layout, comp_node);
+    // size_t ws_size = 0, reserve_size = 0;
+    size_t reserve_size =
+            empty_input ? (size_t)0 : dnn_opr.op()->get_reserve_in_bytes(src_layout);
-    // alloc memory
+    // alloc outputs
     auto y = Tensor::make(src_layout, comp_node);
     auto save_mean = Tensor::make(scale_layout, comp_node);
     auto save_variance = Tensor::make(scale_layout, comp_node);
+    auto reserve = Tensor::make(TensorLayout{{reserve_size}, dtype::Byte()}, comp_node);
     if (op_def.fwd_mode == ::megdnn::param::BN::FwdMode::INFERENCE) {
-        if (!empty_input)
-            dnn_opr.op->exec(
-                    inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
-                    inp_tensornds[3], inp_tensornds[4], save_mean->dnn_tensor(),
-                    save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(),
-                    dnn_wk);
+        if (!empty_input) {
+            dnn_opr.exec_with_ws(
+                    inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], save_mean,
+                    save_variance, reserve, y);
+        }
         return {inputs[3], inputs[4], reserve, y};
     } else {
         if (nr_inp == 5) {
             auto mean = Tensor::make(scale_layout, comp_node);
             auto variance = Tensor::make(scale_layout, comp_node);
-            megdnn::RefPtr src_ptr1(
-                    inp_tensornds[3].get_ref_ptr().get_ptr(), inputs[3]->offset());
-            megdnn::RefPtr dst_ptr1(
-                    mean->dev_tensor().storage().get_ref_ptr(),
-                    mean->dev_tensor().storage().offset(), false);
-            comp_node.peer_copy_to_ref(
-                    comp_node, dst_ptr1, src_ptr1, scale_layout.span().high_byte);
-            megdnn::RefPtr src_ptr2(
-                    inp_tensornds[4].get_ref_ptr().get_ptr(), inputs[4]->offset());
-            megdnn::RefPtr dst_ptr2(
-                    variance->dev_tensor().storage().get_ref_ptr(),
-                    variance->dev_tensor().storage().offset(), false);
-            comp_node.peer_copy_to_ref(
-                    comp_node, dst_ptr2, src_ptr2, scale_layout.span().high_byte);
-            if (!empty_input)
-                dnn_opr.op->exec(
-                        inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
-                        mean->dnn_tensor(), variance->dnn_tensor(),
-                        save_mean->dnn_tensor(), save_variance->dnn_tensor(),
-                        reserve->dnn_tensor(), y->dnn_tensor(), dnn_wk);
+            // FIXME
+            mean->dev_tensor().copy_from(inputs[3]->dev_tensor());
+            variance->dev_tensor().copy_from(inputs[4]->dev_tensor());
+            if (!empty_input) {
+                dnn_opr.exec_with_ws(
+                        inputs[0], inputs[1], inputs[2], mean, variance, save_mean,
+                        save_variance, reserve, y);
+            }
             return {mean, variance, save_mean, save_variance, reserve, y};
         }
@@ -227,11 +192,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
auto variance = Tensor::make(m_layout, comp_node); | auto variance = Tensor::make(m_layout, comp_node); | ||||
if (!empty_input) { | if (!empty_input) { | ||||
dnn_opr.op->exec( | |||||
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||||
mean->dnn_tensor(), variance->dnn_tensor(), save_mean->dnn_tensor(), | |||||
save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), | |||||
dnn_wk); | |||||
dnn_opr.exec_with_ws( | |||||
inputs[0], inputs[1], inputs[2], mean, variance, save_mean, | |||||
save_variance, reserve, y); | |||||
} | } | ||||
return {save_mean, save_variance, reserve, y}; | return {save_mean, save_variance, reserve, y}; | ||||
@@ -28,33 +28,26 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
auto&& inp = inputs[0]; | auto&& inp = inputs[0]; | ||||
auto&& msk = inputs[1]; | auto&& msk = inputs[1]; | ||||
SmallVector<TensorPtr> out; | |||||
mgb_assert( | mgb_assert( | ||||
inp->layout().eq_shape(msk->layout()), | inp->layout().eq_shape(msk->layout()), | ||||
"input shape does not match mask shape"); | "input shape does not match mask shape"); | ||||
mgb_assert( | mgb_assert( | ||||
msk->get_value().dtype().enumv() == DTypeEnum::Bool, | msk->get_value().dtype().enumv() == DTypeEnum::Bool, | ||||
"mask dtype must be bool"); | "mask dtype must be bool"); | ||||
MegDNNDynOutMallocImpl<2> policy{inp->comp_node()}; | |||||
if (inp->layout().is_empty()) { | if (inp->layout().is_empty()) { | ||||
// empty tensor | // empty tensor | ||||
policy.alloc_output(0, inp->layout().dtype, {0}, nullptr); | |||||
policy.alloc_output(1, dtype::Int32(), {0}, nullptr); | |||||
return { | |||||
Tensor::make(TensorLayout{{0}, inp->dtype()}, inp->comp_node()), | |||||
Tensor::make(TensorLayout{{0}, dtype::Int32()}, inp->comp_node()), | |||||
}; | |||||
} else { | } else { | ||||
DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node()); | |||||
dnn_op.op->param().val = 1; | |||||
size_t sz = dnn_op.op->get_workspace_in_bytes(inp->layout()); | |||||
auto dnn_workspace = dnn_op.create_workspace(sz); | |||||
dnn_op.op->exec( | |||||
inp->dev_tensor().as_megdnn(), msk->dev_tensor().as_megdnn(), | |||||
dnn_workspace, &policy); | |||||
// maybe we need to split CondTake | |||||
megdnn::CondTake::Param param; | |||||
param.val = 1; | |||||
DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node(), param); | |||||
auto&& [out0, out1] = dnn_op.exec_dynout<2>(inp, msk); | |||||
return {out0, out1}; | |||||
} | } | ||||
out.push_back(policy.at(0)); | |||||
out.push_back(policy.at(1)); | |||||
return out; | |||||
} | } | ||||
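CondTake produces outputs whose sizes are only known after execution, which is why the old code routed allocation through MegDNNDynOutMallocImpl; exec_dynout<2>, as used above, lets the same DnnOprCaller allocate both outputs. A small sketch grounded in the hunk above (names and includes assumed from the surrounding file):

// Sketch only: dynamic-output execution for CondTake.
std::pair<TensorPtr, TensorPtr> cond_take_sketch(
        const TensorPtr& inp, const TensorPtr& msk) {
    megdnn::CondTake::Param param;
    param.val = 1;  // keep elements where the mask is true
    DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node(), param);
    // two outputs (values, indices) whose shapes depend on the mask contents
    auto&& [values, indices] = dnn_op.exec_dynout<2>(inp, msk);
    return {values, indices};
}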
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
@@ -8,14 +8,7 @@ | |||||
namespace mgb { | namespace mgb { | ||||
namespace imperative { | namespace imperative { | ||||
namespace { | namespace { | ||||
size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) { | |||||
mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt); | |||||
return (inp + 2 * pad - flt) / stride + 1; | |||||
} | |||||
namespace convolution { | namespace convolution { | ||||
std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) { | std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) { | ||||
auto* node = &node_->cast_final_safe<opr::Convolution>(); | auto* node = &node_->cast_final_safe<opr::Convolution>(); | ||||
@@ -29,131 +22,23 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
inputs[0], inputs[1], conv.param(), conv.policy(), config); | inputs[0], inputs[1], conv.param(), conv.policy(), config); | ||||
} | } | ||||
TensorLayout do_shape_infer( | |||||
const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||||
auto&& conv = static_cast<const Convolution&>(def); | |||||
using Param = ::megdnn::param::Convolution; | |||||
auto img_ndim = src_ndim - 2; | |||||
mgb_assert( | |||||
img_ndim == 2, | |||||
"only 2D convolution is supported, and input should be 4-dim; " | |||||
"got input dim = %zu", | |||||
src_ndim); | |||||
size_t group = 1; | |||||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||||
if (conv.sparse == Param::Sparse::DENSE) { | |||||
mgb_assert( | |||||
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||||
"bad filter ndim for dense convolution: " | |||||
"spatial_ndim=%zu filter_ndim=%zu", | |||||
img_ndim, filter.ndim); | |||||
group = 1; | |||||
flt_start = 0; | |||||
} else { // Param::Sparse::GROUP | |||||
mgb_assert( | |||||
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||||
"bad filter ndim for group convolution: " | |||||
"spatial_ndim=%zu filter_ndim=%zu", | |||||
img_ndim, filter.ndim); | |||||
// grp, oc, ic, dims[] | |||||
group = filter[0]; | |||||
flt_start = 1; | |||||
} | |||||
uint32_t ic_block_size = 1, oc_block_size = 1; | |||||
size_t src_or_dst_c_pos = 0; | |||||
size_t src_or_dst_spatial_start = 0; | |||||
if (conv.format == Param::Format::NCHW) { | |||||
// filter should be (oc, ic, fh, fw) | |||||
flt_spatial_start = 2; | |||||
ocpg_pos = 0; | |||||
icpg_pos = 1; | |||||
src_or_dst_c_pos = 1; | |||||
src_or_dst_spatial_start = 2; | |||||
} else { // Param::Format::NHWC | |||||
// filter should be (oc, fh, fw, ic) | |||||
flt_spatial_start = 1; | |||||
ocpg_pos = 0; | |||||
icpg_pos = 3; | |||||
src_or_dst_c_pos = 3; | |||||
src_or_dst_spatial_start = 1; | |||||
} | |||||
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||||
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||||
uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||||
dilation[0] = conv.dilate_h; | |||||
dilation[1] = conv.dilate_w; | |||||
stride[0] = conv.stride_h; | |||||
stride[1] = conv.stride_w; | |||||
padding[0] = conv.pad_h; | |||||
padding[1] = conv.pad_w; | |||||
for (size_t i = 0; i < img_ndim; ++i) { | |||||
mgb_assert( | |||||
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||||
dilation[i]); | |||||
dilated_spatial[i] = | |||||
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||||
} | |||||
mgb_assert( | |||||
icpg * group == src[src_or_dst_c_pos], | |||||
"group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||||
"hint: weight may be changed by mistake\n", | |||||
icpg * group, src[src_or_dst_c_pos]); | |||||
TensorLayout dst{src.dtype}; | |||||
dst.ndim = src_ndim; | |||||
dst[0] = src[0]; | |||||
dst[src_or_dst_c_pos] = ocpg * group; | |||||
for (size_t i = 0; i < img_ndim; ++i) { | |||||
dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||||
src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||||
padding[i]); | |||||
} | |||||
dst.init_contiguous_stride(); | |||||
return dst; | |||||
} | |||||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
SmallVector<LogicalTensorDesc> dests(1); | |||||
auto&& desc = dests[0]; | |||||
desc.comp_node = inputs[0].comp_node; | |||||
TensorLayout src = inputs[0].layout; | |||||
TensorLayout filter = inputs[1].layout; | |||||
size_t src_ndim = src.ndim; | |||||
if (src_ndim == 0 || filter.ndim == 0) { | |||||
desc.layout = TensorLayout{{}, src.dtype}; | |||||
return {dests, false}; | |||||
auto&& conv = def.cast_final_safe<Convolution>(); | |||||
DnnOprHelper<megdnn::ConvolutionForward> dnn_opr(conv.param()); | |||||
auto&& data = inputs[0].layout; | |||||
auto&& filter = inputs[1].layout; | |||||
TensorLayout output_layout{data.dtype}; | |||||
if (data.ndim && filter.ndim) { | |||||
// deduce_layout won't override existing dtype | |||||
dnn_opr.opr().deduce_layout(data, filter, output_layout); | |||||
} | } | ||||
desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||||
return {dests, true}; | |||||
return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||||
} | } | ||||
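Shape inference no longer needs a comp node or the hand-written do_shape_infer: DnnOprHelper builds a megdnn operator from the param alone and exposes its deduce_layout/deduce_dtype. The sketch below restates that inference path in isolation (helper name taken from the hunk above; includes assumed from the surrounding file):

// Sketch only: param-only shape/dtype inference via DnnOprHelper.
std::tuple<SmallVector<LogicalTensorDesc>, bool> conv_infer_sketch(
        const Convolution& conv, const SmallVector<LogicalTensorDesc>& inputs) {
    DnnOprHelper<megdnn::ConvolutionForward> dnn_opr(conv.param());
    auto&& data = inputs[0].layout;
    auto&& filter = inputs[1].layout;
    TensorLayout out{data.dtype};    // deduce_layout keeps the preset dtype
    if (data.ndim && filter.ndim) {  // only deduce when both shapes are known
        dnn_opr.opr().deduce_layout(data, filter, out);
    }
    // second element: whether the deduced layout is usable (ndim != 0)
    return {{{out, inputs[0].comp_node}}, out.ndim != 0};
}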
SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||||
// create megdnn opr | |||||
auto&& conv = static_cast<const Convolution&>(def); | |||||
CompNode cn = inputs[0]->comp_node(); | |||||
TensorLayout out_layout = output_descs[0].layout; | |||||
if (!validated) | |||||
out_layout = do_shape_infer( | |||||
def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||||
inputs[1]->layout()); | |||||
using TensorND = megdnn::TensorND; | |||||
SmallVector<TensorND> inp_tensornds(inputs.size() + 2); | |||||
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||||
inp_shapes[i] = inputs[i]->layout(); | |||||
} | |||||
oup_shapes[0] = out_layout; | |||||
DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn); | |||||
auto&& param = dnn_opr.op->param(); | |||||
// Convolution::Param -> ConvBias::Param | |||||
auto conv_bias_param_from_convolution(const Convolution& conv) { | |||||
megdnn::ConvBias::Param param; | |||||
param.pad_h = conv.pad_h; | param.pad_h = conv.pad_h; | ||||
param.pad_w = conv.pad_w; | param.pad_w = conv.pad_w; | ||||
param.stride_h = conv.stride_h; | param.stride_h = conv.stride_h; | ||||
@@ -163,30 +48,37 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
param.sparse = conv.sparse; | param.sparse = conv.sparse; | ||||
param.compute_mode = conv.compute_mode; | param.compute_mode = conv.compute_mode; | ||||
param.format = conv.format; | param.format = conv.format; | ||||
return param; | |||||
} | |||||
// shape infer | |||||
TensorLayout empty_shp({0}, inputs[0]->dtype()); | |||||
empty_shp.ndim = 0; | |||||
auto empty_bias = Tensor::make(empty_shp, cn); | |||||
inp_tensornds[2] = empty_bias->dnn_tensor(); | |||||
inp_tensornds[3] = empty_bias->dnn_tensor(); | |||||
size_t sz = setup_algo<megdnn::ConvBiasForward>( | |||||
{inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, | |||||
dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false, | |||||
&inp_tensornds); | |||||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||||
// create megdnn opr | |||||
auto&& conv = def.cast_final_safe<Convolution>(); | |||||
CompNode cn = inputs[0]->comp_node(); | |||||
auto&& param = conv_bias_param_from_convolution(conv); | |||||
DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn, param, conv.policy()); | |||||
megdnn::TensorND empty_bias; | |||||
empty_bias.layout.dtype = inputs[0]->dtype(); | |||||
empty_bias.layout.ndim = 0; | |||||
auto out_layout = [&] { | |||||
if (validated) { | |||||
return output_descs[0].layout; | |||||
} else { | |||||
TensorLayout out_layout{inputs[0]->dtype()}; | |||||
dnn_opr.op()->deduce_layout( | |||||
inputs[0]->layout(), inputs[1]->layout(), empty_bias.layout, | |||||
empty_bias.layout, out_layout); | |||||
return out_layout; | |||||
} | |||||
}(); | |||||
// alloc memory | // alloc memory | ||||
auto out = Tensor::make(out_layout, cn); | auto out = Tensor::make(out_layout, cn); | ||||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
// execute | |||||
dnn_opr.op->exec( | |||||
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], inp_tensornds[3], | |||||
out->dnn_tensor(), nullptr, dnn_wk); | |||||
dnn_opr.exec_fastrun(inputs[0], inputs[1], empty_bias, empty_bias, out); | |||||
return {out}; | return {out}; | ||||
} | } | ||||
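Forward Convolution is executed through ConvBiasForward with empty bias/z tensors, so fastrun profiles the same operator the graph path uses; conv_bias_param_from_convolution (added in the hunk above) carries the param over. A condensed sketch of that call site, assuming the surrounding file's includes:

// Sketch only: Convolution forward routed through ConvBiasForward.
TensorPtr conv_forward_sketch(
        const Convolution& conv, const TensorPtr& x, const TensorPtr& w,
        const TensorLayout& out_layout) {
    auto cn = x->comp_node();
    // Convolution::Param -> ConvBias::Param (helper defined above)
    auto param = conv_bias_param_from_convolution(conv);
    DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn, param, conv.policy());
    // an ndim == 0 TensorND stands in for the absent bias and z inputs
    megdnn::TensorND empty_bias;
    empty_bias.layout.dtype = x->dtype();
    empty_bias.layout.ndim = 0;
    auto out = Tensor::make(out_layout, cn);
    dnn_opr.exec_fastrun(x, w, empty_bias, empty_bias, out);
    return out;
}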
@@ -243,155 +135,41 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
} | } | ||||
} | } | ||||
TensorLayout convbwd_do_shape_infer( | |||||
const OpDef& def, size_t diff_ndim, TensorLayout filter, TensorLayout diff, | |||||
CompNode cn) { | |||||
auto&& bwd_conv = static_cast<const ConvolutionBackwardData&>(def); | |||||
DnnOprCaller<megdnn::ConvolutionBackwardData> caller(cn); | |||||
auto&& dnn_opr = caller.op; | |||||
using Param = ::megdnn::param::Convolution; | |||||
// using Param1 = ::megdnn::param::ConvolutionBackwardData; | |||||
auto img_ndim = diff_ndim - 2; | |||||
mgb_assert( | |||||
img_ndim == 2, | |||||
"only 2D convolution is supported, and input should be 4-dim; " | |||||
"got input dim = %zu", | |||||
diff_ndim); | |||||
size_t group = 1; | |||||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||||
if (bwd_conv.sparse == Param::Sparse::DENSE) { | |||||
mgb_assert( | |||||
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||||
"bad filter ndim for dense convolution: " | |||||
"spatial_ndim=%zu filter_ndim=%zu", | |||||
img_ndim, filter.ndim); | |||||
group = 1; | |||||
flt_start = 0; | |||||
} else { // Param::Sparse::GROUP | |||||
mgb_assert( | |||||
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||||
"bad filter ndim for group convolution: " | |||||
"spatial_ndim=%zu filter_ndim=%zu", | |||||
img_ndim, filter.ndim); | |||||
// grp, oc, ic, dims[] | |||||
group = filter[0]; | |||||
flt_start = 1; | |||||
} | |||||
uint32_t ic_block_size = 1, oc_block_size = 1; | |||||
size_t src_or_dst_c_pos = 0; | |||||
size_t src_or_dst_spatial_start = 0; | |||||
if (bwd_conv.format == Param::Format::NCHW) { | |||||
// filter should be (oc, ic, fh, fw) | |||||
flt_spatial_start = 2; | |||||
ocpg_pos = 0; | |||||
icpg_pos = 1; | |||||
src_or_dst_c_pos = 1; | |||||
src_or_dst_spatial_start = 2; | |||||
} else { // Param::Format::NHWC | |||||
// filter should be (oc, fh, fw, ic) | |||||
flt_spatial_start = 1; | |||||
ocpg_pos = 0; | |||||
icpg_pos = 3; | |||||
src_or_dst_c_pos = 3; | |||||
src_or_dst_spatial_start = 1; | |||||
} | |||||
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||||
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||||
uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||||
dilation[0] = bwd_conv.dilate_h; | |||||
dilation[1] = bwd_conv.dilate_w; | |||||
stride[0] = bwd_conv.stride_h; | |||||
stride[1] = bwd_conv.stride_w; | |||||
padding[0] = bwd_conv.pad_h; | |||||
padding[1] = bwd_conv.pad_w; | |||||
for (size_t i = 0; i < img_ndim; ++i) { | |||||
mgb_assert( | |||||
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||||
dilation[i]); | |||||
dilated_spatial[i] = | |||||
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||||
} | |||||
mgb_assert( | |||||
ocpg * group == diff[src_or_dst_c_pos], | |||||
"group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||||
"hint: weight may be changed by mistake\n", | |||||
ocpg * group, diff[src_or_dst_c_pos]); | |||||
auto deduce = [](size_t out, size_t filter, size_t stride, size_t pad) { | |||||
auto i = (out - 1) * stride + filter; | |||||
mgb_assert(i > pad * 2); | |||||
return i - pad * 2; | |||||
}; | |||||
DType dst_dtype = bwd_conv.dtype; | |||||
dnn_opr->deduce_dtype(filter.dtype, diff.dtype, dst_dtype); | |||||
TensorLayout dst{dst_dtype}; | |||||
dst.ndim = diff_ndim; | |||||
dst[0] = diff[0]; | |||||
dst[src_or_dst_c_pos] = icpg * group; | |||||
for (size_t i = 0; i < img_ndim; ++i) { | |||||
dst[i + src_or_dst_spatial_start] = | |||||
deduce(diff[i + src_or_dst_spatial_start], dilated_spatial[i], | |||||
stride[i], padding[i]); | |||||
} | |||||
dst.init_contiguous_stride(); | |||||
return dst; | |||||
} | |||||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
SmallVector<LogicalTensorDesc> dests(1); | |||||
auto&& desc = dests[0]; | |||||
desc.comp_node = inputs[0].comp_node; | |||||
TensorLayout filter = inputs[0].layout; | |||||
TensorLayout diff = inputs[1].layout; | |||||
size_t diff_ndim = diff.ndim; | |||||
if (diff_ndim == 0 || filter.ndim == 0) { | |||||
desc.layout = TensorLayout{{}, diff.dtype}; | |||||
return {dests, false}; | |||||
auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||||
DnnOprHelper<megdnn::ConvolutionBackwardData> dnn_opr(convbwd.param()); | |||||
// force set dtype | |||||
auto&& filter = inputs[0].layout; | |||||
auto&& diff = inputs[1].layout; | |||||
TensorLayout output_layout{convbwd.dtype}; | |||||
if (filter.ndim && diff.ndim) { | |||||
// deduce_layout won't override existing dtype | |||||
dnn_opr.opr().deduce_layout(filter, diff, output_layout); | |||||
} | } | ||||
desc.layout = | |||||
convbwd_do_shape_infer(def, diff_ndim, filter, diff, inputs[0].comp_node); | |||||
return {dests, true}; | |||||
return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||||
} | } | ||||
SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
// create megdnn opr | // create megdnn opr | ||||
auto&& convbwd = static_cast<const ConvolutionBackwardData&>(def); | |||||
auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||||
CompNode cn = inputs[0]->comp_node(); | CompNode cn = inputs[0]->comp_node(); | ||||
TensorLayout out_layout = output_descs[0].layout; | |||||
if (!validated) | |||||
out_layout = convbwd_do_shape_infer( | |||||
def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), | |||||
cn); | |||||
DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr( | |||||
cn, convbwd.param(), convbwd.policy()); | |||||
auto out_layout = [&] { | |||||
if (validated) { | |||||
return output_descs[0].layout; | |||||
} else { | |||||
TensorLayout out_layout{inputs[0]->dtype()}; | |||||
dnn_opr.op()->deduce_layout( | |||||
inputs[0]->layout(), inputs[1]->layout(), out_layout); | |||||
return out_layout; | |||||
} | |||||
}(); | |||||
auto out = Tensor::make(out_layout, cn); | auto out = Tensor::make(out_layout, cn); | ||||
using TensorND = megdnn::TensorND; | |||||
SmallVector<TensorND> inp_tensornds(inputs.size()); | |||||
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||||
inp_shapes[i] = inputs[i]->layout(); | |||||
} | |||||
oup_shapes[0] = out_layout; | |||||
DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(cn); | |||||
dnn_opr.op->param() = convbwd.param(); | |||||
size_t sz = setup_algo<megdnn::ConvolutionBackwardData>( | |||||
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||||
false, cn, convbwd.policy(), false, &inp_tensornds); | |||||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
// execute | |||||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||||
return {out}; | return {out}; | ||||
} | } | ||||
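The immediately-invoked lambda above recomputes the output layout only when the cached descriptor was not validated upstream; otherwise the validated layout is reused as-is. The same decision can be written as a small helper (a sketch, not part of the patch; Caller stands for any DnnOprCaller specialization with a two-layout deduce_layout as used above):

// Sketch only: reuse the validated layout, or ask megdnn to deduce it again.
template <typename Caller>
TensorLayout choose_output_layout(
        bool validated, const TensorLayout& cached, Caller& dnn_opr,
        const TensorLayout& a, const TensorLayout& b) {
    return validated ? cached : dnn_opr.deduce_layout(a, b);
}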
@@ -415,149 +193,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); | return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); | ||||
} | } | ||||
TensorLayout do_shape_infer( | |||||
const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||||
auto&& conv = static_cast<const Convolution3D&>(def); | |||||
using Param = ::megdnn::param::Convolution3D; | |||||
auto img_ndim = src_ndim - 2; | |||||
mgb_assert( | |||||
img_ndim == 3, | |||||
"only 3D convolution is supported, and input should be 5-dim; " | |||||
"got input dim = %zu", | |||||
src_ndim); | |||||
size_t group = 1; | |||||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||||
if (conv.sparse == Param::Sparse::DENSE) { | |||||
mgb_assert( | |||||
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||||
"bad filter ndim for dense convolution: " | |||||
"spatial_ndim=%zu filter_ndim=%zu", | |||||
img_ndim, filter.ndim); | |||||
group = 1; | |||||
flt_start = 0; | |||||
} else { // Param::Sparse::GROUP | |||||
mgb_assert( | |||||
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||||
"bad filter ndim for group convolution: " | |||||
"spatial_ndim=%zu filter_ndim=%zu", | |||||
img_ndim, filter.ndim); | |||||
// grp, oc, ic, dims[] | |||||
group = filter[0]; | |||||
flt_start = 1; | |||||
} | |||||
uint32_t ic_block_size = 1, oc_block_size = 1; | |||||
size_t src_or_dst_c_pos = 0; | |||||
size_t src_or_dst_spatial_start = 0; | |||||
if (conv.format == Param::Format::NCDHW) { | |||||
// filter should be (oc, ic, fd, fh, fw) | |||||
flt_spatial_start = 2; | |||||
ocpg_pos = 0; | |||||
icpg_pos = 1; | |||||
src_or_dst_c_pos = 1; | |||||
src_or_dst_spatial_start = 2; | |||||
} else { // Param::Format::NDHWC | |||||
// filter should be (oc, fd, fh, fw, ic) | |||||
flt_spatial_start = 1; | |||||
ocpg_pos = 0; | |||||
icpg_pos = 4; | |||||
src_or_dst_c_pos = 4; | |||||
src_or_dst_spatial_start = 1; | |||||
} | |||||
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||||
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||||
uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3]; | |||||
dilation[0] = conv.dilate_d; | |||||
dilation[1] = conv.dilate_h; | |||||
dilation[2] = conv.dilate_w; | |||||
stride[0] = conv.stride_d; | |||||
stride[1] = conv.stride_h; | |||||
stride[2] = conv.stride_w; | |||||
padding[0] = conv.pad_d; | |||||
padding[1] = conv.pad_h; | |||||
padding[2] = conv.pad_w; | |||||
for (size_t i = 0; i < img_ndim; ++i) { | |||||
mgb_assert( | |||||
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||||
dilation[i]); | |||||
dilated_spatial[i] = | |||||
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||||
} | |||||
mgb_assert( | |||||
icpg * group == src[src_or_dst_c_pos], | |||||
"group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||||
"hint: weight may be changed by mistake\n", | |||||
icpg * group, src[src_or_dst_c_pos]); | |||||
TensorLayout dst{src.dtype}; | |||||
dst.ndim = src_ndim; | |||||
dst[0] = src[0]; | |||||
dst[src_or_dst_c_pos] = ocpg * group; | |||||
for (size_t i = 0; i < img_ndim; ++i) { | |||||
dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||||
src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||||
padding[i]); | |||||
} | |||||
dst.init_contiguous_stride(); | |||||
return dst; | |||||
} | |||||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
SmallVector<LogicalTensorDesc> dests(1); | |||||
auto&& desc = dests[0]; | |||||
desc.comp_node = inputs[0].comp_node; | |||||
auto&& conv = def.cast_final_safe<Convolution3D>(); | |||||
TensorLayout src = inputs[0].layout; | TensorLayout src = inputs[0].layout; | ||||
TensorLayout filter = inputs[1].layout; | TensorLayout filter = inputs[1].layout; | ||||
size_t src_ndim = src.ndim; | |||||
if (src_ndim == 0 || filter.ndim == 0) { | |||||
desc.layout = TensorLayout{{}, src.dtype}; | |||||
return {dests, false}; | |||||
if (src.ndim == 0 || filter.ndim == 0) { | |||||
return {{{TensorLayout{src.dtype}, inputs[0].comp_node}}, false}; | |||||
} | } | ||||
desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||||
return {dests, true}; | |||||
DnnOprHelper<megdnn::Convolution3DForward> dnn_opr(conv.param()); | |||||
auto output = dnn_opr.deduce_layout(src, filter); | |||||
return {{{output, inputs[0].comp_node}}, false}; | |||||
} | } | ||||
SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
// create megdnn opr | // create megdnn opr | ||||
auto&& conv = static_cast<const Convolution3D&>(def); | |||||
TensorLayout out_layout = output_descs[0].layout; | |||||
if (!validated) | |||||
out_layout = do_shape_infer( | |||||
def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||||
inputs[1]->layout()); | |||||
using TensorND = megdnn::TensorND; | |||||
auto&& conv = def.cast_final_safe<Convolution3D>(); | |||||
CompNode cn = inputs[0]->comp_node(); | CompNode cn = inputs[0]->comp_node(); | ||||
SmallVector<TensorND> inp_tensornds(inputs.size()); | |||||
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||||
inp_shapes[i] = inputs[i]->layout(); | |||||
} | |||||
oup_shapes[0] = out_layout; | |||||
DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn); | |||||
dnn_opr.op->param() = conv.param(); | |||||
// shape infer | |||||
size_t sz = setup_algo<megdnn::Convolution3D>( | |||||
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||||
false, cn, conv.policy(), false, &inp_tensornds); | |||||
DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn, conv.param(), conv.policy()); | |||||
auto out_layout = [&] { | |||||
if (validated) { | |||||
return output_descs[0].layout; | |||||
} else { | |||||
return dnn_opr.deduce_layout(inputs[0]->layout(), inputs[1]->layout()); | |||||
} | |||||
}(); | |||||
// alloc memory | // alloc memory | ||||
auto out = Tensor::make(out_layout, cn); | auto out = Tensor::make(out_layout, cn); | ||||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
// execute | |||||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||||
return {out}; | return {out}; | ||||
} | } | ||||
@@ -579,51 +244,38 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
inputs.size() == 2, | inputs.size() == 2, | ||||
"inputs num of conv_transpose3d should be 2 but you give %zu", | "inputs num of conv_transpose3d should be 2 but you give %zu", | ||||
inputs.size()); | inputs.size()); | ||||
auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | ||||
auto&& weight = inputs[0]; | auto&& weight = inputs[0]; | ||||
auto&& diff = inputs[1]; | auto&& diff = inputs[1]; | ||||
auto& cn = weight.comp_node; | |||||
if (weight.layout.ndim == 0 || diff.layout.ndim == 0) { | |||||
return {{{TensorLayout{weight.layout.dtype}, cn, {}}}, false}; | |||||
if (!(weight.layout.ndim && diff.layout.ndim)) { | |||||
return {{{TensorLayout{weight.layout.dtype}, weight.comp_node}}, false}; | |||||
} | } | ||||
TensorLayout oup_layout; | |||||
megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||||
weight.layout, diff.layout, op_def.param(), oup_layout); | |||||
return {{{oup_layout, cn, {}}}, true}; | |||||
DnnOprHelper<megdnn::Convolution3DBackwardData> dnn_opr(op_def.param()); | |||||
auto oup_layout = dnn_opr.deduce_layout(weight.layout, diff.layout); | |||||
return {{{oup_layout, weight.comp_node}}, true}; | |||||
} | } | ||||
SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | |||||
auto&& conv = def.cast_final_safe<Convolution3DBackwardData>(); | |||||
auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
auto&& wlayout = inputs[0]->layout(); | auto&& wlayout = inputs[0]->layout(); | ||||
auto&& dlayout = inputs[1]->layout(); | auto&& dlayout = inputs[1]->layout(); | ||||
DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn); | |||||
auto&& dnn_opr = caller.op; | |||||
dnn_opr->param() = op_def.param(); | |||||
DnnOprCaller<megdnn::Convolution3DBackwardData> dnn_op( | |||||
cn, conv.param(), conv.policy()); | |||||
TensorLayout& oup_layout = output_descs[0].layout; | |||||
if (!validated) { | |||||
megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||||
wlayout, dlayout, op_def.param(), oup_layout); | |||||
} | |||||
auto oup_layout = [&] { | |||||
if (validated) { | |||||
return output_descs[0].layout; | |||||
} else { | |||||
return dnn_op.deduce_layout(wlayout, dlayout); | |||||
} | |||||
}(); | |||||
auto oup = Tensor::make(oup_layout, cn); | auto oup = Tensor::make(oup_layout, cn); | ||||
SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||||
inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||||
inp_tensornds[1] = inputs[1]->dnn_tensor(); | |||||
size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>( | |||||
{wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||||
op_def.policy(), false, &inp_tensornds); | |||||
auto dnn_wk = caller.create_workspace(wk_size); | |||||
dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup->dnn_tensor(), dnn_wk); | |||||
dnn_op.exec_fastrun(inputs[0], inputs[1], oup); | |||||
return {oup}; | return {oup}; | ||||
} | } | ||||
@@ -94,52 +94,44 @@ void apply_on_device_tensornd( | |||||
mgb_assert( | mgb_assert( | ||||
inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", | inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", | ||||
trait.name, trait.arity, inputs.size()); | trait.name, trait.arity, inputs.size()); | ||||
DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node()); | |||||
opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op); | |||||
DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node(), {op_def.mode}); | |||||
opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op()); | |||||
} | } | ||||
SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
auto comp_node = inputs[0]->comp_node(); | auto comp_node = inputs[0]->comp_node(); | ||||
auto dtype = inputs[0]->dtype(); | |||||
using Mode = Elemwise::Mode; | using Mode = Elemwise::Mode; | ||||
using TensorND = megdnn::TensorND; | |||||
auto&& op_def = def.cast_final_safe<Elemwise>(); | auto&& op_def = def.cast_final_safe<Elemwise>(); | ||||
SmallVector<TensorND> inp_tensornds; | |||||
TensorShapeArray inp_shapes(inputs.size()); | |||||
inp_tensornds.reserve(inputs.size()); | |||||
TensorLayout layout{inputs[0]->layout().dtype}; | |||||
bool is_empty = false; | |||||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
if (inputs[i]->layout().is_empty()) { | |||||
is_empty = true; | |||||
} | |||||
inp_tensornds.push_back(inputs[i]->dnn_tensor()); | |||||
inp_shapes[i] = inputs[i]->layout(); | |||||
auto mode = op_def.mode; | |||||
TensorShapeArray input_shapes; | |||||
input_shapes.reserve(inputs.size()); | |||||
for (auto&& input : inputs) { | |||||
input_shapes.push_back(input->shape()); | |||||
} | } | ||||
megdnn::Elemwise::deduce_shape(inp_shapes, layout); | |||||
layout.init_contiguous_stride(); | |||||
auto out = Tensor::make(layout, comp_node); | |||||
if (is_empty) { | |||||
return {out}; | |||||
// deduce_shape is static and fast | |||||
TensorLayout output_layout{dtype}; | |||||
// TODO: deduce_layout directly | |||||
megdnn::Elemwise::deduce_shape(input_shapes, output_layout); | |||||
output_layout.init_contiguous_stride(); | |||||
auto output = Tensor::make(output_layout, comp_node); | |||||
if (output_layout.is_empty()) { | |||||
return {output}; | |||||
} | } | ||||
DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node); | |||||
dnn_opr.op->param() = op_def.param(); | |||||
if (dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD3 || | |||||
dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD4 || | |||||
(inp_tensornds.size() && | |||||
inp_tensornds[0].layout.dtype.category() == DTypeCategory::QUANTIZED)) { | |||||
opr::Elemwise::perform_dnn( | |||||
comp_node, out->dnn_tensor(), inp_tensornds, dnn_opr.op); | |||||
DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node, op_def.param()); | |||||
if (mode == Mode::FUSE_MUL_ADD3 || mode == Mode::FUSE_MUL_ADD4 || | |||||
dtype.category() == DTypeCategory::QUANTIZED) { | |||||
dnn_opr.call_dnn( | |||||
[&](auto&& inputs, auto&& output) { | |||||
opr::Elemwise::perform_dnn(comp_node, output, inputs, dnn_opr.op()); | |||||
}, | |||||
inputs, output); | |||||
} else { | } else { | ||||
dnn_opr.op->exec(inp_tensornds, out->dnn_tensor()); | |||||
dnn_opr.exec(inputs, output); | |||||
} | } | ||||
return {out}; | |||||
return {output}; | |||||
} | } | ||||
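For Elemwise the output shape comes from the static megdnn::Elemwise::deduce_shape, so no operator instance is needed to handle empty inputs; only the actual execution constructs a DnnOprCaller, and FUSE_MUL_ADD3/4 or quantized dtypes still detour through opr::Elemwise::perform_dnn via call_dnn. A compact sketch of the common path (includes assumed from the surrounding file):

// Sketch only: static shape deduction plus the default execution path.
TensorPtr elemwise_sketch(
        const Elemwise& op_def, const SmallVector<TensorPtr>& inputs) {
    auto cn = inputs[0]->comp_node();
    TensorShapeArray shapes;
    for (auto&& input : inputs) {
        shapes.push_back(input->shape());
    }
    TensorLayout out_layout{inputs[0]->dtype()};
    megdnn::Elemwise::deduce_shape(shapes, out_layout);  // static, no opr needed
    out_layout.init_contiguous_stride();
    auto out = Tensor::make(out_layout, cn);
    if (out_layout.is_empty()) {
        return out;  // nothing to compute for empty tensors
    }
    DnnOprCaller<megdnn::Elemwise> dnn_opr(cn, op_def.param());
    // FUSE_MUL_ADD3/4 and quantized dtypes go through call_dnn + perform_dnn
    // (omitted here); everything else uses the plain exec path.
    dnn_opr.exec(inputs, out);
    return out;
}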
MGB_DEFINE_OPR_CLASS( | MGB_DEFINE_OPR_CLASS( | ||||
@@ -179,7 +171,7 @@ protected: | |||||
return ret; | return ret; | ||||
} | } | ||||
void create_megdnn_opr() override { | void create_megdnn_opr() override { | ||||
auto opr = DnnOprCaller<megdnn::Elemwise>::create_operator(comp_node()); | |||||
auto opr = mgb::opr::intl::create_megdnn_opr<megdnn::Elemwise>(comp_node()); | |||||
opr->param().mode = m_param.mode; | opr->param().mode = m_param.mode; | ||||
set_megdnn_opr(std::move(opr)); | set_megdnn_opr(std::move(opr)); | ||||
} | } | ||||
@@ -243,22 +235,19 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor( | |||||
"This inplace modification may change the elements of other tensors. " | "This inplace modification may change the elements of other tensors. " | ||||
"Fallback to non-inplace update."); | "Fallback to non-inplace update."); | ||||
DeviceTensorStorage storage; | |||||
storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage()); | |||||
storage = storage.sub(dest->offset()); | |||||
DeviceTensorND dv; | |||||
dv.reset(storage, dest->layout()); | |||||
DeviceTensorND dv_new; | |||||
dv_new.copy_from(dv); | |||||
dest = Tensor::make(dv_new); | |||||
auto dest_layout = inputs[0]->layout(); | |||||
dest_layout.init_contiguous_stride(); | |||||
auto new_dest = Tensor::make(dest_layout, inputs[0]->comp_node()); | |||||
new_dest->dev_tensor().copy_from(dest->dev_tensor()); | |||||
dest = new_dest; | |||||
} | } | ||||
auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { | auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { | ||||
return *tensor->get_value().ptr<float>(); | return *tensor->get_value().ptr<float>(); | ||||
}; | }; | ||||
DnnOprCaller<megdnn::AddUpdate> caller{dest->comp_node()}; | |||||
caller.op->param() = {tensor_to_scalar(alpha), tensor_to_scalar(beta)}; | |||||
caller.op->exec(dest->dev_tensor().as_megdnn(), delta->dev_tensor().as_megdnn()); | |||||
DnnOprCaller<megdnn::AddUpdate> caller{ | |||||
dest->comp_node(), {tensor_to_scalar(alpha), tensor_to_scalar(beta)}}; | |||||
caller.exec(dest, delta); | |||||
// FIXME: inplace update host value | |||||
return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())}; | return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())}; | ||||
} | } | ||||
@@ -67,10 +67,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
auto&& op = def.cast_final_safe<IndexingOneHot>(); | auto&& op = def.cast_final_safe<IndexingOneHot>(); | ||||
auto&& inp = inputs[0]; | auto&& inp = inputs[0]; | ||||
auto&& index = inputs[1]; | auto&& index = inputs[1]; | ||||
TensorLayout layout = inp->layout(); | |||||
TensorLayout index_layout = index->layout(); | |||||
DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node()); | |||||
auto&& indexing_one_hot_param = dnn_op.op->param(); | |||||
auto&& layout = inp->layout(); | |||||
auto&& index_layout = index->layout(); | |||||
int real_axis = static_cast<int>(op.axis); | int real_axis = static_cast<int>(op.axis); | ||||
if (real_axis < 0) { | if (real_axis < 0) { | ||||
real_axis += static_cast<int>(layout.ndim); | real_axis += static_cast<int>(layout.ndim); | ||||
@@ -79,16 +77,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
0 <= real_axis && real_axis < static_cast<int>(layout.ndim), | 0 <= real_axis && real_axis < static_cast<int>(layout.ndim), | ||||
"Dimension out of range (expected to be in range of [%d, %d], but got %d)", | "Dimension out of range (expected to be in range of [%d, %d], but got %d)", | ||||
0, static_cast<int>(layout.ndim) - 1, op.axis); | 0, static_cast<int>(layout.ndim) - 1, op.axis); | ||||
indexing_one_hot_param = real_axis; | |||||
TensorLayout tlayout; | |||||
dnn_op.op->deduce_layout(layout, index_layout, tlayout); | |||||
TensorPtr out = Tensor::make(tlayout, inp->comp_node()); | |||||
megdnn::TensorND in = inp->dnn_tensor(); | |||||
megdnn::TensorND ind = index->dnn_tensor(); | |||||
size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||||
auto dnn_workspace = dnn_op.create_workspace(sz); | |||||
dnn_op.op->exec(in, ind, out->dnn_tensor(), dnn_workspace); | |||||
DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node(), real_axis); | |||||
auto tlayout = dnn_op.deduce_layout(layout, index_layout); | |||||
auto out = Tensor::make(tlayout, inp->comp_node()); | |||||
dnn_op.exec_with_ws(inp, index, out); | |||||
return {out}; | return {out}; | ||||
} | } | ||||
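exec_with_ws, as used here, replaces the manual get_workspace_in_bytes/create_workspace/exec sequence for operators that need a workspace but no algorithm selection. A sketch of the IndexingOneHot path in isolation (based only on the hunk above):

// Sketch only: deduce the output layout, then execute with a managed workspace.
TensorPtr indexing_one_hot_sketch(
        int real_axis, const TensorPtr& inp, const TensorPtr& index) {
    // the normalized axis is the operator param, bound at construction
    DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node(), real_axis);
    auto out_layout = dnn_op.deduce_layout(inp->layout(), index->layout());
    auto out = Tensor::make(out_layout, inp->comp_node());
    dnn_op.exec_with_ws(inp, index, out);  // workspace sized from the layouts
    return out;
}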
@@ -105,15 +97,14 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | ||||
mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); | mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); | ||||
auto comp_node = input_descs[0].comp_node; | auto comp_node = input_descs[0].comp_node; | ||||
TensorLayout src = input_descs[0].layout, index = input_descs[1].layout; | |||||
auto&& src = input_descs[0].layout; | |||||
auto&& index = input_descs[1].layout; | |||||
mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); | mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); | ||||
if (!src.ndim) { | if (!src.ndim) { | ||||
return {{{{{}, src.dtype}, comp_node}}, false}; | return {{{{{}, src.dtype}, comp_node}}, false}; | ||||
} | } | ||||
mgb_assert(src.is_contiguous(), "src should be contiguous"); | mgb_assert(src.is_contiguous(), "src should be contiguous"); | ||||
return {{input_descs[0]}, true}; | |||||
return {{{src, comp_node}}, true}; | |||||
} | } | ||||
auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | ||||
@@ -136,25 +127,15 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
auto&& index = inputs[1]; | auto&& index = inputs[1]; | ||||
auto&& sub = inputs[2]; | auto&& sub = inputs[2]; | ||||
TensorLayout layout = inp->layout(); | TensorLayout layout = inp->layout(); | ||||
TensorLayout index_layout = index->layout(); | |||||
TensorLayout tlayout = sub->layout(); | |||||
mgb_assert(layout.is_contiguous()); | mgb_assert(layout.is_contiguous()); | ||||
DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node()); | |||||
auto&& indexing_one_hot_param = dnn_op.op->param(); | |||||
int real_axis = static_cast<int>(op.axis); | int real_axis = static_cast<int>(op.axis); | ||||
if (real_axis < 0) { | if (real_axis < 0) { | ||||
real_axis += static_cast<int>(layout.ndim); | real_axis += static_cast<int>(layout.ndim); | ||||
} | } | ||||
indexing_one_hot_param = real_axis; | |||||
DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node(), real_axis); | |||||
TensorPtr out = Tensor::make(layout, inp->comp_node()); | TensorPtr out = Tensor::make(layout, inp->comp_node()); | ||||
out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); | out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); | ||||
megdnn::TensorND in = inp->dnn_tensor(); | |||||
megdnn::TensorND ind = index->dnn_tensor(); | |||||
megdnn::TensorND su = sub->dnn_tensor(); | |||||
size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||||
auto dnn_workspace = dnn_op.create_workspace(sz); | |||||
dnn_op.op->exec(out->dnn_tensor(), ind, su, dnn_workspace); | |||||
dnn_op.exec_with_ws(out, index, sub); | |||||
return {out}; | return {out}; | ||||
} | } | ||||
@@ -54,14 +54,15 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( | |||||
TensorPtr megray_recv_tensor( | TensorPtr megray_recv_tensor( | ||||
std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout, | std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout, | ||||
CompNode cn, uint32_t rank_from) { | CompNode cn, uint32_t rank_from) { | ||||
DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, layout); | |||||
auto out = Tensor::make(layout, cn); | |||||
auto dnn_out = out->dnn_tensor(); | |||||
auto megray_ctx = mgb::opr::get_megray_context(cn); | auto megray_ctx = mgb::opr::get_megray_context(cn); | ||||
size_t data_size = layout.total_nr_elems(); | size_t data_size = layout.total_nr_elems(); | ||||
auto status = megray_comm->recv( | auto status = megray_comm->recv( | ||||
out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||||
dnn_out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||||
rank_from, megray_ctx); | rank_from, megray_ctx); | ||||
mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); | mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); | ||||
return Tensor::make(out); | |||||
return out; | |||||
} | } | ||||
void megray_send_tensor( | void megray_send_tensor( | ||||
@@ -105,9 +106,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor_remote_send( | |||||
mgb_assert(megray_comm != nullptr); | mgb_assert(megray_comm != nullptr); | ||||
megray_send_tensor(megray_comm, inputs[0], op.rank_to); | megray_send_tensor(megray_comm, inputs[0], op.rank_to); | ||||
TensorLayout layout({0}, inputs[0]->dtype()); | TensorLayout layout({0}, inputs[0]->dtype()); | ||||
DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag( | |||||
inputs[0]->comp_node(), layout); | |||||
return {Tensor::make(out)}; | |||||
return {Tensor::make(layout, inputs[0]->comp_node())}; | |||||
} | } | ||||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv( | ||||
@@ -21,14 +21,17 @@ SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | ||||
mgb_assert(input_descs.size() == 4, "LAMBUpdate expects 4 inputs"); | mgb_assert(input_descs.size() == 4, "LAMBUpdate expects 4 inputs"); | ||||
auto comp_node = input_descs[0].comp_node; | auto comp_node = input_descs[0].comp_node; | ||||
auto comp_node1 = input_descs[1].comp_node; | auto comp_node1 = input_descs[1].comp_node; | ||||
auto comp_node2 = input_descs[2].comp_node; | auto comp_node2 = input_descs[2].comp_node; | ||||
TensorLayout m_t_1 = input_descs[0].layout, v_t_1 = input_descs[1].layout, | |||||
lamb_param = input_descs[2].layout, grad = input_descs[3].layout; | |||||
TensorLayout new_param = lamb_param, m_t = m_t_1, v_t = v_t_1; | |||||
auto&& m_t_1 = input_descs[0].layout; | |||||
auto&& v_t_1 = input_descs[1].layout; | |||||
auto&& lamb_param = input_descs[2].layout; | |||||
auto&& grad = input_descs[3].layout; | |||||
MGB_MARK_USED_VAR(grad); | |||||
auto&& new_param = lamb_param; | |||||
auto&& m_t = m_t_1; | |||||
auto&& v_t = v_t_1; | |||||
return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; | return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; | ||||
} | } | ||||
@@ -46,23 +49,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
TensorLayout lamb_param_layout{lamb_param->layout()}; | TensorLayout lamb_param_layout{lamb_param->layout()}; | ||||
auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); | auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); | ||||
auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); | auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); | ||||
auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); | auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); | ||||
DnnOprCaller<megdnn::LAMBUpdate> caller{lamb_param->comp_node()}; | |||||
size_t sz = caller.op->get_workspace_in_bytes( | |||||
m_t_1->layout(), v_t_1->layout(), lamb_param->layout(), grad->layout(), | |||||
m_t->layout(), v_t->layout(), new_param->layout()); | |||||
auto dnn_workspace = caller.create_workspace(sz); | |||||
caller.op->param() = op.param(); | |||||
caller.op->exec( | |||||
m_t_1->dev_tensor().as_megdnn(), v_t_1->dev_tensor().as_megdnn(), | |||||
lamb_param->dev_tensor().as_megdnn(), grad->dev_tensor().as_megdnn(), | |||||
m_t->dnn_tensor(), v_t->dnn_tensor(), new_param->dnn_tensor(), | |||||
dnn_workspace); | |||||
DnnOprCaller<megdnn::LAMBUpdate> dnn_opr{lamb_param->comp_node(), op.param()}; | |||||
dnn_opr.exec_with_ws(m_t_1, v_t_1, lamb_param, grad, m_t, v_t, new_param); | |||||
return {m_t, v_t, new_param}; | return {m_t, v_t, new_param}; | ||||
} | } | ||||
@@ -29,11 +29,11 @@ cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& in | |||||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
auto&& op_def = def.cast_final_safe<LayerNorm>(); | |||||
auto&& layer_norm = def.cast_final_safe<LayerNorm>(); | |||||
size_t nr_inp = inputs.size(); | size_t nr_inp = inputs.size(); | ||||
auto p = op_def.param(); | |||||
auto affine = layer_norm.affine; | |||||
mgb_assert( | mgb_assert( | ||||
(nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine), | |||||
(nr_inp == 3 && affine) || (nr_inp == 1 && !affine), | |||||
"num of inputs of pooling should be 1 or 3 but you give %zu", | "num of inputs of pooling should be 1 or 3 but you give %zu", | ||||
inputs.size()); | inputs.size()); | ||||
@@ -47,9 +47,9 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
false}; | false}; | ||||
} | } | ||||
TensorLayout oup_layout, mean_layout, rstd_layout; | |||||
megdnn::LayerNorm::deduce_layout_fwd_impl( | |||||
inp.layout, p, oup_layout, mean_layout, rstd_layout); | |||||
DnnOprHelper<megdnn::LayerNorm> dnn_opr(layer_norm.param()); | |||||
auto&& [oup_layout, mean_layout, rstd_layout] = | |||||
dnn_opr.deduce_layouts<3>(inp.layout, TensorLayout{}, TensorLayout{}); | |||||
return {{{oup_layout, inp_cn, {}}, | return {{{oup_layout, inp_cn, {}}, | ||||
{mean_layout, inp_cn, {}}, | {mean_layout, inp_cn, {}}, | ||||
{rstd_layout, inp_cn, {}}}, | {rstd_layout, inp_cn, {}}}, | ||||
@@ -69,32 +69,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
inputs.size()); | inputs.size()); | ||||
auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
DnnOprCaller<megdnn::LayerNorm> caller(cn); | |||||
auto&& dnn_opr = caller.op; | |||||
dnn_opr->param() = p; | |||||
DnnOprCaller<megdnn::LayerNorm> caller(cn, op_def.param()); | |||||
TensorLayout oup_layout, mean_layout, rstd_layout; | |||||
megdnn::LayerNorm::deduce_layout_fwd_impl( | |||||
inputs[0]->dnn_tensor().layout, p, oup_layout, mean_layout, rstd_layout); | |||||
auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>( | |||||
inputs[0]->layout(), TensorLayout{}, TensorLayout{}); | |||||
auto out = Tensor::make(oup_layout, cn); | auto out = Tensor::make(oup_layout, cn); | ||||
auto mean = Tensor::make(mean_layout, cn); | auto mean = Tensor::make(mean_layout, cn); | ||||
auto rstd = Tensor::make(rstd_layout, cn); | auto rstd = Tensor::make(rstd_layout, cn); | ||||
auto wk_size = caller.op->get_workspace_in_bytes( | |||||
inputs[0]->dnn_tensor().layout, | |||||
p.affine ? inputs[1]->dnn_tensor().layout : TensorLayout(), | |||||
p.affine ? inputs[2]->dnn_tensor().layout : TensorLayout(), oup_layout, | |||||
mean_layout, rstd_layout); | |||||
auto dnn_wk = caller.create_workspace(wk_size); | |||||
caller.op->exec( | |||||
inputs[0]->dnn_tensor(), | |||||
p.affine ? inputs[1]->dnn_tensor() : megdnn::TensorND(), | |||||
p.affine ? inputs[2]->dnn_tensor() : megdnn::TensorND(), out->dnn_tensor(), | |||||
mean->dnn_tensor(), rstd->dnn_tensor(), dnn_wk); | |||||
if (p.affine) { | |||||
caller.exec_with_ws(inputs[0], inputs[1], inputs[2], out, mean, rstd); | |||||
} else { | |||||
megdnn::TensorND empty_dnn; | |||||
caller.exec_with_ws(inputs[0], empty_dnn, empty_dnn, out, mean, rstd); | |||||
} | |||||
return {out, mean, rstd}; | return {out, mean, rstd}; | ||||
} | } | ||||
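Operators with several outputs use deduce_layouts<N>, which returns all N output layouts at once and unpacks with structured bindings; the empty TensorLayout{} arguments stand in for the optional affine weight/bias inputs, whose layouts are not needed for the deduction. A sketch of that call (helper name from the hunks above, includes assumed):

// Sketch only: multi-output layout deduction for LayerNorm.
std::tuple<TensorLayout, TensorLayout, TensorLayout> layer_norm_layouts_sketch(
        DnnOprHelper<megdnn::LayerNorm>& dnn_opr, const TensorLayout& src) {
    // empty layouts are placeholders for the optional weight/bias inputs
    auto&& [out, mean, rstd] =
            dnn_opr.deduce_layouts<3>(src, TensorLayout{}, TensorLayout{});
    return {out, mean, rstd};
}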
@@ -105,4 +94,4 @@ OP_TRAIT_REG(LayerNorm, LayerNorm) | |||||
.fallback(); | .fallback(); | ||||
} // namespace layer_norm | } // namespace layer_norm | ||||
} // namespace mgb::imperative | |||||
} // namespace mgb::imperative |
@@ -24,7 +24,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
auto dim1 = matmul.dimA, dim2 = matmul.dimB; | auto dim1 = matmul.dimA, dim2 = matmul.dimB; | ||||
auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
using Desc = opr::AxisAddRemove::AxisDesc; | |||||
using IndexDesc = opr::Subtensor::IndexDesc; | using IndexDesc = opr::Subtensor::IndexDesc; | ||||
OperatorNodeConfig config{matmul.make_name(), cn}; | OperatorNodeConfig config{matmul.make_name(), cn}; | ||||
@@ -104,9 +103,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
dim1 = dim2 = 2; | dim1 = dim2 = 2; | ||||
} | } | ||||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||||
dnn_opr.op->param() = matmul.param(); | |||||
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||||
dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
if (dim1 == 0 || dim2 == 0) { | if (dim1 == 0 || dim2 == 0) { | ||||
return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | ||||
@@ -143,8 +141,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
SmallVector<TensorND> inp_tensornds(inputs.size()); | SmallVector<TensorND> inp_tensornds(inputs.size()); | ||||
TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | ||||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn); | |||||
dnn_opr.op->param() = matmul.param(); | |||||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||||
if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward | if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward | ||||
for (size_t i = 1; i + 1 < layout1.ndim; ++i) { | for (size_t i = 1; i + 1 < layout1.ndim; ++i) { | ||||
@@ -160,7 +157,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
} | } | ||||
DType dst_dtype; | DType dst_dtype; | ||||
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
// only matters when layout1 has dim 2 | // only matters when layout1 has dim 2 | ||||
if (matmul.transposeA) | if (matmul.transposeA) | ||||
@@ -229,13 +226,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
inp_tensornds[0].layout = layout_a; | inp_tensornds[0].layout = layout_a; | ||||
inp_tensornds[1].layout = layout_b; | inp_tensornds[1].layout = layout_b; | ||||
} | } | ||||
size_t sz = setup_algo<megdnn::MatrixMul>( | |||||
{layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||||
matmul.policy(), false, &inp_tensornds); | |||||
auto out = Tensor::make(dst_layout, cn); | auto out = Tensor::make(dst_layout, cn); | ||||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||||
return {out->sub(0, real_dst_layout)}; | return {out->sub(0, real_dst_layout)}; | ||||
} | } | ||||
@@ -266,7 +258,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
auto dim1 = matmul.dimA, dim2 = matmul.dimB;
auto cn = inputs[0]->comp_node();
using Desc = opr::AxisAddRemove::AxisDesc;
using IndexDesc = opr::Subtensor::IndexDesc;
OperatorNodeConfig config{matmul.make_name(), cn};
@@ -343,9 +334,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
DType dst_dtype;
DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node);
dnn_opr.op->param() = matmul.param();
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param());
dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
if (dim1 == 0 || dim2 == 0) {
return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false};
@@ -386,10 +376,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout();
size_t dim1 = layout1.ndim, dim2 = layout2.ndim;
DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn);
dnn_opr.op->param() = matmul.param();
DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn, matmul.param(), matmul.policy());
DType dst_dtype;
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
TensorShape tshp, batch_shp;
size_t j = 0;
@@ -473,14 +462,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inp_tensornds[1] = inp2->dnn_tensor();
inp_tensornds[1].layout = layout2;
size_t sz = setup_algo<megdnn::BatchedMatrixMul>(
{layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
matmul.policy(), false, &inp_tensornds);
auto out = Tensor::make(dst_layout, cn);
auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out);
shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2];
shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1];
@@ -533,7 +517,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout oup_layout{inputs[0]->dtype()};
auto inp1_tensor = inputs[0]->dnn_tensor();
auto inp2_tensor = inputs[1]->dnn_tensor();
dnn_opr.op->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout);
oup_layout = dnn_opr.deduce_layout(inp1_tensor.layout, inp2_tensor.layout);
if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) {
auto out = Tensor::make(oup_layout, comp_node);
@@ -543,14 +527,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
return {out};
}
auto sz = dnn_opr.op->get_workspace_in_bytes(
inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout);
auto out = Tensor::make(oup_layout, comp_node);
auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_with_ws(inp_tensornds[0], inp_tensornds[1], out);
return {out};
}
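Two different helpers show up in the rewritten call sites. Judging from which old code they replace, exec_fastrun stands in where setup_algo used to run (operators with tunable algorithms such as MatrixMul and BatchedMatrixMul), while exec_with_ws stands in where the old code only queried get_workspace_in_bytes and allocated a temporary workspace (the call just above). A hedged summary, with a and b standing for the dnn inputs; the internals are assumptions, not shown in this patch:

    dnn_opr.exec_fastrun(a, b, out);  // assumed: profile/select an algorithm per the policy, then execute
    dnn_opr.exec_with_ws(a, b, out);  // assumed: query the workspace size, allocate it, then execute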
@@ -17,27 +17,18 @@ SymbolVarArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
size_t size = inputs.size();
auto&& op = def.cast_final_safe<CheckNonFinite>();
SmallVector<TensorPtr> outputs(size + 1);
outputs[size] = Tensor::make(
TensorLayout(TensorShape({1}), dtype::Int32()), inputs[0]->comp_node());
auto dest = outputs[size];
auto cn = dest->comp_node();
DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(cn);
SmallVector<megdnn::TensorND> srcs(size);
// copy inputs into the outputs so the dnn op can work on them in place
for (size_t i = 0; i < size; ++i) {
outputs[i] = Tensor::make(inputs[i]->layout(), inputs[0]->comp_node());
outputs[i]->dev_tensor().copy_from_fixlayout(inputs[i]->dev_tensor());
srcs[i] = outputs[i]->dev_tensor().as_megdnn();
auto comp_node = inputs[0]->comp_node();
auto dest = Tensor::make(TensorLayout({1}, dtype::Int32()), comp_node);
SmallVector<TensorPtr> outputs;
outputs.reserve(inputs.size() + 1);
for (auto&& input : inputs) {
outputs.push_back(Tensor::make(input->layout(), comp_node));
outputs.back()->dev_tensor().copy_from_fixlayout(input->dev_tensor());
}
megdnn::CheckNonFinite::Param param({op.scale});
dnn_opr.op->param() = param;
size_t sz = dnn_opr.op->get_workspace_in_bytes(srcs, dest->layout());
auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(srcs, dest->dnn_tensor(), dnn_wk);
DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(comp_node, {op.scale});
dnn_opr.exec_with_ws(outputs, dest);
outputs.push_back(dest);
return outputs;
}
@@ -45,13 +36,15 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
size_t size = inputs.size();
SmallVector<LogicalTensorDesc> dests(size + 1);
bool validated = true;
for (size_t i = 0; i < size; ++i) {
dests[i].comp_node = inputs[i].comp_node;
dests[i].layout = inputs[i].layout;
validated &= bool(dests[i].layout.ndim);
}
dests[size].comp_node = inputs[0].comp_node;
dests[size].layout = TensorLayout(TensorShape({1}), dtype::Int32());
return {dests, true};
dests[size].layout = TensorLayout({1}, dtype::Int32());
return {dests, validated};
}
OP_TRAIT_REG(CheckNonFinite, CheckNonFinite)
@@ -27,40 +27,31 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto comp_node = inputs[0]->comp_node();
auto&& op_def = def.cast_final_safe<Padding>();
DnnOprCaller<megdnn::Padding> dnn_op(comp_node);
dnn_op.op->param() = op_def.param();
TensorLayout dst = output_descs[0].layout;
if (!validated) {
megdnn::Padding::deduce_layout_impl(
inputs[0]->dnn_tensor().layout, dst, op_def.param());
}
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(comp_node, dst);
dnn_op.op->exec(inputs[0]->dnn_tensor(), out.as_megdnn());
return {Tensor::make(out)};
DnnOprCaller<megdnn::Padding> dnn_op(comp_node, op_def.param());
auto dst = [&] {
if (validated) {
return output_descs[0].layout;
} else {
return dnn_op.deduce_layout(inputs[0]->layout());
}
}();
auto out = Tensor::make(dst, comp_node);
dnn_op.exec(inputs[0], out);
return {out};
}
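The dst layout is picked with an immediately invoked lambda: reuse the already validated descriptor, otherwise deduce it again. The idiom itself is plain C++ and keeps the variable initialized in a single expression; a standalone sketch with ints standing in for layouts:

    #include <cstdio>
    int main() {
        bool validated = false;
        auto dst = [&] {
            if (validated) {
                return 10;  // stand-in for output_descs[0].layout
            } else {
                return 20;  // stand-in for dnn_op.deduce_layout(...)
            }
        }();  // the lambda runs immediately, so dst is never left uninitialized
        std::printf("dst = %d\n", dst);
        return 0;
    }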
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
auto&& op_def = def.cast_final_safe<Padding>();
size_t nr_inp = inputs.size();
auto p = op_def.param();
auto&& inp = inputs[0];
auto& inp_cn = inp.comp_node;
if (inp.layout.ndim == 0) {
return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false};
return {{{TensorLayout{inp.layout.dtype}, inp.comp_node, {}}}, false};
}
TensorLayout oup_layout;
megdnn::Padding::deduce_layout_impl(inp.layout, oup_layout, p);
return {{{oup_layout, inp_cn, {}}}, true};
DnnOprHelper<megdnn::Padding> dnn_op(op_def.param());
auto oup_layout = dnn_op.deduce_layout(inp.layout);
return {{{oup_layout, inp.comp_node}}, true};
}
OP_TRAIT_REG(Padding, Padding, opr::Padding)
@@ -74,4 +65,4 @@ OP_TRAIT_REG(Padding, Padding, opr::Padding)
} // namespace imperative
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -25,19 +25,13 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
mgb_assert(
inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu",
inputs.size());
auto&& op_def = def.cast_final_safe<Pooling>();
auto&& inp = inputs[0];
auto& inp_cn = inp.comp_node;
if (inp.layout.ndim == 0) {
return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false};
if (!inputs[0].layout.ndim) {
return {{{inputs[0].layout, inputs[0].comp_node}}, false};
}
TensorLayout oup_layout;
megdnn::Pooling::deduce_layout_impl(inp.layout, op_def.param(), oup_layout);
return {{{oup_layout, inp_cn, {}}}, true};
DnnOprHelper<megdnn::Pooling> dnn_opr(op_def.param());
auto oup_layout = dnn_opr.deduce_layout(inputs[0].layout);
return {{{oup_layout, inputs[0].comp_node}}, true};
}
SmallVector<TensorPtr> apply_on_physical_tensor(
@@ -47,30 +41,18 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu",
inputs.size());
auto&& op_def = def.cast_final_safe<Pooling>();
auto&& pooling = def.cast_final_safe<Pooling>();
auto cn = inputs[0]->comp_node();
DnnOprCaller<megdnn::Pooling> caller(cn);
auto&& dnn_opr = caller.op;
dnn_opr->param() = op_def.param();
SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
inp_tensornds[0] = inputs[0]->dnn_tensor();
TensorLayout& oup_layout = output_descs[0].layout;
if (!validated) {
megdnn::Pooling::deduce_layout_impl(
inp_tensornds[0].layout, op_def.param(), oup_layout);
}
size_t wk_size = setup_algo<megdnn::Pooling>(
{inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
op_def.policy(), false, &inp_tensornds);
DnnOprCaller<megdnn::Pooling> dnn_opr(cn, pooling.param(), pooling.policy());
auto oup_layout = [&] {
if (validated) {
return output_descs[0].layout;
} else {
return dnn_opr.deduce_layout(inputs[0]->layout());
}
}();
auto out = Tensor::make(oup_layout, cn);
auto dnn_wk = caller.create_workspace(wk_size);
caller.op->exec(inp_tensornds[0], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inputs[0], out);
return {out};
}
@@ -18,33 +18,31 @@ namespace reduce {
auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
auto&& reduce = static_cast<const Reduce&>(def);
auto comp_node = inputs[0]->comp_node();
OperatorNodeConfig config{reduce.make_name(), comp_node, inputs[0]->dtype()};
auto name = reduce.make_name();
if (inputs.size() > 1) {
return opr::Reduce::make(inputs[0], reduce.param(), inputs[1], config);
}
using Param = megdnn::param::Reduce;
auto param = reduce.param();
if (param.axis < 0) {
param.axis = inputs[0]->shape().ndim + param.axis;
auto axis = param.axis;
auto keepdim = reduce.keepdim;
if (inputs.size() == 2) {
return opr::Reduce::make(inputs[0], param, inputs[1], {name});
}
mgb_assert(inputs.size() == 1);
SymbolVar target_shape = (cg::VarNode*)nullptr;
if (param.axis == INT_MAX) {
DTypeScalar vi{1};
// auto graph = ComputingGraph::make();
if (axis == INT_MAX) {
// keepdim could be ignored when ndim == 1
auto graph = inputs[0]->owner_graph();
target_shape = opr::ImmutableTensor::make(*graph, vi, config);
auto scalar_shape =
opr::ImmutableTensor::make(*graph, DTypeScalar(1), {name, comp_node});
return opr::Reduce::make(inputs[0], param, scalar_shape, {name});
}
auto res = opr::Reduce::make(inputs[0], param, target_shape, config);
if (!reduce.keepdim && param.axis != INT_MAX) {
// mgb::opr::Reduce supports negative axis
auto res = opr::Reduce::make(inputs[0], param, {}, {name});
if (!keepdim) {
using Desc = opr::AxisAddRemove::AxisDesc;
std::vector<Desc> remove_param;
remove_param.push_back(Desc::make_remove(param.axis));
OperatorNodeConfig remove_config{
def.make_name(), comp_node, inputs[0]->dtype()};
return opr::AxisAddRemove::make(res, remove_param, remove_config);
std::vector<Desc> remove_axis_param;
remove_axis_param.push_back(Desc::make_remove(axis));
res = opr::AxisAddRemove::make(res, remove_axis_param, {name});
}
return res;
}
@@ -71,111 +69,104 @@ bool memory_forward_success(const OpDef& def, SmallVector<TensorPtr> inputs) {
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
// memory forward
if (memory_forward_success(def, inputs)) {
// may return a tensor that aliases inputs[0] directly
return {Tensor::make(
inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())};
}
auto size = inputs.size();
if (size > 1) {
if (inputs.size() == 2) {
// reduce to target shape, fallback to proxy_graph
return proxy_graph_detail::apply_on_physical_tensor(
def, inputs, output_descs, validated);
}
mgb_assert(inputs.size() == 1);
auto comp_node = inputs[0]->comp_node();
using TensorND = megdnn::TensorND;
auto&& op_def = def.cast_final_safe<Reduce>();
SmallVector<TensorND> inp_tensornds;
inp_tensornds.reserve(inputs.size());
auto src = inputs[0]->layout();
DnnOprCaller<megdnn::Reduce> dnn_op(comp_node);
dnn_op.op->param() = op_def.param();
auto axis = op_def.param().axis;
DnnOprCaller<megdnn::Reduce> dnn_op(comp_node, op_def.param());
auto&& mode = dnn_op.param().mode;
auto& axis = dnn_op.param().axis;
auto keepdim = op_def.keepdim;
if (axis < 0) {
axis = inputs[0]->layout().ndim + axis;
}
dnn_op.op->param().axis = axis == INT_MAX ? 0 : axis;
if (axis == INT_MAX) {
src.shape[0] = src.total_nr_elems();
src.ndim = 1;
src.init_contiguous_stride();
}
TensorLayout layout{src.dtype};
dnn_op.op->deduce_layout(src, layout);
if (inputs[0]->layout().is_empty()) {
inputs[0]->dev_tensor().reset(inputs[0]->dev_tensor().storage(), src);
auto mode = op_def.param().mode;
if (!keepdim && src.ndim > 1) {
layout.remove_axis_inplace(axis);
layout.init_contiguous_stride();
DnnTensorND dnn_input = [&] {
if (axis == INT_MAX) { // reduce to scalar
axis = 0;
// flatten input
return inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()});
} else {
if (axis < 0) {
axis = inputs[0]->layout().ndim + axis;
}
mgb_assert(axis >= 0 && axis < inputs[0]->layout().ndim);
return inputs[0]->dnn_tensor();
}
auto out = Tensor::make(layout, comp_node);
}();
auto output_layout = dnn_op.deduce_layout(dnn_input.layout);
auto resolve_keepdim = [&] {
if (!keepdim) {
if (output_layout.ndim > 1) {
mgb_assert(output_layout.shape[axis] == 1);
output_layout.remove_axis_inplace(axis);
}
}
};
std::string err_msg;
TensorPtr output;
if (output_layout.is_empty()) {
// output empty, no computation
resolve_keepdim();
output = Tensor::make(output_layout, comp_node);
} else if (dnn_input.layout.is_empty()) {
// input empty but output not, do fill
resolve_keepdim();
output = Tensor::make(output_layout, comp_node);
auto on_bad_empty_reduce = [](const char* name) {
mgb_throw(
MegBrainError, "empty input is not allowed for reduce mode: %s",
name);
};
switch (mode) {
case Reduce::Mode::SUM:
if (!out->empty()) {
dev_tensor_memset(out->dev_tensor(), 0);
}
// fill 0
dev_tensor_memset(output->dev_tensor(), 0);
break;
case Reduce::Mode::PRODUCT:
if (!out->empty()) {
DnnOprCaller<megdnn::Fill> fill_op(comp_node);
fill_op.op->param() = 1;
fill_op.op->exec(out->dnn_tensor(), {});
}
case Reduce::Mode::PRODUCT: {
// fill 1
DnnOprCaller<megdnn::Fill> fill_op(comp_node, {1});
fill_op.exec_with_ws(output);
break;
}
case Reduce::Mode::MEAN:
err_msg = "mean";
on_bad_empty_reduce("mean");
break;
case Reduce::Mode::MIN:
err_msg = "min";
on_bad_empty_reduce("min");
break;
case Reduce::Mode::MAX:
err_msg = "max";
on_bad_empty_reduce("max");
break;
case Reduce::Mode::SUM_SQR:
err_msg = "sum_sqr";
on_bad_empty_reduce("sum_sqr");
break;
default:
mgb_throw(MegBrainError, "bad reduce mode");
}
if (!err_msg.empty()) {
mgb_throw(
MegBrainError, "empty input is not allowed for reduce mode: %s",
err_msg.c_str());
} else {
// common reduction
if (keepdim) {
output = Tensor::make(output_layout, comp_node);
dnn_op.exec_with_ws(dnn_input, output);
} else {
// megdnn::exec expects the layout with the reduced axis kept, so keep a copy before squeezing
auto output_layout_keepdim = output_layout;
resolve_keepdim();
output = Tensor::make(output_layout, comp_node);
dnn_op.exec_with_ws(dnn_input, output->dnn_tensor(output_layout_keepdim));
}
return {out};
}
auto dnn_ten = inputs[0]->dnn_tensor();
dnn_ten.layout = src;
inp_tensornds.push_back(dnn_ten);
auto wk_size = dnn_op.op->get_workspace_in_bytes(src, layout);
auto dnn_wk = dnn_op.create_workspace(wk_size);
TensorLayout ori_layout = layout;
if (!keepdim && src.ndim > 1) {
layout.remove_axis_inplace(axis);
layout.init_contiguous_stride();
}
auto out = Tensor::make(layout, comp_node);
auto dnn_out = out->dnn_tensor();
dnn_out.layout = ori_layout;
dnn_op.op->exec(inp_tensornds[0], dnn_out, dnn_wk);
return {out};
return {output};
}
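When the input is empty but the output is not, the code above fills the output with the identity element of the reduction instead of launching a kernel: 0 for SUM and 1 for PRODUCT, while the remaining modes reject empty inputs. A standalone illustration of that choice of identity:

    #include <cstdio>
    #include <functional>
    #include <numeric>
    #include <vector>
    int main() {
        std::vector<float> empty;
        // a sum over nothing is 0, a product over nothing is 1
        float sum = std::accumulate(empty.begin(), empty.end(), 0.f);
        float prod = std::accumulate(empty.begin(), empty.end(), 1.f, std::multiplies<float>());
        std::printf("sum=%g prod=%g\n", sum, prod);
        // mean/min/max/sum_sqr are rejected for empty inputs above instead
        return 0;
    }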
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
@@ -184,16 +175,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
auto axis = op_def.param().axis;
auto keepdim = op_def.keepdim;
size_t size = inputs.size();
SmallVector<LogicalTensorDesc> dests(size);
mgb_assert(inputs.size() > 0);
auto&& comp_node = inputs[0].comp_node;
auto&& input_layout = inputs[0].layout;
for (size_t i = 0; i < size; i++) {
if (inputs[i].layout.ndim == 0) {
return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}},
false};
}
}
if (size > 1) {
if (inputs.size() == 2) {
// fallback to proxy_graph, matters on backward
auto [output_descs, validated] =
proxy_graph_detail::infer_output_attrs_fallible(def, inputs);
if (!inputs[1].value.empty()) {
@@ -203,30 +190,37 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
return {output_descs, validated};
}
mgb_assert(inputs.size() == 1);
if (axis == INT_MAX) {
// reduce to scalar
// ignore keepdim because ndim is 1
auto&& dtype = input_layout.dtype;
auto&& format = input_layout.format;
auto output_layout = TensorLayout{{1}, dtype, format};
return {{{output_layout, comp_node}}, true};
}
if (input_layout.ndim == 0) {
// shape incomplete
return {{{TensorLayout(input_layout.dtype, input_layout.format), comp_node}},
false};
}
if (axis < 0) {
axis = inputs[0].layout.ndim + axis;
axis = input_layout.ndim + axis;
}
mgb_assert(axis >= 0 && axis < input_layout.ndim);
if (axis == INT_MAX || inputs[0].layout.ndim == 1) {
TensorLayout layout{inputs[0].layout.dtype};
layout.shape[0] = 1;
layout.ndim = 1;
dests[0].layout = layout;
dests[0].comp_node = inputs[0].comp_node;
TensorLayout output_layout = input_layout;
bool remove_axis = (!keepdim) && input_layout.ndim > 1;
if (remove_axis) {
output_layout.remove_axis_inplace(axis);
} else {
for (size_t i = 0; i < size; ++i) {
dests[i].comp_node = inputs[i].comp_node;
dests[i].layout = inputs[i].layout;
if (!keepdim && dests[i].layout.ndim > 1) {
dests[i].layout.remove_axis_inplace(axis);
} else {
dests[i].layout.shape[axis] = 1;
}
dests[i].layout.init_contiguous_stride();
}
output_layout.shape[axis] = 1;
}
return {dests, true};
output_layout.init_contiguous_stride();
return {{{output_layout, comp_node}}, true};
}
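The layout arithmetic encodes the keepdim rule: the reduced axis keeps extent 1 when keepdim is set or when the input is 1-d, and is removed otherwise. The same rule on plain shapes, as a standalone sketch:

    #include <cstddef>
    #include <cstdio>
    #include <vector>
    std::vector<std::size_t> reduced_shape(
            std::vector<std::size_t> shape, std::size_t axis, bool keepdim) {
        if (!keepdim && shape.size() > 1) {
            shape.erase(shape.begin() + axis);  // drop the reduced axis
        } else {
            shape[axis] = 1;  // keep it with extent 1 (a 1-d input never drops to 0-d)
        }
        return shape;
    }
    int main() {
        for (bool keepdim : {true, false}) {
            auto out = reduced_shape({2, 3, 4}, 1, keepdim);
            std::printf("keepdim=%d:", int(keepdim));
            for (auto d : out) {
                std::printf(" %zu", d);
            }
            std::printf("\n");  // prints 2 1 4 with keepdim, 2 4 without
        }
        return 0;
    }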
SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
@@ -230,31 +230,19 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor(
}
auto dest_layout = TensorLayout({nr_elems}, dtype);
auto output = Tensor::make(dest_layout, comp_node);
auto caller = DnnOprCaller<megdnn::ParamPackConcat>(comp_node);
size_t srcs_size = sizeof(void*) * nr_inputs;
void** srcs_raw_ptr = (void**)comp_node.alloc_host(srcs_size);
std::shared_ptr<dt_byte> srcs_ptr = {
(dt_byte*)srcs_raw_ptr,
[comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }};
// FIXME: add param to ParamPackConcat
DnnOprCaller<megdnn::ParamPackConcat> caller{comp_node};
HostTensorStorage srcs_storage{comp_node};
srcs_storage.ensure_size(sizeof(void*) * nr_inputs);
TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()};
size_t ws_size;
{
TensorShapeArray src_shapes;
for (size_t i = 0; i < nr_inputs; ++i) {
src_shapes.push_back(inputs[i]->shape());
}
ws_size = caller.op->get_workspace_in_bytes(
src_shapes, inputs.back()->shape(), TensorShape{});
}
HostTensorND srcs_tensornd;
srcs_tensornd.reset(srcs_storage, srcs_layout);
auto* srcs_raw_ptr = reinterpret_cast<void**>(srcs_storage.ptr());
for (size_t i = 0; i < nr_inputs; ++i) {
srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr();
}
HostTensorStorage srcs_storage;
srcs_storage.reset(comp_node, srcs_size, srcs_ptr);
caller.op->exec(
{srcs_raw_ptr, srcs_layout}, inputs.back()->dnn_tensor(),
output->dnn_tensor(), caller.create_workspace(ws_size));
async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage));
caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output);
async_release(srcs_tensornd);
return {output};
}
@@ -33,69 +33,39 @@ VarNodeArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
auto&& op = static_cast<const ROIAlign&>(def);
if (inputs[0].layout.is_empty() || inputs[1].layout.is_empty()) {
return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node},
{TensorLayout(dtype::Int32()), inputs[1].comp_node}},
false};
}
SmallVector<LogicalTensorDesc> descs(2u);
size_t n = inputs[1].layout[0];
size_t c = inputs[0].layout[1];
descs[0].layout = TensorLayout(
{n, c, op.pooled_height, op.pooled_width}, inputs[0].layout.dtype);
descs[0].layout.init_contiguous_stride();
descs[0].comp_node = inputs[0].comp_node;
descs[1].layout =
TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32());
descs[1].layout.init_contiguous_stride();
descs[1].comp_node = descs[0].comp_node;
return {descs, true};
auto&& op = def.cast_final_safe<ROIAlign>();
DnnOprHelper<megdnn::ROIAlign> dnn_opr(op.param());
auto cn = inputs[0].comp_node;
auto&& [out_layout, ind_layout] =
dnn_opr.deduce_layouts<2>(inputs[0].layout, inputs[1].layout);
bool validated = out_layout.ndim == 0 && ind_layout.ndim == 0;
return {{{out_layout, cn}, {ind_layout, cn}}, validated};
}
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& op = static_cast<const ROIAlign&>(def);
CompNode cn = inputs[0]->comp_node();
auto&& op = def.cast_final_safe<ROIAlign>();
auto cn = inputs[0]->comp_node();
TensorLayout out_layout = output_descs[0].layout;
TensorLayout ind_layout = output_descs[1].layout;
if (!validated) {
size_t n = inputs[1]->layout()[0];
size_t c = inputs[0]->layout()[1];
out_layout = TensorLayout(
{n, c, op.pooled_height, op.pooled_width}, inputs[0]->layout().dtype);
out_layout.init_contiguous_stride();
ind_layout =
TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32());
ind_layout.init_contiguous_stride();
}
DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn, op.param());
auto&& [out_layout, ind_layout] = [&]() -> std::array<TensorLayout, 2> {
if (validated) {
return {output_descs[0].layout, output_descs[1].layout};
} else {
return dnn_opr.deduce_layouts<2>(inputs[0]->layout(), inputs[1]->layout());
}
}();
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
DeviceTensorND inds =
BlobManager::inst()->alloc_workspace_with_defrag(cn, ind_layout);
auto out = Tensor::make(out_layout, cn);
auto ind = Tensor::make(ind_layout, cn);
if (out_layout.is_empty() || ind_layout.is_empty()) {
return {Tensor::make(out), Tensor::make(inds)};
return {out, ind};
}
DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn);
dnn_opr.op->param() = op.param();
size_t sz = dnn_opr.op->get_workspace_in_bytes(
inputs[0]->layout(), inputs[1]->layout(), out_layout, ind_layout);
auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(
inputs[0]->dnn_tensor(), inputs[1]->dnn_tensor(), out.as_megdnn(),
inds.as_megdnn(), dnn_wk);
return {Tensor::make(out), Tensor::make(inds)};
dnn_opr.exec_with_ws(inputs[0], inputs[1], out, ind);
return {out, ind};
}
SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
@@ -570,11 +570,17 @@ bool Tensor::empty() {
return !m_blob->size();
}
megdnn::TensorND Tensor::dnn_tensor() {
DnnTensorND Tensor::dnn_tensor() {
mgb_assert(m_blob, "uninitialized tensor.");
mgb_assert(m_layout.ndim, "dnn don't support scalar");
return DnnTensorND{m_layout, m_blob->storage(), m_offset};
}
DnnTensorND Tensor::dnn_tensor(TensorShape new_shape) {
mgb_assert(m_blob, "uninitialized tensor.");
return DnnTensorND{m_layout.reshape(new_shape), m_blob->storage(), m_offset};
}
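The new overload reshapes the layout but reuses the same blob storage; the Reduce rewrite above relies on it to flatten a contiguous input when reducing to a scalar. A one-line usage fragment (assumes this class and a contiguous tensor):

    auto flat = tensor->dnn_tensor({tensor->shape().total_nr_elems()});  // 1-d view over the same blob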
void Tensor::fetch_value() {
MGB_LOCK_GUARD(m_value_mtx);
if (m_value.empty()) {
@@ -334,9 +334,16 @@ public:
size_t j = 0;
for (auto&& var : m_opr->output()) {
if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) {
TensorLayout layout{var->shape(), var->dtype(), var->format()};
var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag(
var->comp_node(), layout);
auto comp_node = var->comp_node();
auto dtype = var->dtype();
auto&& shape = var->shape();
size_t size = dtype.size(shape.total_nr_elems());
mgb_assert(
var->format().is_default(), "non default format for workspace");
auto raw_storage = Blob::make(comp_node, size)->storage();
DeviceTensorStorage storage;
storage.reset(comp_node, size, raw_storage);
var->m_dev_tensor.reset(storage, {shape, dtype});
} else {
mgb_assert(j < outputs.size());
auto&& tensor = outputs[j];
@@ -1,6 +1,7 @@
#pragma once
#include "megbrain/imperative/physical_tensor.h"
#include "megbrain/imperative/utils/helper.h"
namespace mgb {
namespace imperative {
@@ -15,13 +16,19 @@ public:
virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0;
virtual bool try_alloc_direct(OwnedBlob* blob, size_t size) {
try {
alloc_direct(blob, size);
return true;
} catch (MemAllocError&) {
return false;
}
}
virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0;
virtual void set_allocator(allocator_t allocator) = 0;
virtual DeviceTensorND alloc_workspace_with_defrag(
CompNode cn, TensorLayout& layout) = 0;
virtual void register_blob(OwnedBlob* blob) = 0;
virtual void unregister_blob(OwnedBlob* blob) = 0;
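try_alloc_direct converts the MemAllocError failure path of alloc_direct into a boolean, so callers can fall back without writing their own try/catch. A sketch of such a caller (hypothetical, not a call site from this patch; it assumes these are BlobManager methods):

    // hypothetical helper: try the fast path first, defragment only on failure
    void alloc_or_defrag(BlobManager& mgr, OwnedBlob* blob, size_t size) {
        if (!mgr.try_alloc_direct(blob, size)) {
            mgr.alloc_with_defrag(blob, size);
        }
    }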
@@ -89,24 +89,19 @@ using EventPtr = std::unique_ptr<CompNode::Event, EventDeleter>;
class Tensor;
using TensorPtr = std::shared_ptr<Tensor>;
/*
use DnnTensorND to hold a reference to the workspace
allocated by the blob manager, preventing invalidation
*/
struct DnnTensorND : megdnn::TensorND {
private:
std::shared_ptr<dt_byte> m_reference;
// hold an extra reference to prevent defrag while in use
std::shared_ptr<dt_byte> reference;
public:
DnnTensorND(TensorLayout& layout_, std::shared_ptr<dt_byte> ref_ptr, size_t offset)
: megdnn::TensorND(layout_, {ref_ptr.get(), offset}) {
m_reference = ref_ptr;
DnnTensorND(
const TensorLayout& layout_, std::shared_ptr<dt_byte> ptr, size_t offset)
: megdnn::TensorND(layout_, {ptr.get(), offset}) {
reference = std::move(ptr);
}
};
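DnnTensorND exists so that a megdnn view pins the storage it points into, even if the blob would otherwise be freed or defragmented while a kernel is still queued; the mechanism is just the shared_ptr member. A standalone analogue of the idea (C++17):

    #include <cstdio>
    #include <memory>
    struct PinnedView {
        float* ptr;
        std::shared_ptr<float[]> keep_alive;  // the extra reference pins the buffer
    };
    int main() {
        PinnedView view{};
        {
            std::shared_ptr<float[]> buffer(new float[4]{1, 2, 3, 4});
            view = PinnedView{buffer.get(), buffer};
        }  // buffer leaves scope here, but the view still holds a reference
        std::printf("%g\n", view.ptr[2]);  // still valid: prints 3
        return 0;
    }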
class Tensor : public NonCopyableObj {
public:
Tensor() = default;
Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0,
const HostTensorND& hv = {});
Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {})
@@ -154,7 +149,9 @@ public:
void assign_from_dev_tensor(DeviceTensorND);
megdnn::TensorND dnn_tensor();
DnnTensorND dnn_tensor();
DnnTensorND dnn_tensor(TensorShape new_shape);
static TensorPtr make_scalar(DTypeScalar value, CompNode cn);
@@ -3,6 +3,7 @@
#include <iomanip>
#include <memory>
#include <mutex>
#include <optional>
#include <sstream>
#include "megbrain/utils/metahelper.h"
@@ -14,11 +15,28 @@ namespace imperative {
template <typename T = std::function<void()>>
class CleanupGuard : public NonCopyableObj {
private:
T m_callback;
std::optional<T> m_callback;
public:
CleanupGuard() = default;
explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {}
~CleanupGuard() { m_callback(); }
~CleanupGuard() { reset(); }
CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) {
rhs.m_callback.reset();
}
CleanupGuard& operator=(CleanupGuard&& rhs) {
swap(m_callback, rhs.m_callback);
rhs.reset();
return *this;
}
public:
void reset() {
if (m_callback) {
(*m_callback)();
m_callback.reset();
}
}
};
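With the callback held in std::optional, the guard gains a default constructor, a moved-from state that will not fire the callback twice, and an explicit reset() that runs the cleanup early. A short usage fragment (assumes this header, not a full program):

    CleanupGuard guard{[] { std::puts("cleanup"); }};
    auto moved = std::move(guard);  // guard is emptied; only moved owns the callback now
    moved.reset();                  // runs the cleanup once, right here; the destructors are then no-ops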
inline std::string quoted(std::string str) {
@@ -33,6 +51,19 @@ inline std::string quoted(std::string str) {
std::call_once(_once_flag, [&] { __VA_ARGS__; }); \
} while (false)
template <typename T>
struct is_small_vector {
static constexpr bool value = false;
};
template <typename T>
struct is_small_vector<SmallVector<T>> {
static constexpr bool value = true;
};
template <typename T>
static constexpr bool is_small_vector_v = is_small_vector<T>::value;
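is_small_vector_v is a detection trait over megdnn's SmallVector, presumably used to branch on whether an argument is a list of tensors. A standalone restatement of the pattern, with std::vector standing in for SmallVector:

    #include <cstdio>
    #include <type_traits>
    #include <vector>
    template <typename T>
    struct is_std_vector : std::false_type {};
    template <typename T>
    struct is_std_vector<std::vector<T>> : std::true_type {};
    template <typename T>
    constexpr bool is_std_vector_v = is_std_vector<T>::value;
    template <typename T>
    void describe(const T&) {
        if constexpr (is_std_vector_v<T>) {
            std::puts("vector-like argument");
        } else {
            std::puts("single argument");
        }
    }
    int main() {
        describe(std::vector<int>{1, 2, 3});  // vector-like argument
        describe(42);                         // single argument
        return 0;
    }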
} // namespace imperative
} // namespace mgb
@@ -6,4 +6,10 @@ namespace mgb::imperative {
std::string demangle(std::string mangled);
template <typename T>
const char* demangled_typename() {
static auto name = demangle(typeid(T).name());
return name.c_str();
}
} // namespace mgb::imperative
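demangled_typename caches the demangled name per instantiation in a function-local static, so repeated calls are cheap and the returned pointer stays valid for the lifetime of the program. A hypothetical use in an error message (fragment, assumes this header):

    template <typename TOp>
    [[noreturn]] void throw_unsupported() {
        // hypothetical helper, not part of this patch
        mgb_throw(MegBrainError, "unsupported op: %s", demangled_typename<TOp>());
    }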
@@ -314,7 +314,8 @@ void CondTake::init_output_static_infer_desc() {
auto dtype = input(0)->dtype();
TensorLayout ily(iv.val[0].shape(), dtype);
dest.ndim = 1;
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily);
TensorLayout mly(iv.val[0].shape(), dtype::Int32());
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily, mly);
return true;
};
owner_graph()->static_infer_manager().register_shape_infer(
@@ -548,9 +549,9 @@ void CheckNonFinite::init_output_static_infer_desc() {
auto infer_wk = [this](TensorShape& dest, const InpVal& inp) {
dest.ndim = 1;
megdnn::TensorNDArray inp_arr(input().size());
SmallVector<megdnn::TensorLayout> inp_arr(input().size());
for (size_t i = 0; i < input().size(); ++i) {
inp_arr[i] = {NULL, {inp.val.at(i).shape(), input(0)->dtype()}};
inp_arr[i] = {inp.val.at(i).shape(), input(0)->dtype()};
}
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(
inp_arr, {output(input().size() + 1)->shape(),
@@ -1447,11 +1447,8 @@ void ParamPackConcat::init_output_static_infer_desc() {
auto infer_wk = [this](TensorShape& dest, const InpVal& inp) {
TensorShapeArray shapes;
auto vals = inp.val;
shapes.reserve(vals.size() - 1);
for (size_t i = 0; i < vals.size() - 1; i++) {
shapes.push_back(vals[i].shape());
}
dest = {m_opr->get_workspace_in_bytes(shapes, vals.back().shape(), dest)};
size_t nr_params = vals.size() - 1;
dest = {m_opr->get_workspace_in_bytes({nr_params}, vals.back().shape(), dest)};
return true;
};
mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out});
@@ -970,8 +970,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
if (!policy.algo.valid())
continue;
size_t workspace_needed = get_workspace_size_bytes(policy);
if (m_inputs != nullptr)
if (m_inputs == nullptr) {
workspace_needed += data_size;
}
if (workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue;
@@ -18,7 +18,8 @@ failed_files = Manager().list()
def process_file(file, clang_format, write):
source = open(file, "r").read()
original_source = open(file, "r").read()
source = original_source
source = re.sub(r"MGB_DEFINE(?P<r>([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g<r>{", source)
source, count = re.subn(r"(?<!#define )MGB_DEFINE(.*) +\\", r"class MGB_DEFINE\1{\\", source)
@@ -38,7 +39,7 @@ def process_file(file, clang_format, write):
result = re.sub(r"class MGB_DEFINE(.*){( *)\\", r"MGB_DEFINE\1\2 \\", result)
result = re.sub(r"class MGB_DEFINE((.|\n)*?){", r"MGB_DEFINE\1// {", result)
if write:
if write and original_source != result:
with tempfile.NamedTemporaryFile(
dir=os.path.dirname(file), delete=False
) as tmp_file: