GitOrigin-RevId: 402cba209a
HuaHua404-patch-4
@@ -397,7 +397,8 @@ public: | |||
OutputDType infer_dtype(DType data, DType mask); | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0; | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& data, const TensorLayout& mask) = 0; | |||
virtual Output exec( | |||
_megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, | |||
@@ -512,7 +513,8 @@ public: | |||
virtual void exec( | |||
_megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayoutArray& srcs, TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayoutArray& srcs, const TensorLayout& dst) = 0; | |||
@@ -596,7 +598,7 @@ public: | |||
_megdnn_workspace workspace) = 0; | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorShapeArray& srcs, const TensorShape& offsets, | |||
const TensorShape& srcs, const TensorShape& offsets, | |||
const TensorShape& dst) = 0; | |||
}; | |||
@@ -1145,7 +1147,7 @@ protected: | |||
/*! | |||
* \return axis on dst used by indexer (i.e. ExecInfo::idx_axis) | |||
*/ | |||
static size_t deduce_layout_fwd( | |||
MGE_WIN_DECLSPEC_FUC static size_t deduce_layout_fwd( | |||
const TensorLayout& data, const IndexDescLayoutOnly& index, | |||
TensorLayout& dst); | |||
@@ -1362,9 +1364,10 @@ class CheckNonFinite : public OperatorBase { | |||
public: | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorNDArray& srcs, const TensorLayout& dst) = 0; | |||
const TensorLayoutArray& srcs, const TensorLayout& dst) = 0; | |||
void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayoutArray& srcs, TensorLayout& dst); | |||
virtual void exec( | |||
_megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, | |||
@@ -1420,7 +1423,7 @@ public: | |||
} | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& dst) = 0; | |||
void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( | |||
const TensorLayout& src, TensorLayout& dst, const Param& p); | |||
@@ -1464,7 +1467,7 @@ public: | |||
const TensorLayout& m_t, const TensorLayout& v_t, | |||
const TensorLayout& new_param) = 0; | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& m_t_1, const TensorLayout& v_t_1, | |||
const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t, | |||
TensorLayout& v_t, TensorLayout& new_param); | |||
@@ -27,7 +27,8 @@ public: | |||
_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, | |||
_megdnn_workspace workspace) = 0; | |||
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C); | |||
void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0; | |||
@@ -64,7 +65,8 @@ public: | |||
_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, | |||
_megdnn_workspace workspace) = 0; | |||
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C); | |||
void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0; | |||
@@ -224,9 +224,9 @@ public: | |||
const TensorLayout& src_layout, _megdnn_tensor_in filter, | |||
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_dtype(DType src, DType filter, DType& dst); | |||
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType src, DType filter, DType& dst); | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst); | |||
/** | |||
@@ -300,7 +300,7 @@ public: | |||
const TensorLayout& grad) = 0; | |||
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType filter, DType diff, DType& grad); | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); | |||
static Algorithm::OprType get_opr_type() { | |||
@@ -378,6 +378,12 @@ public: | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) = 0; | |||
MGE_WIN_DECLSPEC_FUC void exec( | |||
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, | |||
_megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
exec(src, filter, bias, z, dst, nullptr, workspace); | |||
} | |||
/** | |||
* \brief execute weight preprocessing: read weights from filter and bias, | |||
* write to preprocessed_filter after preprocessing. | |||
@@ -390,8 +396,9 @@ public: | |||
_megdnn_tensor_in bias, const TensorLayout& z_layout, | |||
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst); | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_dtype( | |||
DType src, DType filter, DType bias, DType z, DType& dst); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, TensorLayout& dst); | |||
@@ -775,7 +782,7 @@ protected: | |||
void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); | |||
public: | |||
MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( | |||
static void deduce_layout_impl( | |||
const TensorLayout& src, const Param& param, TensorLayout& dst); | |||
}; | |||
@@ -791,7 +798,7 @@ public: | |||
virtual void exec( | |||
_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& dst) = 0; | |||
@@ -1253,7 +1260,7 @@ public: | |||
virtual void exec( | |||
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
@@ -1281,18 +1288,16 @@ public: | |||
* \param[in] diff (n, oc, od, oh, ow) | |||
* \param[out] grad (n, ic, id, ih, iw) | |||
*/ | |||
MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( | |||
static void deduce_layout_impl( | |||
const TensorLayout& filter, const TensorLayout& diff, const Param& param, | |||
TensorLayout& grad); | |||
virtual void exec( | |||
_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) = 0; | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad) = 0; | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); | |||
static Algorithm::OprType get_opr_type() { | |||
@@ -1472,7 +1477,7 @@ public: | |||
virtual void exec( | |||
_megdnn_tensor_in src, _megdnn_tensor_in rois, _megdnn_tensor_out dst, | |||
_megdnn_tensor_out index, _megdnn_workspace workspace) = 0; | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& src, const TensorLayout& rois, TensorLayout& dst, | |||
TensorLayout& index); | |||
virtual size_t get_workspace_in_bytes( | |||
@@ -1963,7 +1968,7 @@ public: | |||
_megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, | |||
_megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_layout( | |||
MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
const TensorLayout& data, const TensorLayout& weight, | |||
const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, | |||
TensorLayout& rstd); | |||
@@ -7,7 +7,11 @@ void CheckNonFinite::check_exec( | |||
const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes) { | |||
megdnn_assert_contiguous(dst.layout); | |||
megdnn_assert(srcs.size() > 0); | |||
auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst.layout); | |||
TensorLayoutArray src_layouts; | |||
for (auto&& src : srcs) { | |||
src_layouts.push_back(src.layout); | |||
} | |||
auto required_workspace_in_bytes = get_workspace_in_bytes(src_layouts, dst.layout); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
} | |||
@@ -11,7 +11,7 @@ size_t CondTake::check_exec_get_size( | |||
mask.TensorShape::to_string().c_str()); | |||
megdnn_assert(data.is_physical_contiguous() && mask.is_physical_contiguous()); | |||
megdnn_assert(m_param.eps > 0, "eps must be positive; got: %g", m_param.eps); | |||
megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data)); | |||
megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data, mask)); | |||
return data.total_nr_elems(); | |||
} | |||
@@ -7,9 +7,9 @@ void LAMBUpdate::deduce_layout( | |||
const TensorLayout& m_t_1, const TensorLayout& v_t_1, | |||
const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t, | |||
TensorLayout& v_t, TensorLayout& new_param) { | |||
m_t = TensorLayout(m_t_1); | |||
v_t = TensorLayout(v_t_1); | |||
new_param = TensorLayout(lamb_param); | |||
m_t = m_t_1; | |||
v_t = v_t_1; | |||
new_param = lamb_param; | |||
MEGDNN_MARK_USED_VAR(grad); | |||
} | |||
@@ -26,14 +26,14 @@ size_t CheckNonFiniteImpl::_get_workspace_in_bytes() { | |||
} | |||
size_t CheckNonFiniteImpl::get_workspace_in_bytes( | |||
const TensorNDArray& srcs, const TensorLayout&) { | |||
const TensorLayoutArray& srcs, const TensorLayout&) { | |||
m_size = 0; | |||
for (const auto& src : srcs) { | |||
m_size += DIVUP(src.layout.total_nr_elems(), total_nr_elems_max); | |||
m_size += DIVUP(src.total_nr_elems(), total_nr_elems_max); | |||
} | |||
if (srcs.begin()->layout.dtype == dtype::Float32()) { | |||
if (srcs.begin()->dtype == dtype::Float32()) { | |||
return _get_workspace_in_bytes<dt_float32>(); | |||
} else if (srcs.begin()->layout.dtype == dtype::Float16()) { | |||
} else if (srcs.begin()->dtype == dtype::Float16()) { | |||
return _get_workspace_in_bytes<dt_float16>(); | |||
} else { | |||
megdnn_log_warn("only support fp16 and fp32, fallback to fp32"); | |||
@@ -19,7 +19,7 @@ public: | |||
using CheckNonFinite::CheckNonFinite; | |||
size_t get_workspace_in_bytes( | |||
const TensorNDArray& srcs, const TensorLayout& dst) override; | |||
const TensorLayoutArray& srcs, const TensorLayout& dst) override; | |||
bool is_thread_safe() const override { return true; } | |||
@@ -20,7 +20,8 @@ WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) { | |||
handle()->alignment_requirement()}; | |||
} | |||
size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { | |||
size_t CondTakeImpl::get_workspace_in_bytes( | |||
const TensorLayout& data, const TensorLayout&) { | |||
return make_bundle(data.total_nr_elems()).total_size_in_bytes(); | |||
} | |||
@@ -15,7 +15,8 @@ public: | |||
_megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, | |||
DynOutMallocPolicyCall malloc_policy) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& data) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorLayout& data, const TensorLayout& mask) override; | |||
}; | |||
} // namespace cuda | |||
@@ -6,8 +6,8 @@ namespace megdnn { | |||
namespace cuda { | |||
size_t ParamPackConcatImpl::get_workspace_in_bytes( | |||
const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) { | |||
return sizeof(size_t) * srcs.size(); | |||
const TensorShape&, const TensorShape& offsets, const TensorShape&) { | |||
return sizeof(size_t) * (offsets.shape[0] / 2); | |||
} | |||
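Note on the new size: this overload no longer receives the TensorShapeArray, so the parameter count has to be recovered from offsets. Assuming offsets stores a (begin, end) pair per packed tensor, which is what the division by 2 implies, packing 3 tensors gives offsets.shape[0] == 6 and a workspace of 3 * sizeof(size_t) = 24 bytes on a 64-bit build, matching the count the old srcs.size() expression produced.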
template <typename T> | |||
@@ -12,7 +12,7 @@ public: | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorShapeArray& srcs, const TensorShape& table, | |||
const TensorShape& srcs, const TensorShape& table, | |||
const TensorShape& dst) override; | |||
private: | |||
@@ -13,7 +13,8 @@ public: | |||
bool is_thread_safe() const override { return true; } | |||
size_t get_workspace_in_bytes(const TensorNDArray&, const TensorLayout&) override { | |||
size_t get_workspace_in_bytes( | |||
const TensorLayoutArray&, const TensorLayout&) override { | |||
m_size = 0; | |||
return _get_workspace_in_bytes(); | |||
} | |||
@@ -38,7 +38,8 @@ void copy_data( | |||
} // anonymous namespace | |||
size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { | |||
size_t CondTakeImpl::get_workspace_in_bytes( | |||
const TensorLayout& data, const TensorLayout&) { | |||
return (data.total_nr_elems() + 1) * sizeof(dt_int32); | |||
} | |||
@@ -11,7 +11,8 @@ class CondTakeImpl : public CondTake { | |||
public: | |||
using CondTake::CondTake; | |||
size_t get_workspace_in_bytes(const TensorLayout& data) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorLayout& data, const TensorLayout& mask) override; | |||
Output exec( | |||
_megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, | |||
@@ -11,7 +11,7 @@ public: | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorShapeArray&, const TensorShape&, const TensorShape&) override { | |||
const TensorShape&, const TensorShape&, const TensorShape&) override { | |||
return 0; | |||
} | |||
}; | |||
@@ -7,8 +7,8 @@ namespace megdnn { | |||
namespace rocm { | |||
size_t ParamPackConcatImpl::get_workspace_in_bytes( | |||
const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) { | |||
return sizeof(size_t) * srcs.size(); | |||
const TensorShape&, const TensorShape& offsets, const TensorShape&) { | |||
return sizeof(size_t) * (offsets.shape[0] / 2); | |||
} | |||
template <typename T> | |||
@@ -12,7 +12,7 @@ public: | |||
_megdnn_workspace workspace) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorShapeArray& srcs, const TensorShape& table, | |||
const TensorShape& srcs, const TensorShape& table, | |||
const TensorShape& dst) override; | |||
private: | |||
@@ -71,7 +71,7 @@ CondTakeTestcase::Result CondTakeTestcase::run(CondTake* opr) { | |||
opr->param() = m_param; | |||
DynOutMallocPolicyImpl malloc_policy(handle); | |||
auto workspace_size = opr->get_workspace_in_bytes(data->layout); | |||
auto workspace_size = opr->get_workspace_in_bytes(data->layout, mask->layout); | |||
auto workspace_ptr = malloc_policy.alloc_workspace(workspace_size, nullptr); | |||
auto result = opr->exec( | |||
*data, *mask, {(dt_byte*)workspace_ptr, workspace_size}, &malloc_policy); | |||
@@ -205,9 +205,14 @@ struct OprProxy<CheckNonFinite> { | |||
auto inps = tensors; | |||
inps.pop_back(); | |||
TensorLayoutArray inp_layouts(inps.size()); | |||
std::transform( | |||
inps.begin(), inps.end(), inp_layouts.begin(), | |||
[](const TensorND& tensor) { return tensor.layout; }); | |||
WorkspaceWrapper W( | |||
opr->handle(), | |||
opr->get_workspace_in_bytes(inps, tensors.back().layout)); | |||
opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout)); | |||
opr->exec(inps, tensors.back(), W.workspace()); | |||
} | |||
}; | |||
@@ -95,7 +95,7 @@ void test_param_pack_concat( | |||
test::WorkspaceWrapper workspace( | |||
handle, | |||
concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size})); | |||
concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size})); | |||
TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); | |||
concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); | |||
@@ -97,7 +97,7 @@ void test_param_pack_concat( | |||
test::WorkspaceWrapper workspace( | |||
handle, | |||
concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size})); | |||
concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size})); | |||
TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); | |||
concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); | |||
@@ -9,11 +9,8 @@ BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) { | |||
blob = in_blob; | |||
DeviceTensorStorage d_storage; | |||
d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage); | |||
h_storage = HostTensorStorage(blob->m_comp_node); | |||
h_storage.ensure_size(blob->m_size); | |||
h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size); | |||
} | |||
@@ -30,65 +27,36 @@ void BlobManagerImpl::unregister_blob(OwnedBlob* blob) { | |||
} | |||
void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) { | |||
if (custom_allocator) { | |||
blob->m_storage = custom_allocator(blob->m_comp_node, size); | |||
if (m_custom_allocator) { | |||
blob->m_storage = m_custom_allocator(blob->m_comp_node, size); | |||
return; | |||
} | |||
// try alloc | |||
MGB_TRY { alloc_direct(blob, size); } | |||
// if fail, try defrag, alloc again | |||
MGB_CATCH(MemAllocError&, { | |||
if (!try_alloc_direct(blob, size)) { | |||
mgb_log_warn("memory allocation failed for blob; try defragmenting"); | |||
defrag(blob->m_comp_node); | |||
alloc_direct(blob, size); | |||
}); | |||
} | |||
} | |||
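try_alloc_direct, used above and again in ChannelImpl later in this diff, is not defined in this section. A minimal sketch of what such a helper presumably looks like, reusing the MGB_TRY/MGB_CATCH pattern the old inline code used (its exact signature and placement on BlobManager are assumptions):

    // hypothetical sketch only; the real definition lives outside this diff
    bool BlobManager::try_alloc_direct(OwnedBlob* blob, size_t size) {
        MGB_TRY { alloc_direct(blob, size); }
        MGB_CATCH(MemAllocError&, { return false; });  // report failure to the caller
        return true;                                   // storage was allocated
    }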
void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) { | |||
DeviceTensorStorage storage(blob->m_comp_node); | |||
mgb_assert(blob->m_comp_node.valid()); | |||
DeviceTensorStorage storage(blob->m_comp_node); | |||
storage.ensure_size(size); | |||
blob->m_storage = storage.raw_storage(); | |||
} | |||
DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag( | |||
CompNode cn, TensorLayout& layout) { | |||
DeviceTensorND dev_tensor; | |||
if (custom_allocator) { | |||
DeviceTensorStorage storage(cn); | |||
size_t sz = layout.dtype.size(layout.total_nr_elems()); | |||
storage.reset(cn, sz, custom_allocator(cn, sz)); | |||
dev_tensor.reset(storage, layout); | |||
return dev_tensor; | |||
} | |||
MGB_TRY { dev_tensor = alloc_workspace(cn, layout); } | |||
MGB_CATCH(MemAllocError&, { | |||
mgb_log_warn("memory allocation failed for workspace; try defragmenting"); | |||
defrag(cn); | |||
dev_tensor = alloc_workspace(cn, layout); | |||
}); | |||
return dev_tensor; | |||
}; | |||
DeviceTensorND BlobManagerImpl::alloc_workspace(CompNode cn, TensorLayout layout) { | |||
DeviceTensorStorage storage(cn); | |||
storage.ensure_size(layout.dtype.size(layout.total_nr_elems())); | |||
DeviceTensorND dev_tensor; | |||
dev_tensor.reset(storage, layout); | |||
return dev_tensor; | |||
} | |||
void BlobManagerImpl::set_allocator(allocator_t allocator) { | |||
custom_allocator = allocator; | |||
m_custom_allocator = allocator; | |||
} | |||
void BlobManagerImpl::defrag(const CompNode& cn) { | |||
BlobSetWithMux* blobs_set_ptr; | |||
{ | |||
auto& blobs_set_ptr = ([&]() -> auto& { | |||
MGB_LOCK_GUARD(m_mtx); | |||
blobs_set_ptr = &m_comp2blobs_map[cn]; | |||
} | |||
MGB_LOCK_GUARD(blobs_set_ptr->mtx); | |||
return m_comp2blobs_map[cn]; | |||
})(); | |||
MGB_LOCK_GUARD(blobs_set_ptr.mtx); | |||
std::vector<BlobData> blob_data_arrary; | |||
std::set<Blob::RawStorage> storage_set; | |||
@@ -96,7 +64,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) { | |||
size_t tot_sz = 0; | |||
// copy to HostTensorStorage, and release | |||
for (auto i : blobs_set_ptr->blobs_set) { | |||
for (auto i : blobs_set_ptr.blobs_set) { | |||
// skip if blob do not have m_storage | |||
if (!i->m_storage) | |||
continue; | |||
@@ -153,9 +121,6 @@ struct BlobManagerStub : BlobManager { | |||
void alloc_with_defrag(OwnedBlob* blob, size_t size) { | |||
mgb_assert(0, "prohibited after global variable destruction"); | |||
}; | |||
DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) { | |||
mgb_assert(0, "prohibited after global variable destruction"); | |||
}; | |||
void register_blob(OwnedBlob* blob) { | |||
mgb_assert(0, "prohibited after global variable destruction"); | |||
}; | |||
@@ -163,7 +128,7 @@ struct BlobManagerStub : BlobManager { | |||
void defrag(const CompNode& cn) { | |||
mgb_assert(0, "prohibited after global variable destruction"); | |||
}; | |||
virtual void set_allocator(allocator_t allocator) { | |||
void set_allocator(allocator_t allocator) { | |||
mgb_assert(0, "prohibited after global variable destruction"); | |||
}; | |||
}; | |||
@@ -27,27 +27,21 @@ class BlobManagerImpl final : public BlobManager { | |||
std::mutex m_mtx; | |||
CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map; | |||
void defrag(const CompNode& cn) override; | |||
BlobManager::allocator_t m_custom_allocator; | |||
void alloc_direct(OwnedBlob* blob, size_t size) override; | |||
DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout); | |||
BlobManager::allocator_t custom_allocator; | |||
public: | |||
static BlobManager* inst(); | |||
void alloc_with_defrag(OwnedBlob* blob, size_t size) override; | |||
DeviceTensorND alloc_workspace_with_defrag( | |||
CompNode cn, TensorLayout& layout) override; | |||
void register_blob(OwnedBlob* blob) override; | |||
void unregister_blob(OwnedBlob* blob) override; | |||
void defrag(const CompNode& cn) override; | |||
void set_allocator(allocator_t allocator) override; | |||
}; | |||
@@ -1,79 +1,331 @@ | |||
#pragma once | |||
#include <optional> | |||
#include <type_traits> | |||
#include "algo_chooser.h" | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/imperative/blob_manager.h" | |||
#include "megbrain/imperative/physical_tensor.h" | |||
#include "megbrain/imperative/utils/helper.h" | |||
#include "megbrain/imperative/utils/platform.h" | |||
#include "megbrain/rdnn/management.h" | |||
using namespace megdnn; | |||
#include "megdnn/basic_types.h" | |||
namespace mgb { | |||
namespace imperative { | |||
/*! | |||
* \brief A struct for safely calling DNN oprs | |||
* In some cases, the op may be released before the execution completes; | |||
* this destructor prevents that. | |||
* \brief Helps deduce layout and dtype | |||
*/ | |||
template <typename Opr> | |||
struct DnnOprCaller { | |||
CompNode cn; | |||
DeviceTensorND dev_tensor; | |||
Workspace workspace; | |||
mgb::opr::intl::UniqPtrWithCN<Opr> op; | |||
class DnnOprDeducer { | |||
private: | |||
Opr* m_opr; | |||
DnnOprCaller(CompNode cn) : cn(cn), op(std::move(create_operator(cn))) {} | |||
public: | |||
DnnOprDeducer(Opr* opr) : m_opr(opr) { mgb_assert(opr); } | |||
static mgb::opr::intl::UniqPtrWithCN<Opr> create_operator(CompNode cn) { | |||
return mgb::opr::intl::create_megdnn_opr<Opr>(cn); | |||
// FIXME: maybe in-place style deduction works better | |||
template <typename... TArgs> | |||
TensorLayout deduce_layout(TArgs&&... args) { | |||
static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...)); | |||
TensorLayout output_layout; | |||
m_opr->deduce_layout(args..., output_layout); | |||
return output_layout; | |||
} | |||
Workspace create_workspace(size_t sz) { | |||
if (workspace.raw_ptr) { | |||
mgb_throw(MegBrainError, "workspace should not be applicated many times"); | |||
} | |||
if (sz) { | |||
TensorLayout layout({sz}, dtype::Byte()); | |||
dev_tensor = Tensor::make(layout, cn)->dev_tensor(); | |||
workspace = megdnn::Workspace( | |||
dev_tensor.raw_ptr(), dev_tensor.storage().size()); | |||
template <typename... TArgs> | |||
TensorLayout deduce_layout_fallible(TArgs&&... args) { | |||
static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...)); | |||
TensorLayout output_layout; | |||
bool success = (args.ndim * ...) > 0; | |||
if (success) { | |||
m_opr->deduce_layout(args..., output_layout); | |||
} else { | |||
m_opr->deduce_dtype(args.dtype..., output_layout.dtype); | |||
} | |||
return workspace; | |||
return output_layout; | |||
} | |||
~DnnOprCaller() { | |||
template <size_t nr_outputs, typename... TArgs> | |||
std::array<TensorLayout, nr_outputs> deduce_layouts(TArgs&&... args) { | |||
static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...)); | |||
std::array<TensorLayout, nr_outputs> layouts; | |||
std::apply( | |||
[&](auto&&... outputs) { m_opr->deduce_layout(args..., outputs...); }, | |||
layouts); | |||
return layouts; | |||
} | |||
}; | |||
/*! | |||
* \brief Declare an abstract operator and initialize its param | |||
*/ | |||
template <typename Opr> | |||
class DnnOprStub { | |||
private: | |||
// TODO: make opr concrete | |||
std::aligned_storage_t<sizeof(Opr), alignof(Opr)> m_storage; | |||
using Param = typename Opr::Param; | |||
private: | |||
DnnOprStub() { new (¶m()) Param(); } | |||
public: | |||
DnnOprStub(const Param& param) { this->param() = param; } | |||
// NOTE: undefined behavior in principle; no Opr object is ever constructed here, only its Param | |||
Opr& opr() { return *reinterpret_cast<Opr*>(&m_storage); } | |||
auto& param() { return opr().param(); } | |||
auto& param() const { return opr().param(); } | |||
~DnnOprStub() { param().~Param(); } | |||
}; | |||
/*! | |||
* \brief Deduce layout without creating a concrete opr | |||
*/ | |||
template <typename Opr> | |||
class DnnOprHelper : public DnnOprStub<Opr>, public DnnOprDeducer<Opr> { | |||
private: | |||
using Stub = DnnOprStub<Opr>; | |||
using Deducer = DnnOprDeducer<Opr>; | |||
public: | |||
DnnOprHelper(const typename Opr::Param& param) | |||
: Stub(param), Deducer(&Stub::opr()) {} | |||
}; | |||
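A usage sketch of DnnOprHelper (mirroring the convolution shape inference further down in this diff); conv_param, data_layout and filter_layout are placeholder names:

    // deduce an output layout with no handle or comp node involved
    DnnOprHelper<megdnn::ConvolutionForward> helper(conv_param);
    TensorLayout dst = helper.deduce_layout(data_layout, filter_layout);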
// hold a concrete operator in given comp_node | |||
template <typename Opr> | |||
class DnnOprHolder { | |||
private: | |||
CompNode m_comp_node; | |||
opr::intl::UniqPtrWithCN<Opr> m_opr = | |||
opr::intl::create_megdnn_opr<Opr>(m_comp_node); | |||
public: | |||
DnnOprHolder(CompNode comp_node) : m_comp_node(comp_node) {} | |||
auto& op() { return m_opr; } | |||
auto comp_node() { return m_comp_node; } | |||
auto& param() { return m_opr->param(); } | |||
auto& param() const { return m_opr->param(); } | |||
~DnnOprHolder() { | |||
using DT = CompNode::DeviceType; | |||
if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) { | |||
CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( | |||
[p = op.release()] { delete p; }); | |||
if (m_comp_node.device_type() == DT::CPU && | |||
m_comp_node != CompNode::default_cpu()) { | |||
CompNodeEnv::from_comp_node(m_comp_node) | |||
.cpu_env() | |||
.dispatch([p = m_opr.release()] { delete p; }); | |||
} | |||
} | |||
}; | |||
/*! | |||
* \brief Prevent binary bloat | |||
*/ | |||
class DnnOprCallerBase { | |||
protected: | |||
static auto&& get_layout(const megdnn::TensorND& tensor) { return tensor.layout; } | |||
static auto get_layout(const megdnn::TensorNDArray& tensors) { | |||
SmallVector<TensorLayout> layouts; | |||
for (auto&& tensor : tensors) { | |||
layouts.push_back(tensor.layout); | |||
} | |||
return layouts; | |||
} | |||
}; | |||
template <size_t OSize> | |||
class MegDNNDynOutMallocImpl final : public megdnn::DynOutMallocPolicy { | |||
using Output = std::array<TensorPtr, OSize>; | |||
/*! | |||
* \brief A struct for safely calling DNN oprs | |||
* | |||
* In some cases, the op may be released before the execution completes; | |||
* this destructor prevents that. | |||
*/ | |||
template <typename Opr> | |||
class DnnOprCaller final : public DnnOprHolder<Opr>, | |||
public DnnOprDeducer<Opr>, | |||
public DnnOprCallerBase { | |||
private: | |||
using Holder = DnnOprHolder<Opr>; | |||
using Deducer = DnnOprDeducer<Opr>; | |||
using Base = DnnOprCallerBase; | |||
std::optional<DnnTensorND> m_workspace; | |||
std::optional<megdnn::param::ExecutionPolicy> m_policy; | |||
CompNode m_cn; | |||
Output m_out; | |||
megdnn::Workspace create_workspace(size_t sz) { | |||
mgb_assert( | |||
!m_workspace, "workspace asked more than once by op: %s", | |||
demangled_typename<Opr>()); | |||
dt_byte* ptr = nullptr; | |||
if (sz) { | |||
TensorLayout layout({sz}, dtype::Byte()); | |||
m_workspace.emplace( | |||
Tensor::make(layout, Holder::comp_node())->dnn_tensor()); | |||
ptr = reinterpret_cast<dt_byte*>(m_workspace->raw_ptr()); | |||
} | |||
return {ptr, sz}; | |||
} | |||
public: | |||
MegDNNDynOutMallocImpl(CompNode cn) : m_cn{cn} {} | |||
megdnn::TensorND alloc_output( | |||
size_t id, DType dtype, const TensorShape& shape, | |||
void* user_data) override { | |||
TensorLayout m_layout(shape, dtype); | |||
m_out[id] = Tensor::make(m_layout, m_cn); | |||
return m_out[id]->dev_tensor().as_megdnn(); | |||
using Param = typename Opr::Param; | |||
DnnOprCaller(CompNode cn) : Holder(cn), Deducer(Holder::op().get()) {} | |||
DnnOprCaller(CompNode cn, const Param& param) : DnnOprCaller(cn) { | |||
Holder::param() = param; | |||
} | |||
DnnOprCaller(CompNode cn, const Param& param, megdnn::param::ExecutionPolicy policy) | |||
: DnnOprCaller(cn, param) { | |||
m_policy.emplace(policy); | |||
} | |||
void* alloc_workspace(size_t sz, void* user_data) override { | |||
return m_cn.alloc_device(sz); | |||
/** | |||
* \brief Convert TensorPtr args to megdnn::TensorND and call f | |||
* | |||
*/ | |||
template <typename TFunctor, typename... TArgs> | |||
auto call_dnn(TFunctor&& f, TArgs&&... args) { | |||
std::optional<SmallVector<std::shared_ptr<dt_byte>>> input_ptrs; | |||
// recursive convert: | |||
// 1. TensorPtr to DnnTensorND (subclass of megdnn::TensorND) ; | |||
// 2. DeviceTensorND, HostTensorND to megdnn::TensorND ; | |||
// 3. SmallVector of above to SmallVector<megdnn::TensorND> . | |||
auto to_dnn = [&](auto&& arg, auto&& to_dnn) { | |||
using T = decltype(arg); | |||
if constexpr (std::is_convertible_v<T, TensorPtr>) { | |||
return arg->dnn_tensor(); | |||
} else if constexpr ( | |||
std::is_convertible_v<T, DeviceTensorND> || | |||
std::is_convertible_v<T, HostTensorND>) { | |||
return arg.as_megdnn(); | |||
} else if constexpr ( | |||
std::is_convertible_v<T, megdnn::TensorND> || | |||
std::is_convertible_v<T, SmallVector<megdnn::TensorND>>) { | |||
return std::forward<T>(arg); | |||
} else if constexpr (is_small_vector_v<std::decay_t<T>>) { | |||
using TItem = std::decay_t<decltype(to_dnn(arg[0], to_dnn))>; | |||
SmallVector<megdnn::TensorND> dnn_tensors; | |||
for (auto&& tensor : arg) { | |||
if constexpr (std::is_same_v<TItem, DnnTensorND>) { | |||
if (!input_ptrs) { | |||
input_ptrs.emplace(); | |||
} | |||
auto dnn_tensor = to_dnn(tensor, to_dnn); | |||
input_ptrs->push_back(std::move(dnn_tensor.reference)); | |||
dnn_tensors.push_back(std::move(dnn_tensor)); | |||
} else if constexpr (std::is_same_v<TItem, megdnn::TensorND>) { | |||
dnn_tensors.push_back(to_dnn(tensor, to_dnn)); | |||
} else { | |||
static_assert(!std::is_same_v<TItem, TItem>); | |||
} | |||
} | |||
return dnn_tensors; | |||
} else { | |||
static_assert(!std::is_same_v<T, T>); | |||
} | |||
}; | |||
return f(to_dnn(std::forward<TArgs>(args), to_dnn)...); | |||
} | |||
void free_workspace(void* ptr, void* user_data) override { m_cn.free_device(ptr); } | |||
// common execution (opr->exec(inputs..., outputs...)) | |||
template <typename... TArgs> | |||
void exec(TArgs&&... args) { | |||
call_dnn( | |||
[this](auto&&... args) { | |||
Holder::op()->exec(std::forward<decltype(args)>(args)...); | |||
}, | |||
std::forward<TArgs>(args)...); | |||
} | |||
// execute opr with fastrun algo selection | |||
// (opr->exec(inputs..., outputs..., create_ws(setup_algo(...)))) | |||
template <typename... TArgs> | |||
void exec_fastrun(TArgs&&... args) { | |||
call_dnn( | |||
[&](auto&&... args) { | |||
using FixedTensorLayouts = | |||
typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts; | |||
SmallVector<megdnn::TensorND> dnn_inputs = {args...}; | |||
mgb_assert(m_policy, "policy not set"); | |||
size_t workspace_size = setup_algo<Opr>( | |||
FixedTensorLayouts{args.layout...}, Holder::op().get(), 0, | |||
false, false, Holder::comp_node(), *m_policy, false, | |||
&dnn_inputs); | |||
Holder::op()->exec( | |||
std::forward<decltype(args)>(args)..., | |||
create_workspace(workspace_size)); | |||
}, | |||
std::forward<TArgs>(args)...); | |||
} | |||
// execute with fixed workspace | |||
// (opr->exec(input..., outputs..., create_ws(get_workspace_in_bytes(...)))) | |||
template <typename... TArgs> | |||
void exec_with_ws(TArgs&&... args) { | |||
call_dnn( | |||
[&](auto&&... args) { | |||
size_t workspace_size = | |||
Holder::op()->get_workspace_in_bytes(get_layout(args)...); | |||
Holder::op()->exec( | |||
std::forward<decltype(args)>(args)..., | |||
create_workspace(workspace_size)); | |||
}, | |||
std::forward<TArgs>(args)...); | |||
} | |||
TensorPtr at(size_t id) { return m_out[id]; } | |||
// execute dynamic out opr | |||
// (opr->exec(inputs..., outputs..., create_ws(get_workspace_in_bytes(...)), alloc)) | |||
template <size_t nr_out, typename... TArgs> | |||
auto exec_dynout(TArgs&&... args) { | |||
struct Alloc final : public megdnn::DynOutMallocPolicy { | |||
CompNode comp_node; | |||
std::array<TensorPtr, nr_out> output_tensors; | |||
std::array<std::optional<DnnTensorND>, nr_out> output_dnn_tensors; | |||
public: | |||
Alloc(CompNode comp_node) : comp_node(comp_node) {} | |||
megdnn::TensorND alloc_output( | |||
size_t id, DType dtype, const TensorShape& shape, | |||
void* user_data) override { | |||
TensorLayout layout(shape, dtype); | |||
output_tensors[id] = Tensor::make(layout, comp_node); | |||
output_dnn_tensors[id].emplace( | |||
output_tensors[id]->dnn_tensor()); // pin output | |||
return *output_dnn_tensors[id]; | |||
} | |||
void* alloc_workspace(size_t sz, void* user_data) override { | |||
mgb_assert(false); | |||
} | |||
void free_workspace(void* ptr, void* user_data) override { | |||
mgb_assert(false); | |||
} | |||
} alloc{Holder::comp_node()}; | |||
call_dnn( | |||
[&](auto&&... args) { | |||
size_t workspace_size = | |||
Holder::op()->get_workspace_in_bytes(get_layout(args)...); | |||
Holder::op()->exec( | |||
std::forward<decltype(args)>(args)..., | |||
create_workspace(workspace_size), &alloc); | |||
}, | |||
std::forward<TArgs>(args)...); | |||
return alloc.output_tensors; | |||
} | |||
}; | |||
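A usage sketch of the call styles above, based on the call sites that appear later in this diff; cn, the params and the tensors are placeholders:

    // fastrun execution: algo picked via setup_algo, workspace created internally
    DnnOprCaller<megdnn::Pooling> pool(cn, pool_param, megdnn::param::ExecutionPolicy{});
    pool.exec_fastrun(src, dst);

    // dynamic-output execution: outputs allocated through the DynOutMallocPolicy
    DnnOprCaller<megdnn::CondTake> take(cn, take_param);
    auto [vals, idxs] = take.exec_dynout<2>(data, mask);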
} // namespace imperative | |||
@@ -605,6 +605,7 @@ TensorInfo* ChannelImpl::alloc() { | |||
void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc&& desc) { | |||
m_valid_handle.insert(reinterpret_cast<Handle>(info)); | |||
MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name); | |||
mgb_assert(desc.comp_node.valid(), "comp_node invalid"); | |||
info->status = TensorInfo::Allocated; | |||
info->desc = std::move(desc); | |||
} | |||
@@ -831,6 +832,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) { | |||
output_descs.push_back(i->desc); | |||
} | |||
} else { | |||
// i may be null | |||
validated = false; | |||
} | |||
// Here std::move is REQUIRED for removing duplicated references. | |||
@@ -1064,17 +1066,16 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) { | |||
if (in_worker) { | |||
reserve_size(x->size()); | |||
} | |||
MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); } | |||
MGB_CATCH(MemAllocError&, { | |||
if (!BlobManager::inst()->try_alloc_direct(x, x->size())) { | |||
bool suc = false; | |||
if (in_worker) { | |||
while (!suc) { | |||
if (!auto_evict(1)) { | |||
break; | |||
} | |||
MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); } | |||
MGB_CATCH(MemAllocError&, { continue; }); | |||
suc = true; | |||
if (BlobManager::inst()->try_alloc_direct(x, x->size())) { | |||
suc = true; | |||
} | |||
} | |||
} | |||
if (!suc) { | |||
@@ -1086,9 +1087,11 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) { | |||
imperative_log_profile_begin("defrag"); | |||
BlobManager::inst()->defrag(x->comp_node()); | |||
imperative_log_profile_end("defrag"); | |||
BlobManager::inst()->alloc_direct(x, x->size()); | |||
mgb_assert( | |||
BlobManager::inst()->try_alloc_direct(x, x->size()), | |||
"allocation failed after defrag"); | |||
} | |||
}); | |||
} | |||
set_log_level(pre_level); | |||
} | |||
@@ -75,13 +75,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
auto&& pool = static_cast<const AdaptivePooling&>(def); | |||
auto&& pooling = def.cast_final_safe<AdaptivePooling>(); | |||
auto&& cn = inputs[0]->comp_node(); | |||
using TensorND = megdnn::TensorND; | |||
auto&& src_layout = inputs[0]->layout(); | |||
TensorLayout dst_layout = output_descs[0].layout; | |||
auto param_format = pool.format; | |||
TensorLayout dst_layout{inputs[0]->dtype()}; | |||
auto param_format = pooling.format; | |||
if (!validated) { | |||
dst_layout.ndim = src_layout.ndim; | |||
const dt_int32* oshp2d = nullptr; | |||
@@ -91,7 +90,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
tshp1n = inputs[1]->layout().total_nr_elems() == 1; | |||
oshp2d = tshp_nd->get_value().proxy_to_default_cpu().ptr<dt_int32>(); | |||
} else { | |||
oshp2d = pool.shape.data(); | |||
oshp2d = pooling.shape.data(); | |||
} | |||
if (param_format == opr::AdaptivePooling::Param::Format::NCHW) { | |||
dst_layout[0] = src_layout[0]; | |||
@@ -108,15 +107,17 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); | |||
} | |||
dst_layout.init_contiguous_stride(); | |||
} else { | |||
dst_layout = output_descs[0].layout; | |||
} | |||
size_t IH, IW, OH, OW; | |||
if (param_format == param::AdaptivePooling::Format::NCHW) { | |||
if (param_format == megdnn::param::AdaptivePooling::Format::NCHW) { | |||
IH = src_layout[2]; | |||
IW = src_layout[3]; | |||
OH = dst_layout[2]; | |||
OW = dst_layout[3]; | |||
} else if (param_format == param::AdaptivePooling::Format::NHWC) { | |||
} else if (param_format == megdnn::param::AdaptivePooling::Format::NHWC) { | |||
IH = src_layout[1]; | |||
IW = src_layout[2]; | |||
OH = dst_layout[1]; | |||
@@ -124,26 +125,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
} else { | |||
mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); | |||
} | |||
DnnOprCaller<megdnn::Pooling> dnn_opr(cn); | |||
auto&& param = dnn_opr.op->param(); | |||
param.mode = pool.mode; | |||
param.format = pool.format; | |||
// translate AdaptivePooling param into an equivalent Pooling param | |||
auto&& param = megdnn::Pooling::Param(); | |||
param.mode = pooling.mode; | |||
param.format = pooling.format; | |||
param.pad_h = param.pad_w = 0; | |||
param.stride_h = floor(IH / OH); | |||
param.stride_w = floor(IW / OW); | |||
param.stride_h = IH / OH; | |||
param.stride_w = IW / OW; | |||
param.window_h = IH - (OH - 1) * param.stride_h; | |||
param.window_w = IW - (OW - 1) * param.stride_w; | |||
TensorND src = inputs[0]->dnn_tensor(); | |||
DnnOprCaller<megdnn::Pooling> dnn_opr(cn, param, megdnn::param::ExecutionPolicy{}); | |||
auto src = inputs[0]; | |||
auto dst = Tensor::make(dst_layout, cn); | |||
size_t sz = setup_algo<megdnn::Pooling>( | |||
{src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||
::megdnn::param::ExecutionPolicy{}, false); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
dnn_opr.op->exec(src, dst->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_fastrun(inputs[0], dst); | |||
return {dst}; | |||
} | |||
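For intuition on the stride/window derivation above: with IH = IW = 7 and OH = OW = 3, stride = 7 / 3 = 2 (integer division) and window = 7 - (3 - 1) * 2 = 3, so the three windows cover columns [0,2], [2,4] and [4,6] and the whole input is visited. The dropped floor(IH / OH) was redundant because the operands are already integers.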
@@ -145,79 +145,44 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
auto&& op_def = def.cast_final_safe<BatchNorm>(); | |||
auto&& comp_node = inputs[0]->comp_node(); | |||
using TensorND = megdnn::TensorND; | |||
DnnOprCaller<megdnn::BN> dnn_opr(comp_node, op_def.param()); | |||
SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
for (size_t i = 0; i < inputs.size(); ++i) { | |||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
} | |||
DnnOprCaller<megdnn::BN> dnn_opr(comp_node); | |||
dnn_opr.op->param() = op_def.param(); | |||
TensorLayout src_layout = inputs[0]->layout(); | |||
TensorLayout scale_layout = inputs[1]->layout(); | |||
auto src_layout = inputs[0]->layout(); | |||
auto scale_layout = inputs[1]->layout(); | |||
bool empty_input = src_layout.is_empty(); | |||
size_t nr_inp = inputs.size(); | |||
size_t sz = 0, rsz = 0; | |||
TensorLayout r_layout({rsz}, dtype::Byte()); | |||
if (!empty_input) { | |||
sz = dnn_opr.op->get_workspace_in_bytes( | |||
src_layout, src_layout, src_layout, src_layout, src_layout, src_layout, | |||
src_layout, src_layout, src_layout); | |||
rsz = dnn_opr.op->get_reserve_in_bytes(src_layout); | |||
r_layout = TensorLayout({rsz}, dtype::Byte()); | |||
} | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
auto reserve = Tensor::make(r_layout, comp_node); | |||
// size_t ws_size = 0, reserve_size = 0; | |||
size_t reserve_size = | |||
empty_input ? (size_t)0 : dnn_opr.op()->get_reserve_in_bytes(src_layout); | |||
// alloc memory | |||
// alloc outputs | |||
auto y = Tensor::make(src_layout, comp_node); | |||
auto save_mean = Tensor::make(scale_layout, comp_node); | |||
auto save_variance = Tensor::make(scale_layout, comp_node); | |||
auto reserve = Tensor::make(TensorLayout{{reserve_size}, dtype::Byte()}, comp_node); | |||
if (op_def.fwd_mode == ::megdnn::param::BN::FwdMode::INFERENCE) { | |||
if (!empty_input) | |||
dnn_opr.op->exec( | |||
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||
inp_tensornds[3], inp_tensornds[4], save_mean->dnn_tensor(), | |||
save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), | |||
dnn_wk); | |||
if (!empty_input) { | |||
dnn_opr.exec_with_ws( | |||
inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], save_mean, | |||
save_variance, reserve, y); | |||
} | |||
return {inputs[3], inputs[4], reserve, y}; | |||
} else { | |||
if (nr_inp == 5) { | |||
auto mean = Tensor::make(scale_layout, comp_node); | |||
auto variance = Tensor::make(scale_layout, comp_node); | |||
megdnn::RefPtr src_ptr1( | |||
inp_tensornds[3].get_ref_ptr().get_ptr(), inputs[3]->offset()); | |||
megdnn::RefPtr dst_ptr1( | |||
mean->dev_tensor().storage().get_ref_ptr(), | |||
mean->dev_tensor().storage().offset(), false); | |||
comp_node.peer_copy_to_ref( | |||
comp_node, dst_ptr1, src_ptr1, scale_layout.span().high_byte); | |||
megdnn::RefPtr src_ptr2( | |||
inp_tensornds[4].get_ref_ptr().get_ptr(), inputs[4]->offset()); | |||
megdnn::RefPtr dst_ptr2( | |||
variance->dev_tensor().storage().get_ref_ptr(), | |||
variance->dev_tensor().storage().offset(), false); | |||
comp_node.peer_copy_to_ref( | |||
comp_node, dst_ptr2, src_ptr2, scale_layout.span().high_byte); | |||
if (!empty_input) | |||
dnn_opr.op->exec( | |||
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||
mean->dnn_tensor(), variance->dnn_tensor(), | |||
save_mean->dnn_tensor(), save_variance->dnn_tensor(), | |||
reserve->dnn_tensor(), y->dnn_tensor(), dnn_wk); | |||
// FIXME | |||
mean->dev_tensor().copy_from(inputs[3]->dev_tensor()); | |||
variance->dev_tensor().copy_from(inputs[4]->dev_tensor()); | |||
if (!empty_input) { | |||
dnn_opr.exec_with_ws( | |||
inputs[0], inputs[1], inputs[2], mean, variance, save_mean, | |||
save_variance, reserve, y); | |||
} | |||
return {mean, variance, save_mean, save_variance, reserve, y}; | |||
} | |||
@@ -227,11 +192,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
auto variance = Tensor::make(m_layout, comp_node); | |||
if (!empty_input) { | |||
dnn_opr.op->exec( | |||
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||
mean->dnn_tensor(), variance->dnn_tensor(), save_mean->dnn_tensor(), | |||
save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), | |||
dnn_wk); | |||
dnn_opr.exec_with_ws( | |||
inputs[0], inputs[1], inputs[2], mean, variance, save_mean, | |||
save_variance, reserve, y); | |||
} | |||
return {save_mean, save_variance, reserve, y}; | |||
@@ -28,33 +28,26 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
auto&& inp = inputs[0]; | |||
auto&& msk = inputs[1]; | |||
SmallVector<TensorPtr> out; | |||
mgb_assert( | |||
inp->layout().eq_shape(msk->layout()), | |||
"input shape does not match mask shape"); | |||
mgb_assert( | |||
msk->get_value().dtype().enumv() == DTypeEnum::Bool, | |||
"mask dtype must be bool"); | |||
MegDNNDynOutMallocImpl<2> policy{inp->comp_node()}; | |||
if (inp->layout().is_empty()) { | |||
// empty tensor | |||
policy.alloc_output(0, inp->layout().dtype, {0}, nullptr); | |||
policy.alloc_output(1, dtype::Int32(), {0}, nullptr); | |||
return { | |||
Tensor::make(TensorLayout{{0}, inp->dtype()}, inp->comp_node()), | |||
Tensor::make(TensorLayout{{0}, dtype::Int32()}, inp->comp_node()), | |||
}; | |||
} else { | |||
DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node()); | |||
dnn_op.op->param().val = 1; | |||
size_t sz = dnn_op.op->get_workspace_in_bytes(inp->layout()); | |||
auto dnn_workspace = dnn_op.create_workspace(sz); | |||
dnn_op.op->exec( | |||
inp->dev_tensor().as_megdnn(), msk->dev_tensor().as_megdnn(), | |||
dnn_workspace, &policy); | |||
// maybe we need to split CondTake | |||
megdnn::CondTake::Param param; | |||
param.val = 1; | |||
DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node(), param); | |||
auto&& [out0, out1] = dnn_op.exec_dynout<2>(inp, msk); | |||
return {out0, out1}; | |||
} | |||
out.push_back(policy.at(0)); | |||
out.push_back(policy.at(1)); | |||
return out; | |||
} | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
@@ -8,14 +8,7 @@ | |||
namespace mgb { | |||
namespace imperative { | |||
namespace { | |||
size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) { | |||
mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt); | |||
return (inp + 2 * pad - flt) / stride + 1; | |||
} | |||
namespace convolution { | |||
std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) { | |||
auto* node = &node_->cast_final_safe<opr::Convolution>(); | |||
@@ -29,131 +22,23 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
inputs[0], inputs[1], conv.param(), conv.policy(), config); | |||
} | |||
TensorLayout do_shape_infer( | |||
const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||
auto&& conv = static_cast<const Convolution&>(def); | |||
using Param = ::megdnn::param::Convolution; | |||
auto img_ndim = src_ndim - 2; | |||
mgb_assert( | |||
img_ndim == 2, | |||
"only 2D convolution is supported, and input should be 4-dim; " | |||
"got input dim = %zu", | |||
src_ndim); | |||
size_t group = 1; | |||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
if (conv.sparse == Param::Sparse::DENSE) { | |||
mgb_assert( | |||
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||
"bad filter ndim for dense convolution: " | |||
"spatial_ndim=%zu filter_ndim=%zu", | |||
img_ndim, filter.ndim); | |||
group = 1; | |||
flt_start = 0; | |||
} else { // Param::Sparse::GROUP | |||
mgb_assert( | |||
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||
"bad filter ndim for group convolution: " | |||
"spatial_ndim=%zu filter_ndim=%zu", | |||
img_ndim, filter.ndim); | |||
// grp, oc, ic, dims[] | |||
group = filter[0]; | |||
flt_start = 1; | |||
} | |||
uint32_t ic_block_size = 1, oc_block_size = 1; | |||
size_t src_or_dst_c_pos = 0; | |||
size_t src_or_dst_spatial_start = 0; | |||
if (conv.format == Param::Format::NCHW) { | |||
// filter should be (oc, ic, fh, fw) | |||
flt_spatial_start = 2; | |||
ocpg_pos = 0; | |||
icpg_pos = 1; | |||
src_or_dst_c_pos = 1; | |||
src_or_dst_spatial_start = 2; | |||
} else { // Param::Format::NHWC | |||
// filter should be (oc, fh, fw, ic) | |||
flt_spatial_start = 1; | |||
ocpg_pos = 0; | |||
icpg_pos = 3; | |||
src_or_dst_c_pos = 3; | |||
src_or_dst_spatial_start = 1; | |||
} | |||
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||
uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||
dilation[0] = conv.dilate_h; | |||
dilation[1] = conv.dilate_w; | |||
stride[0] = conv.stride_h; | |||
stride[1] = conv.stride_w; | |||
padding[0] = conv.pad_h; | |||
padding[1] = conv.pad_w; | |||
for (size_t i = 0; i < img_ndim; ++i) { | |||
mgb_assert( | |||
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||
dilation[i]); | |||
dilated_spatial[i] = | |||
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||
} | |||
mgb_assert( | |||
icpg * group == src[src_or_dst_c_pos], | |||
"group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||
"hint: weight may be changed by mistake\n", | |||
icpg * group, src[src_or_dst_c_pos]); | |||
TensorLayout dst{src.dtype}; | |||
dst.ndim = src_ndim; | |||
dst[0] = src[0]; | |||
dst[src_or_dst_c_pos] = ocpg * group; | |||
for (size_t i = 0; i < img_ndim; ++i) { | |||
dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||
src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||
padding[i]); | |||
} | |||
dst.init_contiguous_stride(); | |||
return dst; | |||
} | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
SmallVector<LogicalTensorDesc> dests(1); | |||
auto&& desc = dests[0]; | |||
desc.comp_node = inputs[0].comp_node; | |||
TensorLayout src = inputs[0].layout; | |||
TensorLayout filter = inputs[1].layout; | |||
size_t src_ndim = src.ndim; | |||
if (src_ndim == 0 || filter.ndim == 0) { | |||
desc.layout = TensorLayout{{}, src.dtype}; | |||
return {dests, false}; | |||
auto&& conv = def.cast_final_safe<Convolution>(); | |||
DnnOprHelper<megdnn::ConvolutionForward> dnn_opr(conv.param()); | |||
auto&& data = inputs[0].layout; | |||
auto&& filter = inputs[1].layout; | |||
TensorLayout output_layout{data.dtype}; | |||
if (data.ndim && filter.ndim) { | |||
// deduce_layout won't override existing dtype | |||
dnn_opr.opr().deduce_layout(data, filter, output_layout); | |||
} | |||
desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||
return {dests, true}; | |||
return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
// create megdnn opr | |||
auto&& conv = static_cast<const Convolution&>(def); | |||
CompNode cn = inputs[0]->comp_node(); | |||
TensorLayout out_layout = output_descs[0].layout; | |||
if (!validated) | |||
out_layout = do_shape_infer( | |||
def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||
inputs[1]->layout()); | |||
using TensorND = megdnn::TensorND; | |||
SmallVector<TensorND> inp_tensornds(inputs.size() + 2); | |||
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
inp_shapes[i] = inputs[i]->layout(); | |||
} | |||
oup_shapes[0] = out_layout; | |||
DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn); | |||
auto&& param = dnn_opr.op->param(); | |||
// Convolution::Param -> ConvBias::Param | |||
auto conv_bias_param_from_convolution(const Convolution& conv) { | |||
megdnn::ConvBias::Param param; | |||
param.pad_h = conv.pad_h; | |||
param.pad_w = conv.pad_w; | |||
param.stride_h = conv.stride_h; | |||
@@ -163,30 +48,37 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
param.sparse = conv.sparse; | |||
param.compute_mode = conv.compute_mode; | |||
param.format = conv.format; | |||
return param; | |||
} | |||
// shape infer | |||
TensorLayout empty_shp({0}, inputs[0]->dtype()); | |||
empty_shp.ndim = 0; | |||
auto empty_bias = Tensor::make(empty_shp, cn); | |||
inp_tensornds[2] = empty_bias->dnn_tensor(); | |||
inp_tensornds[3] = empty_bias->dnn_tensor(); | |||
size_t sz = setup_algo<megdnn::ConvBiasForward>( | |||
{inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, | |||
dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false, | |||
&inp_tensornds); | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
// create megdnn opr | |||
auto&& conv = def.cast_final_safe<Convolution>(); | |||
CompNode cn = inputs[0]->comp_node(); | |||
auto&& param = conv_bias_param_from_convolution(conv); | |||
DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn, param, conv.policy()); | |||
megdnn::TensorND empty_bias; | |||
empty_bias.layout.dtype = inputs[0]->dtype(); | |||
empty_bias.layout.ndim = 0; | |||
auto out_layout = [&] { | |||
if (validated) { | |||
return output_descs[0].layout; | |||
} else { | |||
TensorLayout out_layout{inputs[0]->dtype()}; | |||
dnn_opr.op()->deduce_layout( | |||
inputs[0]->layout(), inputs[1]->layout(), empty_bias.layout, | |||
empty_bias.layout, out_layout); | |||
return out_layout; | |||
} | |||
}(); | |||
// alloc memory | |||
auto out = Tensor::make(out_layout, cn); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
// execute | |||
dnn_opr.op->exec( | |||
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], inp_tensornds[3], | |||
out->dnn_tensor(), nullptr, dnn_wk); | |||
dnn_opr.exec_fastrun(inputs[0], inputs[1], empty_bias, empty_bias, out); | |||
return {out}; | |||
} | |||
@@ -243,155 +135,41 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
} | |||
} | |||
TensorLayout convbwd_do_shape_infer( | |||
const OpDef& def, size_t diff_ndim, TensorLayout filter, TensorLayout diff, | |||
CompNode cn) { | |||
auto&& bwd_conv = static_cast<const ConvolutionBackwardData&>(def); | |||
DnnOprCaller<megdnn::ConvolutionBackwardData> caller(cn); | |||
auto&& dnn_opr = caller.op; | |||
using Param = ::megdnn::param::Convolution; | |||
// using Param1 = ::megdnn::param::ConvolutionBackwardData; | |||
auto img_ndim = diff_ndim - 2; | |||
mgb_assert( | |||
img_ndim == 2, | |||
"only 2D convolution is supported, and input should be 4-dim; " | |||
"got input dim = %zu", | |||
diff_ndim); | |||
size_t group = 1; | |||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
if (bwd_conv.sparse == Param::Sparse::DENSE) { | |||
mgb_assert( | |||
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||
"bad filter ndim for dense convolution: " | |||
"spatial_ndim=%zu filter_ndim=%zu", | |||
img_ndim, filter.ndim); | |||
group = 1; | |||
flt_start = 0; | |||
} else { // Param::Sparse::GROUP | |||
mgb_assert( | |||
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||
"bad filter ndim for group convolution: " | |||
"spatial_ndim=%zu filter_ndim=%zu", | |||
img_ndim, filter.ndim); | |||
// grp, oc, ic, dims[] | |||
group = filter[0]; | |||
flt_start = 1; | |||
} | |||
uint32_t ic_block_size = 1, oc_block_size = 1; | |||
size_t src_or_dst_c_pos = 0; | |||
size_t src_or_dst_spatial_start = 0; | |||
if (bwd_conv.format == Param::Format::NCHW) { | |||
// filter should be (oc, ic, fh, fw) | |||
flt_spatial_start = 2; | |||
ocpg_pos = 0; | |||
icpg_pos = 1; | |||
src_or_dst_c_pos = 1; | |||
src_or_dst_spatial_start = 2; | |||
} else { // Param::Format::NHWC | |||
// filter should be (oc, fh, fw, ic) | |||
flt_spatial_start = 1; | |||
ocpg_pos = 0; | |||
icpg_pos = 3; | |||
src_or_dst_c_pos = 3; | |||
src_or_dst_spatial_start = 1; | |||
} | |||
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||
uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||
dilation[0] = bwd_conv.dilate_h; | |||
dilation[1] = bwd_conv.dilate_w; | |||
stride[0] = bwd_conv.stride_h; | |||
stride[1] = bwd_conv.stride_w; | |||
padding[0] = bwd_conv.pad_h; | |||
padding[1] = bwd_conv.pad_w; | |||
for (size_t i = 0; i < img_ndim; ++i) { | |||
mgb_assert( | |||
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||
dilation[i]); | |||
dilated_spatial[i] = | |||
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||
} | |||
mgb_assert( | |||
ocpg * group == diff[src_or_dst_c_pos], | |||
"group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||
"hint: weight may be changed by mistake\n", | |||
ocpg * group, diff[src_or_dst_c_pos]); | |||
auto deduce = [](size_t out, size_t filter, size_t stride, size_t pad) { | |||
auto i = (out - 1) * stride + filter; | |||
mgb_assert(i > pad * 2); | |||
return i - pad * 2; | |||
}; | |||
DType dst_dtype = bwd_conv.dtype; | |||
dnn_opr->deduce_dtype(filter.dtype, diff.dtype, dst_dtype); | |||
TensorLayout dst{dst_dtype}; | |||
dst.ndim = diff_ndim; | |||
dst[0] = diff[0]; | |||
dst[src_or_dst_c_pos] = icpg * group; | |||
for (size_t i = 0; i < img_ndim; ++i) { | |||
dst[i + src_or_dst_spatial_start] = | |||
deduce(diff[i + src_or_dst_spatial_start], dilated_spatial[i], | |||
stride[i], padding[i]); | |||
} | |||
dst.init_contiguous_stride(); | |||
return dst; | |||
} | |||
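The `deduce` lambda above encodes the usual transposed-convolution output size: dilate the filter, apply it in reverse, then strip the padding. A small standalone check of that arithmetic (values are illustrative, not taken from the patch):

```cpp
#include <cassert>
#include <cstdio>

// One spatial dimension of the ConvolutionBackwardData output, mirroring the
// `deduce` lambda: i = (diff - 1) * stride + dilated_filter, minus 2 * pad,
// where dilated_filter = (filter - 1) * dilation + 1.
static size_t deconv_out_size(
        size_t diff, size_t filter, size_t dilation, size_t stride, size_t pad) {
    size_t dilated_filter = (filter - 1) * dilation + 1;
    size_t i = (diff - 1) * stride + dilated_filter;
    assert(i > pad * 2);
    return i - pad * 2;
}

int main() {
    // diff extent 5, 3x3 filter, dilation 1, stride 2, padding 1 -> input extent 9
    std::printf("%zu\n", deconv_out_size(5, 3, 1, 2, 1));  // prints 9
    return 0;
}
```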
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
SmallVector<LogicalTensorDesc> dests(1); | |||
auto&& desc = dests[0]; | |||
desc.comp_node = inputs[0].comp_node; | |||
TensorLayout filter = inputs[0].layout; | |||
TensorLayout diff = inputs[1].layout; | |||
size_t diff_ndim = diff.ndim; | |||
if (diff_ndim == 0 || filter.ndim == 0) { | |||
desc.layout = TensorLayout{{}, diff.dtype}; | |||
return {dests, false}; | |||
auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||
DnnOprHelper<megdnn::ConvolutionBackwardData> dnn_opr(convbwd.param()); | |||
// force set dtype | |||
auto&& filter = inputs[0].layout; | |||
auto&& diff = inputs[1].layout; | |||
TensorLayout output_layout{convbwd.dtype}; | |||
if (filter.ndim && diff.ndim) { | |||
// deduce_layout won't override existing dtype | |||
dnn_opr.opr().deduce_layout(filter, diff, output_layout); | |||
} | |||
desc.layout = | |||
convbwd_do_shape_infer(def, diff_ndim, filter, diff, inputs[0].comp_node); | |||
return {dests, true}; | |||
return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
// create megdnn opr | |||
auto&& convbwd = static_cast<const ConvolutionBackwardData&>(def); | |||
auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||
CompNode cn = inputs[0]->comp_node(); | |||
TensorLayout out_layout = output_descs[0].layout; | |||
if (!validated) | |||
out_layout = convbwd_do_shape_infer( | |||
def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), | |||
cn); | |||
DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr( | |||
cn, convbwd.param(), convbwd.policy()); | |||
auto out_layout = [&] { | |||
if (validated) { | |||
return output_descs[0].layout; | |||
} else { | |||
TensorLayout out_layout{inputs[0]->dtype()}; | |||
dnn_opr.op()->deduce_layout( | |||
inputs[0]->layout(), inputs[1]->layout(), out_layout); | |||
return out_layout; | |||
} | |||
}(); | |||
auto out = Tensor::make(out_layout, cn); | |||
using TensorND = megdnn::TensorND; | |||
SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
inp_shapes[i] = inputs[i]->layout(); | |||
} | |||
oup_shapes[0] = out_layout; | |||
DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(cn); | |||
dnn_opr.op->param() = convbwd.param(); | |||
size_t sz = setup_algo<megdnn::ConvolutionBackwardData>( | |||
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||
false, cn, convbwd.policy(), false, &inp_tensornds); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
// execute | |||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||
return {out}; | |||
} | |||
@@ -415,149 +193,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); | |||
} | |||
TensorLayout do_shape_infer( | |||
const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||
auto&& conv = static_cast<const Convolution3D&>(def); | |||
using Param = ::megdnn::param::Convolution3D; | |||
auto img_ndim = src_ndim - 2; | |||
mgb_assert( | |||
img_ndim == 3, | |||
"only 3D convolution is supported, and input should be 5-dim; " | |||
"got input dim = %zu", | |||
src_ndim); | |||
size_t group = 1; | |||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
if (conv.sparse == Param::Sparse::DENSE) { | |||
mgb_assert( | |||
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||
"bad filter ndim for dense convolution: " | |||
"spatial_ndim=%zu filter_ndim=%zu", | |||
img_ndim, filter.ndim); | |||
group = 1; | |||
flt_start = 0; | |||
} else { // Param::Sparse::GROUP | |||
mgb_assert( | |||
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||
"bad filter ndim for group convolution: " | |||
"spatial_ndim=%zu filter_ndim=%zu", | |||
img_ndim, filter.ndim); | |||
// grp, oc, ic, dims[] | |||
group = filter[0]; | |||
flt_start = 1; | |||
} | |||
uint32_t ic_block_size = 1, oc_block_size = 1; | |||
size_t src_or_dst_c_pos = 0; | |||
size_t src_or_dst_spatial_start = 0; | |||
if (conv.format == Param::Format::NCDHW) { | |||
// filter should be (oc, ic, fd, fh, fw) | |||
flt_spatial_start = 2; | |||
ocpg_pos = 0; | |||
icpg_pos = 1; | |||
src_or_dst_c_pos = 1; | |||
src_or_dst_spatial_start = 2; | |||
} else { // Param::Format::NDHWC | |||
// filter should be (oc, fd, fh, fw, ic) | |||
flt_spatial_start = 1; | |||
ocpg_pos = 0; | |||
icpg_pos = 4; | |||
src_or_dst_c_pos = 4; | |||
src_or_dst_spatial_start = 1; | |||
} | |||
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||
uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3]; | |||
dilation[0] = conv.dilate_d; | |||
dilation[1] = conv.dilate_h; | |||
dilation[2] = conv.dilate_w; | |||
stride[0] = conv.stride_d; | |||
stride[1] = conv.stride_h; | |||
stride[2] = conv.stride_w; | |||
padding[0] = conv.pad_d; | |||
padding[1] = conv.pad_h; | |||
padding[2] = conv.pad_w; | |||
for (size_t i = 0; i < img_ndim; ++i) { | |||
mgb_assert( | |||
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||
dilation[i]); | |||
dilated_spatial[i] = | |||
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||
} | |||
mgb_assert( | |||
icpg * group == src[src_or_dst_c_pos], | |||
"group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||
"hint: weight may be changed by mistake\n", | |||
icpg * group, src[src_or_dst_c_pos]); | |||
TensorLayout dst{src.dtype}; | |||
dst.ndim = src_ndim; | |||
dst[0] = src[0]; | |||
dst[src_or_dst_c_pos] = ocpg * group; | |||
for (size_t i = 0; i < img_ndim; ++i) { | |||
dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||
src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||
padding[i]); | |||
} | |||
dst.init_contiguous_stride(); | |||
return dst; | |||
} | |||
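For the forward direction, `infer_conv_shape` is presumably the standard convolution output-size rule; a minimal sketch of that formula, under that assumption:

```cpp
#include <cstdio>

// Assumed behaviour of infer_conv_shape(): out = (in + 2*pad - dilated_filter) / stride + 1
// with integer (floor) division; dilated_filter is computed as in the loop above.
static size_t conv_out_size(size_t in, size_t dilated_filter, size_t stride, size_t pad) {
    return (in + 2 * pad - dilated_filter) / stride + 1;
}

int main() {
    // depth 16, dilated filter extent 3, stride 2, padding 1 -> output depth 8
    std::printf("%zu\n", conv_out_size(16, 3, 2, 1));  // prints 8
    return 0;
}
```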
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
SmallVector<LogicalTensorDesc> dests(1); | |||
auto&& desc = dests[0]; | |||
desc.comp_node = inputs[0].comp_node; | |||
auto&& conv = def.cast_final_safe<Convolution3D>(); | |||
TensorLayout src = inputs[0].layout; | |||
TensorLayout filter = inputs[1].layout; | |||
size_t src_ndim = src.ndim; | |||
if (src_ndim == 0 || filter.ndim == 0) { | |||
desc.layout = TensorLayout{{}, src.dtype}; | |||
return {dests, false}; | |||
if (src.ndim == 0 || filter.ndim == 0) { | |||
return {{{TensorLayout{src.dtype}, inputs[0].comp_node}}, false}; | |||
} | |||
desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||
return {dests, true}; | |||
DnnOprHelper<megdnn::Convolution3DForward> dnn_opr(conv.param()); | |||
auto output = dnn_opr.deduce_layout(src, filter); | |||
return {{{output, inputs[0].comp_node}}, false}; | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
// create megdnn opr | |||
auto&& conv = static_cast<const Convolution3D&>(def); | |||
TensorLayout out_layout = output_descs[0].layout; | |||
if (!validated) | |||
out_layout = do_shape_infer( | |||
def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||
inputs[1]->layout()); | |||
using TensorND = megdnn::TensorND; | |||
auto&& conv = def.cast_final_safe<Convolution3D>(); | |||
CompNode cn = inputs[0]->comp_node(); | |||
SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||
inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
inp_shapes[i] = inputs[i]->layout(); | |||
} | |||
oup_shapes[0] = out_layout; | |||
DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn); | |||
dnn_opr.op->param() = conv.param(); | |||
// shape infer | |||
size_t sz = setup_algo<megdnn::Convolution3D>( | |||
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||
false, cn, conv.policy(), false, &inp_tensornds); | |||
DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn, conv.param(), conv.policy()); | |||
auto out_layout = [&] { | |||
if (validated) { | |||
return output_descs[0].layout; | |||
} else { | |||
return dnn_opr.deduce_layout(inputs[0]->layout(), inputs[1]->layout()); | |||
} | |||
}(); | |||
// alloc memory | |||
auto out = Tensor::make(out_layout, cn); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
// execute | |||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||
return {out}; | |||
} | |||
@@ -579,51 +244,38 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
inputs.size() == 2, | |||
"inputs num of conv_transpose3d should be 2 but you give %zu", | |||
inputs.size()); | |||
auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | |||
auto&& weight = inputs[0]; | |||
auto&& diff = inputs[1]; | |||
auto& cn = weight.comp_node; | |||
if (weight.layout.ndim == 0 || diff.layout.ndim == 0) { | |||
return {{{TensorLayout{weight.layout.dtype}, cn, {}}}, false}; | |||
if (!(weight.layout.ndim && diff.layout.ndim)) { | |||
return {{{TensorLayout{weight.layout.dtype}, weight.comp_node}}, false}; | |||
} | |||
TensorLayout oup_layout; | |||
megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||
weight.layout, diff.layout, op_def.param(), oup_layout); | |||
return {{{oup_layout, cn, {}}}, true}; | |||
DnnOprHelper<megdnn::Convolution3DBackwardData> dnn_opr(op_def.param()); | |||
auto oup_layout = dnn_opr.deduce_layout(weight.layout, diff.layout); | |||
return {{{oup_layout, weight.comp_node}}, true}; | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | |||
auto&& conv = def.cast_final_safe<Convolution3DBackwardData>(); | |||
auto cn = inputs[0]->comp_node(); | |||
auto&& wlayout = inputs[0]->layout(); | |||
auto&& dlayout = inputs[1]->layout(); | |||
DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn); | |||
auto&& dnn_opr = caller.op; | |||
dnn_opr->param() = op_def.param(); | |||
DnnOprCaller<megdnn::Convolution3DBackwardData> dnn_op( | |||
cn, conv.param(), conv.policy()); | |||
TensorLayout& oup_layout = output_descs[0].layout; | |||
if (!validated) { | |||
megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||
wlayout, dlayout, op_def.param(), oup_layout); | |||
} | |||
auto oup_layout = [&] { | |||
if (validated) { | |||
return output_descs[0].layout; | |||
} else { | |||
return dnn_op.deduce_layout(wlayout, dlayout); | |||
} | |||
}(); | |||
auto oup = Tensor::make(oup_layout, cn); | |||
SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||
inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||
inp_tensornds[1] = inputs[1]->dnn_tensor(); | |||
size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>( | |||
{wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||
op_def.policy(), false, &inp_tensornds); | |||
auto dnn_wk = caller.create_workspace(wk_size); | |||
dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup->dnn_tensor(), dnn_wk); | |||
dnn_op.exec_fastrun(inputs[0], inputs[1], oup); | |||
return {oup}; | |||
} | |||
@@ -94,52 +94,44 @@ void apply_on_device_tensornd( | |||
mgb_assert( | |||
inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", | |||
trait.name, trait.arity, inputs.size()); | |||
DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node()); | |||
opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op); | |||
DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node(), {op_def.mode}); | |||
opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op()); | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
auto comp_node = inputs[0]->comp_node(); | |||
auto dtype = inputs[0]->dtype(); | |||
using Mode = Elemwise::Mode; | |||
using TensorND = megdnn::TensorND; | |||
auto&& op_def = def.cast_final_safe<Elemwise>(); | |||
SmallVector<TensorND> inp_tensornds; | |||
TensorShapeArray inp_shapes(inputs.size()); | |||
inp_tensornds.reserve(inputs.size()); | |||
TensorLayout layout{inputs[0]->layout().dtype}; | |||
bool is_empty = false; | |||
for (unsigned i = 0; i < inputs.size(); ++i) { | |||
if (inputs[i]->layout().is_empty()) { | |||
is_empty = true; | |||
} | |||
inp_tensornds.push_back(inputs[i]->dnn_tensor()); | |||
inp_shapes[i] = inputs[i]->layout(); | |||
auto mode = op_def.mode; | |||
TensorShapeArray input_shapes; | |||
input_shapes.reserve(inputs.size()); | |||
for (auto&& input : inputs) { | |||
input_shapes.push_back(input->shape()); | |||
} | |||
megdnn::Elemwise::deduce_shape(inp_shapes, layout); | |||
layout.init_contiguous_stride(); | |||
auto out = Tensor::make(layout, comp_node); | |||
if (is_empty) { | |||
return {out}; | |||
// deduce_shape is static and fast | |||
TensorLayout output_layout{dtype}; | |||
// TODO: deduce_layout directly | |||
megdnn::Elemwise::deduce_shape(input_shapes, output_layout); | |||
output_layout.init_contiguous_stride(); | |||
auto output = Tensor::make(output_layout, comp_node); | |||
if (output_layout.is_empty()) { | |||
return {output}; | |||
} | |||
DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node); | |||
dnn_opr.op->param() = op_def.param(); | |||
if (dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD3 || | |||
dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD4 || | |||
(inp_tensornds.size() && | |||
inp_tensornds[0].layout.dtype.category() == DTypeCategory::QUANTIZED)) { | |||
opr::Elemwise::perform_dnn( | |||
comp_node, out->dnn_tensor(), inp_tensornds, dnn_opr.op); | |||
DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node, op_def.param()); | |||
if (mode == Mode::FUSE_MUL_ADD3 || mode == Mode::FUSE_MUL_ADD4 || | |||
dtype.category() == DTypeCategory::QUANTIZED) { | |||
dnn_opr.call_dnn( | |||
[&](auto&& inputs, auto&& output) { | |||
opr::Elemwise::perform_dnn(comp_node, output, inputs, dnn_opr.op()); | |||
}, | |||
inputs, output); | |||
} else { | |||
dnn_opr.op->exec(inp_tensornds, out->dnn_tensor()); | |||
dnn_opr.exec(inputs, output); | |||
} | |||
return {out}; | |||
return {output}; | |||
} | |||
MGB_DEFINE_OPR_CLASS( | |||
@@ -179,7 +171,7 @@ protected: | |||
return ret; | |||
} | |||
void create_megdnn_opr() override { | |||
auto opr = DnnOprCaller<megdnn::Elemwise>::create_operator(comp_node()); | |||
auto opr = mgb::opr::intl::create_megdnn_opr<megdnn::Elemwise>(comp_node()); | |||
opr->param().mode = m_param.mode; | |||
set_megdnn_opr(std::move(opr)); | |||
} | |||
@@ -243,22 +235,19 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor( | |||
"This inplace modification may change the elements of other tensors. " | |||
"Fallback to non-inplace update."); | |||
DeviceTensorStorage storage; | |||
storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage()); | |||
storage = storage.sub(dest->offset()); | |||
DeviceTensorND dv; | |||
dv.reset(storage, dest->layout()); | |||
DeviceTensorND dv_new; | |||
dv_new.copy_from(dv); | |||
dest = Tensor::make(dv_new); | |||
auto dest_layout = inputs[0]->layout(); | |||
dest_layout.init_contiguous_stride(); | |||
auto new_dest = Tensor::make(dest_layout, inputs[0]->comp_node()); | |||
new_dest->dev_tensor().copy_from(dest->dev_tensor()); | |||
dest = new_dest; | |||
} | |||
auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { | |||
return *tensor->get_value().ptr<float>(); | |||
}; | |||
DnnOprCaller<megdnn::AddUpdate> caller{dest->comp_node()}; | |||
caller.op->param() = {tensor_to_scalar(alpha), tensor_to_scalar(beta)}; | |||
caller.op->exec(dest->dev_tensor().as_megdnn(), delta->dev_tensor().as_megdnn()); | |||
DnnOprCaller<megdnn::AddUpdate> caller{ | |||
dest->comp_node(), {tensor_to_scalar(alpha), tensor_to_scalar(beta)}}; | |||
caller.exec(dest, delta); | |||
// FIXME: inplace update host value | |||
return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())}; | |||
} | |||
@@ -67,10 +67,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
auto&& op = def.cast_final_safe<IndexingOneHot>(); | |||
auto&& inp = inputs[0]; | |||
auto&& index = inputs[1]; | |||
TensorLayout layout = inp->layout(); | |||
TensorLayout index_layout = index->layout(); | |||
DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node()); | |||
auto&& indexing_one_hot_param = dnn_op.op->param(); | |||
auto&& layout = inp->layout(); | |||
auto&& index_layout = index->layout(); | |||
int real_axis = static_cast<int>(op.axis); | |||
if (real_axis < 0) { | |||
real_axis += static_cast<int>(layout.ndim); | |||
@@ -79,16 +77,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
0 <= real_axis && real_axis < static_cast<int>(layout.ndim), | |||
"Dimension out of range (expected to be in range of [%d, %d], but got %d)", | |||
0, static_cast<int>(layout.ndim) - 1, op.axis); | |||
indexing_one_hot_param = real_axis; | |||
TensorLayout tlayout; | |||
dnn_op.op->deduce_layout(layout, index_layout, tlayout); | |||
TensorPtr out = Tensor::make(tlayout, inp->comp_node()); | |||
megdnn::TensorND in = inp->dnn_tensor(); | |||
megdnn::TensorND ind = index->dnn_tensor(); | |||
size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||
auto dnn_workspace = dnn_op.create_workspace(sz); | |||
dnn_op.op->exec(in, ind, out->dnn_tensor(), dnn_workspace); | |||
DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node(), real_axis); | |||
auto tlayout = dnn_op.deduce_layout(layout, index_layout); | |||
auto out = Tensor::make(tlayout, inp->comp_node()); | |||
dnn_op.exec_with_ws(inp, index, out); | |||
return {out}; | |||
} | |||
@@ -105,15 +97,14 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | |||
mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); | |||
auto comp_node = input_descs[0].comp_node; | |||
TensorLayout src = input_descs[0].layout, index = input_descs[1].layout; | |||
auto&& src = input_descs[0].layout; | |||
auto&& index = input_descs[1].layout; | |||
mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); | |||
if (!src.ndim) { | |||
return {{{{{}, src.dtype}, comp_node}}, false}; | |||
} | |||
mgb_assert(src.is_contiguous(), "src should be contiguous"); | |||
return {{input_descs[0]}, true}; | |||
return {{{src, comp_node}}, true}; | |||
} | |||
auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
@@ -136,25 +127,15 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
auto&& index = inputs[1]; | |||
auto&& sub = inputs[2]; | |||
TensorLayout layout = inp->layout(); | |||
TensorLayout index_layout = index->layout(); | |||
TensorLayout tlayout = sub->layout(); | |||
mgb_assert(layout.is_contiguous()); | |||
DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node()); | |||
auto&& indexing_one_hot_param = dnn_op.op->param(); | |||
int real_axis = static_cast<int>(op.axis); | |||
if (real_axis < 0) { | |||
real_axis += static_cast<int>(layout.ndim); | |||
} | |||
indexing_one_hot_param = real_axis; | |||
DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node(), real_axis); | |||
TensorPtr out = Tensor::make(layout, inp->comp_node()); | |||
out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); | |||
megdnn::TensorND in = inp->dnn_tensor(); | |||
megdnn::TensorND ind = index->dnn_tensor(); | |||
megdnn::TensorND su = sub->dnn_tensor(); | |||
size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||
auto dnn_workspace = dnn_op.create_workspace(sz); | |||
dnn_op.op->exec(out->dnn_tensor(), ind, su, dnn_workspace); | |||
dnn_op.exec_with_ws(out, index, sub); | |||
return {out}; | |||
} | |||
@@ -54,14 +54,15 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( | |||
TensorPtr megray_recv_tensor( | |||
std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout, | |||
CompNode cn, uint32_t rank_from) { | |||
DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, layout); | |||
auto out = Tensor::make(layout, cn); | |||
auto dnn_out = out->dnn_tensor(); | |||
auto megray_ctx = mgb::opr::get_megray_context(cn); | |||
size_t data_size = layout.total_nr_elems(); | |||
auto status = megray_comm->recv( | |||
out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||
dnn_out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||
rank_from, megray_ctx); | |||
mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); | |||
return Tensor::make(out); | |||
return out; | |||
} | |||
void megray_send_tensor( | |||
@@ -105,9 +106,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor_remote_send( | |||
mgb_assert(megray_comm != nullptr); | |||
megray_send_tensor(megray_comm, inputs[0], op.rank_to); | |||
TensorLayout layout({0}, inputs[0]->dtype()); | |||
DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag( | |||
inputs[0]->comp_node(), layout); | |||
return {Tensor::make(out)}; | |||
return {Tensor::make(layout, inputs[0]->comp_node())}; | |||
} | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv( | |||
@@ -21,14 +21,17 @@ SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | |||
mgb_assert(input_descs.size() == 4, "IndexingOneHot expects 4 inputs"); | |||
auto comp_node = input_descs[0].comp_node; | |||
auto comp_node1 = input_descs[1].comp_node; | |||
auto comp_node2 = input_descs[2].comp_node; | |||
TensorLayout m_t_1 = input_descs[0].layout, v_t_1 = input_descs[1].layout, | |||
lamb_param = input_descs[2].layout, grad = input_descs[3].layout; | |||
TensorLayout new_param = lamb_param, m_t = m_t_1, v_t = v_t_1; | |||
auto&& m_t_1 = input_descs[0].layout; | |||
auto&& v_t_1 = input_descs[1].layout; | |||
auto&& lamb_param = input_descs[2].layout; | |||
auto&& grad = input_descs[3].layout; | |||
MGB_MARK_USED_VAR(grad); | |||
auto&& new_param = lamb_param; | |||
auto&& m_t = m_t_1; | |||
auto&& v_t = v_t_1; | |||
return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; | |||
} | |||
@@ -46,23 +49,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
TensorLayout lamb_param_layout{lamb_param->layout()}; | |||
auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); | |||
auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); | |||
auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); | |||
DnnOprCaller<megdnn::LAMBUpdate> caller{lamb_param->comp_node()}; | |||
size_t sz = caller.op->get_workspace_in_bytes( | |||
m_t_1->layout(), v_t_1->layout(), lamb_param->layout(), grad->layout(), | |||
m_t->layout(), v_t->layout(), new_param->layout()); | |||
auto dnn_workspace = caller.create_workspace(sz); | |||
caller.op->param() = op.param(); | |||
caller.op->exec( | |||
m_t_1->dev_tensor().as_megdnn(), v_t_1->dev_tensor().as_megdnn(), | |||
lamb_param->dev_tensor().as_megdnn(), grad->dev_tensor().as_megdnn(), | |||
m_t->dnn_tensor(), v_t->dnn_tensor(), new_param->dnn_tensor(), | |||
dnn_workspace); | |||
DnnOprCaller<megdnn::LAMBUpdate> dnn_opr{lamb_param->comp_node(), op.param()}; | |||
dnn_opr.exec_with_ws(m_t_1, v_t_1, lamb_param, grad, m_t, v_t, new_param); | |||
return {m_t, v_t, new_param}; | |||
} | |||
@@ -29,11 +29,11 @@ cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& in | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
auto&& op_def = def.cast_final_safe<LayerNorm>(); | |||
auto&& layer_norm = def.cast_final_safe<LayerNorm>(); | |||
size_t nr_inp = inputs.size(); | |||
auto p = op_def.param(); | |||
auto affine = layer_norm.affine; | |||
mgb_assert( | |||
(nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine), | |||
(nr_inp == 3 && affine) || (nr_inp == 1 && !affine), | |||
"num of inputs of pooling should be 1 or 3 but you give %zu", | |||
inputs.size()); | |||
@@ -47,9 +47,9 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
false}; | |||
} | |||
TensorLayout oup_layout, mean_layout, rstd_layout; | |||
megdnn::LayerNorm::deduce_layout_fwd_impl( | |||
inp.layout, p, oup_layout, mean_layout, rstd_layout); | |||
DnnOprHelper<megdnn::LayerNorm> dnn_opr(layer_norm.param()); | |||
auto&& [oup_layout, mean_layout, rstd_layout] = | |||
dnn_opr.deduce_layouts<3>(inp.layout, TensorLayout{}, TensorLayout{}); | |||
return {{{oup_layout, inp_cn, {}}, | |||
{mean_layout, inp_cn, {}}, | |||
{rstd_layout, inp_cn, {}}}, | |||
@@ -69,32 +69,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
inputs.size()); | |||
auto cn = inputs[0]->comp_node(); | |||
DnnOprCaller<megdnn::LayerNorm> caller(cn); | |||
auto&& dnn_opr = caller.op; | |||
dnn_opr->param() = p; | |||
DnnOprCaller<megdnn::LayerNorm> caller(cn, op_def.param()); | |||
TensorLayout oup_layout, mean_layout, rstd_layout; | |||
megdnn::LayerNorm::deduce_layout_fwd_impl( | |||
inputs[0]->dnn_tensor().layout, p, oup_layout, mean_layout, rstd_layout); | |||
auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>( | |||
inputs[0]->layout(), TensorLayout{}, TensorLayout{}); | |||
auto out = Tensor::make(oup_layout, cn); | |||
auto mean = Tensor::make(mean_layout, cn); | |||
auto rstd = Tensor::make(rstd_layout, cn); | |||
auto wk_size = caller.op->get_workspace_in_bytes( | |||
inputs[0]->dnn_tensor().layout, | |||
p.affine ? inputs[1]->dnn_tensor().layout : TensorLayout(), | |||
p.affine ? inputs[2]->dnn_tensor().layout : TensorLayout(), oup_layout, | |||
mean_layout, rstd_layout); | |||
auto dnn_wk = caller.create_workspace(wk_size); | |||
caller.op->exec( | |||
inputs[0]->dnn_tensor(), | |||
p.affine ? inputs[1]->dnn_tensor() : megdnn::TensorND(), | |||
p.affine ? inputs[2]->dnn_tensor() : megdnn::TensorND(), out->dnn_tensor(), | |||
mean->dnn_tensor(), rstd->dnn_tensor(), dnn_wk); | |||
if (p.affine) { | |||
caller.exec_with_ws(inputs[0], inputs[1], inputs[2], out, mean, rstd); | |||
} else { | |||
megdnn::TensorND empty_dnn; | |||
caller.exec_with_ws(inputs[0], empty_dnn, empty_dnn, out, mean, rstd); | |||
} | |||
return {out, mean, rstd}; | |||
} | |||
@@ -105,4 +94,4 @@ OP_TRAIT_REG(LayerNorm, LayerNorm) | |||
.fallback(); | |||
} // namespace layer_norm | |||
} // namespace mgb::imperative | |||
@@ -24,7 +24,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
auto dim1 = matmul.dimA, dim2 = matmul.dimB; | |||
auto cn = inputs[0]->comp_node(); | |||
using Desc = opr::AxisAddRemove::AxisDesc; | |||
using IndexDesc = opr::Subtensor::IndexDesc; | |||
OperatorNodeConfig config{matmul.make_name(), cn}; | |||
@@ -104,9 +103,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
dim1 = dim2 = 2; | |||
} | |||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||
dnn_opr.op->param() = matmul.param(); | |||
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||
dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
if (dim1 == 0 || dim2 == 0) { | |||
return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | |||
@@ -143,8 +141,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | |||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn); | |||
dnn_opr.op->param() = matmul.param(); | |||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||
if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward | |||
for (size_t i = 1; i + 1 < layout1.ndim; ++i) { | |||
@@ -160,7 +157,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
} | |||
DType dst_dtype; | |||
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
// only matters when layout1 has dim 2 | |||
if (matmul.transposeA) | |||
@@ -229,13 +226,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
inp_tensornds[0].layout = layout_a; | |||
inp_tensornds[1].layout = layout_b; | |||
} | |||
size_t sz = setup_algo<megdnn::MatrixMul>( | |||
{layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||
matmul.policy(), false, &inp_tensornds); | |||
auto out = Tensor::make(dst_layout, cn); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||
return {out->sub(0, real_dst_layout)}; | |||
} | |||
@@ -266,7 +258,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
auto dim1 = matmul.dimA, dim2 = matmul.dimB; | |||
auto cn = inputs[0]->comp_node(); | |||
using Desc = opr::AxisAddRemove::AxisDesc; | |||
using IndexDesc = opr::Subtensor::IndexDesc; | |||
OperatorNodeConfig config{matmul.make_name(), cn}; | |||
@@ -343,9 +334,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
DType dst_dtype; | |||
DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||
dnn_opr.op->param() = matmul.param(); | |||
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||
dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
if (dim1 == 0 || dim2 == 0) { | |||
return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | |||
@@ -386,10 +376,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | |||
size_t dim1 = layout1.ndim, dim2 = layout2.ndim; | |||
DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn); | |||
dnn_opr.op->param() = matmul.param(); | |||
DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||
DType dst_dtype; | |||
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
TensorShape tshp, batch_shp; | |||
size_t j = 0; | |||
@@ -473,14 +462,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
inp_tensornds[1] = inp2->dnn_tensor(); | |||
inp_tensornds[1].layout = layout2; | |||
size_t sz = setup_algo<megdnn::BatchedMatrixMul>( | |||
{layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||
matmul.policy(), false, &inp_tensornds); | |||
auto out = Tensor::make(dst_layout, cn); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||
shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; | |||
shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; | |||
@@ -533,7 +517,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
TensorLayout oup_layout{inputs[0]->dtype()}; | |||
auto inp1_tensor = inputs[0]->dnn_tensor(); | |||
auto inp2_tensor = inputs[1]->dnn_tensor(); | |||
dnn_opr.op->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout); | |||
oup_layout = dnn_opr.deduce_layout(inp1_tensor.layout, inp2_tensor.layout); | |||
if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) { | |||
auto out = Tensor::make(oup_layout, comp_node); | |||
@@ -543,14 +527,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
return {out}; | |||
} | |||
auto sz = dnn_opr.op->get_workspace_in_bytes( | |||
inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout); | |||
auto out = Tensor::make(oup_layout, comp_node); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_with_ws(inp_tensornds[0], inp_tensornds[1], out); | |||
return {out}; | |||
} | |||
@@ -17,27 +17,18 @@ SymbolVarArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
size_t size = inputs.size(); | |||
auto&& op = def.cast_final_safe<CheckNonFinite>(); | |||
SmallVector<TensorPtr> outputs(size + 1); | |||
outputs[size] = Tensor::make( | |||
TensorLayout(TensorShape({1}), dtype::Int32()), inputs[0]->comp_node()); | |||
auto dest = outputs[size]; | |||
auto cn = dest->comp_node(); | |||
DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(cn); | |||
SmallVector<megdnn::TensorND> srcs(size); | |||
// copy inputs into fresh outputs so the dnn op can update them in place | |||
for (size_t i = 0; i < size; ++i) { | |||
outputs[i] = Tensor::make(inputs[i]->layout(), inputs[0]->comp_node()); | |||
outputs[i]->dev_tensor().copy_from_fixlayout(inputs[i]->dev_tensor()); | |||
srcs[i] = outputs[i]->dev_tensor().as_megdnn(); | |||
auto comp_node = inputs[0]->comp_node(); | |||
auto dest = Tensor::make(TensorLayout({1}, dtype::Int32()), comp_node); | |||
SmallVector<TensorPtr> outputs; | |||
outputs.reserve(inputs.size() + 1); | |||
for (auto&& input : inputs) { | |||
outputs.push_back(Tensor::make(input->layout(), comp_node)); | |||
outputs.back()->dev_tensor().copy_from_fixlayout(input->dev_tensor()); | |||
} | |||
megdnn::CheckNonFinite::Param param({op.scale}); | |||
dnn_opr.op->param() = param; | |||
size_t sz = dnn_opr.op->get_workspace_in_bytes(srcs, dest->layout()); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
dnn_opr.op->exec(srcs, dest->dnn_tensor(), dnn_wk); | |||
DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(comp_node, {op.scale}); | |||
dnn_opr.exec_with_ws(outputs, dest); | |||
outputs.push_back(dest); | |||
return outputs; | |||
} | |||
@@ -45,13 +36,15 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
size_t size = inputs.size(); | |||
SmallVector<LogicalTensorDesc> dests(size + 1); | |||
bool validated = true; | |||
for (size_t i = 0; i < size; ++i) { | |||
dests[i].comp_node = inputs[i].comp_node; | |||
dests[i].layout = inputs[i].layout; | |||
validated &= bool(dests[i].layout.ndim); | |||
} | |||
dests[size].comp_node = inputs[0].comp_node; | |||
dests[size].layout = TensorLayout(TensorShape({1}), dtype::Int32()); | |||
return {dests, true}; | |||
dests[size].layout = TensorLayout({1}, dtype::Int32()); | |||
return {dests, validated}; | |||
} | |||
OP_TRAIT_REG(CheckNonFinite, CheckNonFinite) | |||
@@ -27,40 +27,31 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
auto comp_node = inputs[0]->comp_node(); | |||
auto&& op_def = def.cast_final_safe<Padding>(); | |||
DnnOprCaller<megdnn::Padding> dnn_op(comp_node); | |||
dnn_op.op->param() = op_def.param(); | |||
TensorLayout dst = output_descs[0].layout; | |||
if (!validated) { | |||
megdnn::Padding::deduce_layout_impl( | |||
inputs[0]->dnn_tensor().layout, dst, op_def.param()); | |||
} | |||
DeviceTensorND out = | |||
BlobManager::inst()->alloc_workspace_with_defrag(comp_node, dst); | |||
dnn_op.op->exec(inputs[0]->dnn_tensor(), out.as_megdnn()); | |||
return {Tensor::make(out)}; | |||
DnnOprCaller<megdnn::Padding> dnn_op(comp_node, op_def.param()); | |||
auto dst = [&] { | |||
if (validated) { | |||
return output_descs[0].layout; | |||
} else { | |||
return dnn_op.deduce_layout(inputs[0]->layout()); | |||
} | |||
}(); | |||
auto out = Tensor::make(dst, comp_node); | |||
dnn_op.exec(inputs[0], out); | |||
return {out}; | |||
} | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
auto&& op_def = def.cast_final_safe<Padding>(); | |||
size_t nr_inp = inputs.size(); | |||
auto p = op_def.param(); | |||
auto&& inp = inputs[0]; | |||
auto& inp_cn = inp.comp_node; | |||
if (inp.layout.ndim == 0) { | |||
return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; | |||
return {{{TensorLayout{inp.layout.dtype}, inp.comp_node, {}}}, false}; | |||
} | |||
TensorLayout oup_layout; | |||
megdnn::Padding::deduce_layout_impl(inp.layout, oup_layout, p); | |||
return {{{oup_layout, inp_cn, {}}}, true}; | |||
DnnOprHelper<megdnn::Padding> dnn_op(op_def.param()); | |||
auto oup_layout = dnn_op.deduce_layout(inp.layout); | |||
return {{{oup_layout, inp.comp_node}}, true}; | |||
} | |||
OP_TRAIT_REG(Padding, Padding, opr::Padding) | |||
@@ -74,4 +65,4 @@ OP_TRAIT_REG(Padding, Padding, opr::Padding) | |||
} // namespace imperative | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -25,19 +25,13 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
mgb_assert( | |||
inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | |||
inputs.size()); | |||
auto&& op_def = def.cast_final_safe<Pooling>(); | |||
auto&& inp = inputs[0]; | |||
auto& inp_cn = inp.comp_node; | |||
if (inp.layout.ndim == 0) { | |||
return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; | |||
if (!inputs[0].layout.ndim) { | |||
return {{{inputs[0].layout, inputs[0].comp_node}}, false}; | |||
} | |||
TensorLayout oup_layout; | |||
megdnn::Pooling::deduce_layout_impl(inp.layout, op_def.param(), oup_layout); | |||
return {{{oup_layout, inp_cn, {}}}, true}; | |||
DnnOprHelper<megdnn::Pooling> dnn_opr(op_def.param()); | |||
auto oup_layout = dnn_opr.deduce_layout(inputs[0].layout); | |||
return {{{oup_layout, inputs[0].comp_node}}, true}; | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
@@ -47,30 +41,18 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | |||
inputs.size()); | |||
auto&& op_def = def.cast_final_safe<Pooling>(); | |||
auto&& pooling = def.cast_final_safe<Pooling>(); | |||
auto cn = inputs[0]->comp_node(); | |||
DnnOprCaller<megdnn::Pooling> caller(cn); | |||
auto&& dnn_opr = caller.op; | |||
dnn_opr->param() = op_def.param(); | |||
SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||
inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||
TensorLayout& oup_layout = output_descs[0].layout; | |||
if (!validated) { | |||
megdnn::Pooling::deduce_layout_impl( | |||
inp_tensornds[0].layout, op_def.param(), oup_layout); | |||
} | |||
size_t wk_size = setup_algo<megdnn::Pooling>( | |||
{inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||
op_def.policy(), false, &inp_tensornds); | |||
DnnOprCaller<megdnn::Pooling> dnn_opr(cn, pooling.param(), pooling.policy()); | |||
auto oup_layout = [&] { | |||
if (validated) { | |||
return output_descs[0].layout; | |||
} else { | |||
return dnn_opr.deduce_layout(inputs[0]->layout()); | |||
} | |||
}(); | |||
auto out = Tensor::make(oup_layout, cn); | |||
auto dnn_wk = caller.create_workspace(wk_size); | |||
caller.op->exec(inp_tensornds[0], out->dnn_tensor(), dnn_wk); | |||
dnn_opr.exec_fastrun(inputs[0], out); | |||
return {out}; | |||
} | |||
@@ -18,33 +18,31 @@ namespace reduce { | |||
auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
auto&& reduce = static_cast<const Reduce&>(def); | |||
auto comp_node = inputs[0]->comp_node(); | |||
OperatorNodeConfig config{reduce.make_name(), comp_node, inputs[0]->dtype()}; | |||
auto name = reduce.make_name(); | |||
if (inputs.size() > 1) { | |||
return opr::Reduce::make(inputs[0], reduce.param(), inputs[1], config); | |||
} | |||
using Param = megdnn::param::Reduce; | |||
auto param = reduce.param(); | |||
if (param.axis < 0) { | |||
param.axis = inputs[0]->shape().ndim + param.axis; | |||
auto axis = param.axis; | |||
auto keepdim = reduce.keepdim; | |||
if (inputs.size() == 2) { | |||
return opr::Reduce::make(inputs[0], param, inputs[1], {name}); | |||
} | |||
mgb_assert(inputs.size() == 1); | |||
SymbolVar target_shape = (cg::VarNode*)nullptr; | |||
if (param.axis == INT_MAX) { | |||
DTypeScalar vi{1}; | |||
// auto graph = ComputingGraph::make(); | |||
if (axis == INT_MAX) { | |||
// keepdim could be ignored when ndim == 1 | |||
auto graph = inputs[0]->owner_graph(); | |||
target_shape = opr::ImmutableTensor::make(*graph, vi, config); | |||
auto scalar_shape = | |||
opr::ImmutableTensor::make(*graph, DTypeScalar(1), {name, comp_node}); | |||
return opr::Reduce::make(inputs[0], param, scalar_shape, {name}); | |||
} | |||
auto res = opr::Reduce::make(inputs[0], param, target_shape, config); | |||
if (!reduce.keepdim && param.axis != INT_MAX) { | |||
// mgb::opr::Reduce supports negative axis | |||
auto res = opr::Reduce::make(inputs[0], param, {}, {name}); | |||
if (!keepdim) { | |||
using Desc = opr::AxisAddRemove::AxisDesc; | |||
std::vector<Desc> remove_param; | |||
remove_param.push_back(Desc::make_remove(param.axis)); | |||
OperatorNodeConfig remove_config{ | |||
def.make_name(), comp_node, inputs[0]->dtype()}; | |||
return opr::AxisAddRemove::make(res, remove_param, remove_config); | |||
std::vector<Desc> remove_axis_param; | |||
remove_axis_param.push_back(Desc::make_remove(axis)); | |||
res = opr::AxisAddRemove::make(res, remove_axis_param, {name}); | |||
} | |||
return res; | |||
} | |||
@@ -71,111 +69,104 @@ bool memory_forward_success(const OpDef& def, SmallVector<TensorPtr> inputs) { | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
// memory forward | |||
if (memory_forward_success(def, inputs)) { | |||
// may return a tensor sharing inputs[0]'s blob directly | |||
return {Tensor::make( | |||
inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())}; | |||
} | |||
auto size = inputs.size(); | |||
if (size > 1) { | |||
if (inputs.size() == 2) { | |||
// reduce to target shape, fallback to proxy_graph | |||
return proxy_graph_detail::apply_on_physical_tensor( | |||
def, inputs, output_descs, validated); | |||
} | |||
mgb_assert(inputs.size() == 1); | |||
auto comp_node = inputs[0]->comp_node(); | |||
using TensorND = megdnn::TensorND; | |||
auto&& op_def = def.cast_final_safe<Reduce>(); | |||
SmallVector<TensorND> inp_tensornds; | |||
inp_tensornds.reserve(inputs.size()); | |||
auto src = inputs[0]->layout(); | |||
DnnOprCaller<megdnn::Reduce> dnn_op(comp_node); | |||
dnn_op.op->param() = op_def.param(); | |||
auto axis = op_def.param().axis; | |||
DnnOprCaller<megdnn::Reduce> dnn_op(comp_node, op_def.param()); | |||
auto&& mode = dnn_op.param().mode; | |||
auto& axis = dnn_op.param().axis; | |||
auto keepdim = op_def.keepdim; | |||
if (axis < 0) { | |||
axis = inputs[0]->layout().ndim + axis; | |||
} | |||
dnn_op.op->param().axis = axis == INT_MAX ? 0 : axis; | |||
if (axis == INT_MAX) { | |||
src.shape[0] = src.total_nr_elems(); | |||
src.ndim = 1; | |||
src.init_contiguous_stride(); | |||
} | |||
TensorLayout layout{src.dtype}; | |||
dnn_op.op->deduce_layout(src, layout); | |||
if (inputs[0]->layout().is_empty()) { | |||
inputs[0]->dev_tensor().reset(inputs[0]->dev_tensor().storage(), src); | |||
auto mode = op_def.param().mode; | |||
if (!keepdim && src.ndim > 1) { | |||
layout.remove_axis_inplace(axis); | |||
layout.init_contiguous_stride(); | |||
DnnTensorND dnn_input = [&] { | |||
if (axis == INT_MAX) { // reduce to scalar | |||
axis = 0; | |||
// flatten input | |||
return inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()}); | |||
} else { | |||
if (axis < 0) { | |||
axis = inputs[0]->layout().ndim + axis; | |||
} | |||
mgb_assert(axis >= 0 && axis < inputs[0]->layout().ndim); | |||
return inputs[0]->dnn_tensor(); | |||
} | |||
auto out = Tensor::make(layout, comp_node); | |||
}(); | |||
auto output_layout = dnn_op.deduce_layout(dnn_input.layout); | |||
auto resolve_keepdim = [&] { | |||
if (!keepdim) { | |||
if (output_layout.ndim > 1) { | |||
mgb_assert(output_layout.shape[axis] == 1); | |||
output_layout.remove_axis_inplace(axis); | |||
} | |||
} | |||
}; | |||
std::string err_msg; | |||
TensorPtr output; | |||
if (output_layout.is_empty()) { | |||
// output empty, no computation | |||
resolve_keepdim(); | |||
output = Tensor::make(output_layout, comp_node); | |||
} else if (dnn_input.layout.is_empty()) { | |||
// input empty but output not, do fill | |||
resolve_keepdim(); | |||
output = Tensor::make(output_layout, comp_node); | |||
auto on_bad_empty_reduce = [](const char* name) { | |||
mgb_throw( | |||
MegBrainError, "empty input is not allowed for reduce mode: %s", | |||
name); | |||
}; | |||
switch (mode) { | |||
case Reduce::Mode::SUM: | |||
if (!out->empty()) { | |||
dev_tensor_memset(out->dev_tensor(), 0); | |||
} | |||
// fill 0 | |||
dev_tensor_memset(output->dev_tensor(), 0); | |||
break; | |||
case Reduce::Mode::PRODUCT: | |||
if (!out->empty()) { | |||
DnnOprCaller<megdnn::Fill> fill_op(comp_node); | |||
fill_op.op->param() = 1; | |||
fill_op.op->exec(out->dnn_tensor(), {}); | |||
} | |||
case Reduce::Mode::PRODUCT: { | |||
// fill 1 | |||
DnnOprCaller<megdnn::Fill> fill_op(comp_node, {1}); | |||
fill_op.exec_with_ws(output); | |||
break; | |||
} | |||
case Reduce::Mode::MEAN: | |||
err_msg = "mean"; | |||
on_bad_empty_reduce("mean"); | |||
break; | |||
case Reduce::Mode::MIN: | |||
err_msg = "min"; | |||
on_bad_empty_reduce("min"); | |||
break; | |||
case Reduce::Mode::MAX: | |||
err_msg = "max"; | |||
on_bad_empty_reduce("max"); | |||
break; | |||
case Reduce::Mode::SUM_SQR: | |||
err_msg = "sum_sqr"; | |||
on_bad_empty_reduce("sum_sqr"); | |||
break; | |||
default: | |||
mgb_throw(MegBrainError, "bad reduce mode"); | |||
} | |||
if (!err_msg.empty()) { | |||
mgb_throw( | |||
MegBrainError, "empty input is not allowed for reduce mode: %s", | |||
err_msg.c_str()); | |||
} else { | |||
// common reduction | |||
if (keepdim) { | |||
output = Tensor::make(output_layout, comp_node); | |||
dnn_op.exec_with_ws(dnn_input, output); | |||
} else { | |||
// used by megdnn::exec | |||
auto output_layout_keepdim = output_layout; | |||
resolve_keepdim(); | |||
output = Tensor::make(output_layout, comp_node); | |||
dnn_op.exec_with_ws(dnn_input, output->dnn_tensor(output_layout_keepdim)); | |||
} | |||
return {out}; | |||
} | |||
auto dnn_ten = inputs[0]->dnn_tensor(); | |||
dnn_ten.layout = src; | |||
inp_tensornds.push_back(dnn_ten); | |||
auto wk_size = dnn_op.op->get_workspace_in_bytes(src, layout); | |||
auto dnn_wk = dnn_op.create_workspace(wk_size); | |||
TensorLayout ori_layout = layout; | |||
if (!keepdim && src.ndim > 1) { | |||
layout.remove_axis_inplace(axis); | |||
layout.init_contiguous_stride(); | |||
} | |||
auto out = Tensor::make(layout, comp_node); | |||
auto dnn_out = out->dnn_tensor(); | |||
dnn_out.layout = ori_layout; | |||
dnn_op.op->exec(inp_tensornds[0], dnn_out, dnn_wk); | |||
return {out}; | |||
return {output}; | |||
} | |||
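In the empty-input branch above, SUM and PRODUCT can be answered without running the kernel because they have identity elements (0 and 1 respectively), so the output is simply filled; MEAN, MIN, MAX and SUM_SQR have no identity over an empty set, so those modes raise a MegBrainError instead.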
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
@@ -184,16 +175,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
auto axis = op_def.param().axis; | |||
auto keepdim = op_def.keepdim; | |||
size_t size = inputs.size(); | |||
SmallVector<LogicalTensorDesc> dests(size); | |||
mgb_assert(inputs.size() > 0); | |||
auto&& comp_node = inputs[0].comp_node; | |||
auto&& input_layout = inputs[0].layout; | |||
for (size_t i = 0; i < size; i++) { | |||
if (inputs[i].layout.ndim == 0) { | |||
return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}}, | |||
false}; | |||
} | |||
} | |||
if (size > 1) { | |||
if (inputs.size() == 2) { | |||
// fallback to proxy_graph, matters on backward | |||
auto [output_descs, validated] = | |||
proxy_graph_detail::infer_output_attrs_fallible(def, inputs); | |||
if (!inputs[1].value.empty()) { | |||
@@ -203,30 +190,37 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
return {output_descs, validated}; | |||
} | |||
mgb_assert(inputs.size() == 1); | |||
if (axis == INT_MAX) { | |||
// reduce to scalar | |||
// ignore keepdim because ndim is 1 | |||
auto&& dtype = input_layout.dtype; | |||
auto&& format = input_layout.format; | |||
auto output_layout = TensorLayout{{1}, dtype, format}; | |||
return {{{output_layout, comp_node}}, true}; | |||
} | |||
if (input_layout.ndim == 0) { | |||
// shape incomplete | |||
return {{{TensorLayout(input_layout.dtype, input_layout.format), comp_node}}, | |||
false}; | |||
} | |||
if (axis < 0) { | |||
axis = inputs[0].layout.ndim + axis; | |||
axis = input_layout.ndim + axis; | |||
} | |||
mgb_assert(axis >= 0 && axis < input_layout.ndim); | |||
if (axis == INT_MAX || inputs[0].layout.ndim == 1) { | |||
TensorLayout layout{inputs[0].layout.dtype}; | |||
layout.shape[0] = 1; | |||
layout.ndim = 1; | |||
dests[0].layout = layout; | |||
dests[0].comp_node = inputs[0].comp_node; | |||
TensorLayout output_layout = input_layout; | |||
bool remove_axis = (!keepdim) && input_layout.ndim > 1; | |||
if (remove_axis) { | |||
output_layout.remove_axis_inplace(axis); | |||
} else { | |||
for (size_t i = 0; i < size; ++i) { | |||
dests[i].comp_node = inputs[i].comp_node; | |||
dests[i].layout = inputs[i].layout; | |||
if (!keepdim && dests[i].layout.ndim > 1) { | |||
dests[i].layout.remove_axis_inplace(axis); | |||
} else { | |||
dests[i].layout.shape[axis] = 1; | |||
} | |||
dests[i].layout.init_contiguous_stride(); | |||
} | |||
output_layout.shape[axis] = 1; | |||
} | |||
return {dests, true}; | |||
output_layout.init_contiguous_stride(); | |||
return {{{output_layout, comp_node}}, true}; | |||
} | |||
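The axis/keepdim handling above is easy to sanity-check in isolation; a standalone sketch of the same shape rule (illustrative code, not the MegEngine implementation):

```cpp
#include <cstdio>
#include <vector>

// Mirrors the inference above: a negative axis counts from the end; with keepdim
// the reduced axis becomes 1, without keepdim it is removed unless the input is 1-d.
static std::vector<size_t> reduced_shape(std::vector<size_t> shape, int axis, bool keepdim) {
    if (axis < 0)
        axis += static_cast<int>(shape.size());
    if (!keepdim && shape.size() > 1)
        shape.erase(shape.begin() + axis);
    else
        shape[axis] = 1;
    return shape;
}

int main() {
    // (2, 3, 4) reduced over axis -2: keepdim -> (2, 1, 4), otherwise -> (2, 4)
    for (bool keepdim : {true, false}) {
        for (auto d : reduced_shape({2, 3, 4}, -2, keepdim))
            std::printf("%zu ", d);
        std::printf("\n");
    }
    return 0;
}
```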
SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||
@@ -230,31 +230,19 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor( | |||
} | |||
auto dest_layout = TensorLayout({nr_elems}, dtype); | |||
auto output = Tensor::make(dest_layout, comp_node); | |||
auto caller = DnnOprCaller<megdnn::ParamPackConcat>(comp_node); | |||
size_t srcs_size = sizeof(void*) * nr_inputs; | |||
void** srcs_raw_ptr = (void**)comp_node.alloc_host(srcs_size); | |||
std::shared_ptr<dt_byte> srcs_ptr = { | |||
(dt_byte*)srcs_raw_ptr, | |||
[comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }}; | |||
// FIXME: add param to ParamPackConcat | |||
DnnOprCaller<megdnn::ParamPackConcat> caller{comp_node}; | |||
HostTensorStorage srcs_storage{comp_node}; | |||
srcs_storage.ensure_size(sizeof(void*) * nr_inputs); | |||
TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()}; | |||
size_t ws_size; | |||
{ | |||
TensorShapeArray src_shapes; | |||
for (size_t i = 0; i < nr_inputs; ++i) { | |||
src_shapes.push_back(inputs[i]->shape()); | |||
} | |||
ws_size = caller.op->get_workspace_in_bytes( | |||
src_shapes, inputs.back()->shape(), TensorShape{}); | |||
} | |||
HostTensorND srcs_tensornd; | |||
srcs_tensornd.reset(srcs_storage, srcs_layout); | |||
auto* srcs_raw_ptr = reinterpret_cast<void**>(srcs_storage.ptr()); | |||
for (size_t i = 0; i < nr_inputs; ++i) { | |||
srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr(); | |||
srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr(); | |||
} | |||
HostTensorStorage srcs_storage; | |||
srcs_storage.reset(comp_node, srcs_size, srcs_ptr); | |||
caller.op->exec( | |||
{srcs_raw_ptr, srcs_layout}, inputs.back()->dnn_tensor(), | |||
output->dnn_tensor(), caller.create_workspace(ws_size)); | |||
async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage)); | |||
caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output); | |||
async_release(srcs_tensornd); | |||
return {output}; | |||
} | |||
@@ -33,69 +33,39 @@ VarNodeArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
auto&& op = static_cast<const ROIAlign&>(def); | |||
if (inputs[0].layout.is_empty() || inputs[1].layout.is_empty()) { | |||
return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}, | |||
{TensorLayout(dtype::Int32()), inputs[1].comp_node}}, | |||
false}; | |||
} | |||
SmallVector<LogicalTensorDesc> descs(2u); | |||
size_t n = inputs[1].layout[0]; | |||
size_t c = inputs[0].layout[1]; | |||
descs[0].layout = TensorLayout( | |||
{n, c, op.pooled_height, op.pooled_width}, inputs[0].layout.dtype); | |||
descs[0].layout.init_contiguous_stride(); | |||
descs[0].comp_node = inputs[0].comp_node; | |||
descs[1].layout = | |||
TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); | |||
descs[1].layout.init_contiguous_stride(); | |||
descs[1].comp_node = descs[0].comp_node; | |||
return {descs, true}; | |||
auto&& op = def.cast_final_safe<ROIAlign>(); | |||
DnnOprHelper<megdnn::ROIAlign> dnn_opr(op.param()); | |||
auto cn = inputs[0].comp_node; | |||
auto&& [out_layout, ind_layout] = | |||
dnn_opr.deduce_layouts<2>(inputs[0].layout, inputs[1].layout); | |||
bool validated = out_layout.ndim != 0 && ind_layout.ndim != 0; | |||
return {{{out_layout, cn}, {ind_layout, cn}}, validated}; | |||
} | |||
SmallVector<TensorPtr> apply_on_physical_tensor( | |||
const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
auto&& op = static_cast<const ROIAlign&>(def); | |||
CompNode cn = inputs[0]->comp_node(); | |||
auto&& op = def.cast_final_safe<ROIAlign>(); | |||
auto cn = inputs[0]->comp_node(); | |||
TensorLayout out_layout = output_descs[0].layout; | |||
TensorLayout ind_layout = output_descs[1].layout; | |||
if (!validated) { | |||
size_t n = inputs[1]->layout()[0]; | |||
size_t c = inputs[0]->layout()[1]; | |||
out_layout = TensorLayout( | |||
{n, c, op.pooled_height, op.pooled_width}, inputs[0]->layout().dtype); | |||
out_layout.init_contiguous_stride(); | |||
ind_layout = | |||
TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); | |||
ind_layout.init_contiguous_stride(); | |||
} | |||
DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn, op.param()); | |||
auto&& [out_layout, ind_layout] = [&]() -> std::array<TensorLayout, 2> { | |||
if (validated) { | |||
return {output_descs[0].layout, output_descs[1].layout}; | |||
} else { | |||
return dnn_opr.deduce_layouts<2>(inputs[0]->layout(), inputs[1]->layout()); | |||
} | |||
}(); | |||
DeviceTensorND out = | |||
BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); | |||
DeviceTensorND inds = | |||
BlobManager::inst()->alloc_workspace_with_defrag(cn, ind_layout); | |||
auto out = Tensor::make(out_layout, cn); | |||
auto ind = Tensor::make(ind_layout, cn); | |||
if (out_layout.is_empty() || ind_layout.is_empty()) { | |||
return {Tensor::make(out), Tensor::make(inds)}; | |||
return {out, ind}; | |||
} | |||
DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn); | |||
dnn_opr.op->param() = op.param(); | |||
size_t sz = dnn_opr.op->get_workspace_in_bytes( | |||
inputs[0]->layout(), inputs[1]->layout(), out_layout, ind_layout); | |||
auto dnn_wk = dnn_opr.create_workspace(sz); | |||
dnn_opr.op->exec( | |||
inputs[0]->dnn_tensor(), inputs[1]->dnn_tensor(), out.as_megdnn(), | |||
inds.as_megdnn(), dnn_wk); | |||
return {Tensor::make(out), Tensor::make(inds)}; | |||
dnn_opr.exec_with_ws(inputs[0], inputs[1], out, ind); | |||
return {out, ind}; | |||
} | |||
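The rewritten `apply_on_physical_tensor` initializes both output layouts through an immediately invoked lambda so the pair can be produced by either branch and unpacked once with a structured binding, instead of declaring mutable layouts and patching them in an `if`. A self-contained sketch of that pattern (values and names are illustrative):

    #include <array>
    #include <cstdio>

    std::array<int, 2> deduce() { return {42, 7}; }

    int main() {
        bool cached = false;
        // Immediately invoked lambda: pick the source of the values once,
        // then bind both results in a single structured binding.
        auto&& [a, b] = [&]() -> std::array<int, 2> {
            if (cached) {
                return {1, 2};    // reuse precomputed values
            } else {
                return deduce();  // recompute when no cached descs are valid
            }
        }();
        std::printf("%d %d\n", a, b);
    }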
SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||
@@ -570,11 +570,17 @@ bool Tensor::empty() { | |||
return !m_blob->size(); | |||
} | |||
megdnn::TensorND Tensor::dnn_tensor() { | |||
DnnTensorND Tensor::dnn_tensor() { | |||
mgb_assert(m_blob, "uninitialized tensor."); | |||
mgb_assert(m_layout.ndim, "dnn don't support scalar"); | |||
return DnnTensorND{m_layout, m_blob->storage(), m_offset}; | |||
} | |||
DnnTensorND Tensor::dnn_tensor(TensorShape new_shape) { | |||
mgb_assert(m_blob, "uninitialized tensor."); | |||
return DnnTensorND{m_layout.reshape(new_shape), m_blob->storage(), m_offset}; | |||
} | |||
void Tensor::fetch_value() { | |||
MGB_LOCK_GUARD(m_value_mtx); | |||
if (m_value.empty()) { | |||
@@ -334,9 +334,16 @@ public: | |||
size_t j = 0; | |||
for (auto&& var : m_opr->output()) { | |||
if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { | |||
TensorLayout layout{var->shape(), var->dtype(), var->format()}; | |||
var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag( | |||
var->comp_node(), layout); | |||
auto comp_node = var->comp_node(); | |||
auto dtype = var->dtype(); | |||
auto&& shape = var->shape(); | |||
size_t size = dtype.size(shape.total_nr_elems()); | |||
mgb_assert( | |||
var->format().is_default(), "non default format for workspace"); | |||
auto raw_storage = Blob::make(comp_node, size)->storage(); | |||
DeviceTensorStorage storage; | |||
storage.reset(comp_node, size, raw_storage); | |||
var->m_dev_tensor.reset(storage, {shape, dtype}); | |||
} else { | |||
mgb_assert(j < outputs.size()); | |||
auto&& tensor = outputs[j]; | |||
@@ -1,6 +1,7 @@ | |||
#pragma once | |||
#include "megbrain/imperative/physical_tensor.h" | |||
#include "megbrain/imperative/utils/helper.h" | |||
namespace mgb { | |||
namespace imperative { | |||
@@ -15,13 +16,19 @@ public: | |||
virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0; | |||
virtual bool try_alloc_direct(OwnedBlob* blob, size_t size) { | |||
try { | |||
alloc_direct(blob, size); | |||
return true; | |||
} catch (MemAllocError&) { | |||
return false; | |||
} | |||
} | |||
virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0; | |||
virtual void set_allocator(allocator_t allocator) = 0; | |||
virtual DeviceTensorND alloc_workspace_with_defrag( | |||
CompNode cn, TensorLayout& layout) = 0; | |||
virtual void register_blob(OwnedBlob* blob) = 0; | |||
virtual void unregister_blob(OwnedBlob* blob) = 0; | |||
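This hunk drops `alloc_workspace_with_defrag` from the `BlobManager` interface and keeps the try/recover split: `try_alloc_direct` converts an allocation failure into a boolean so the caller can run a recovery step and retry, while `alloc_with_defrag` owns that retry logic. A generic, self-contained sketch of the same split; the names `try_alloc` and `alloc_with_recovery` are illustrative, not MegEngine APIs:

    #include <cstddef>
    #include <new>

    // "Try" variant: report failure instead of propagating the exception.
    bool try_alloc(void** out, size_t size) {
        try {
            *out = ::operator new(size);   // may throw std::bad_alloc
            return true;
        } catch (const std::bad_alloc&) {
            return false;
        }
    }

    void* alloc_with_recovery(size_t size) {
        void* p = nullptr;
        if (try_alloc(&p, size))
            return p;
        // a recovery step (e.g. releasing cached blocks) would go here, then retry once
        if (try_alloc(&p, size))
            return p;
        throw std::bad_alloc{};
    }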
@@ -89,24 +89,19 @@ using EventPtr = std::unique_ptr<CompNode::Event, EventDeleter>; | |||
class Tensor; | |||
using TensorPtr = std::shared_ptr<Tensor>; | |||
/* | |||
using DnnTensorND to save the reference count of workspace | |||
allocated by blobmanager to prevent invalidation | |||
*/ | |||
struct DnnTensorND : megdnn::TensorND { | |||
private: | |||
std::shared_ptr<dt_byte> m_reference; | |||
// hold extra reference to prevent defrag-in-use | |||
std::shared_ptr<dt_byte> reference; | |||
public: | |||
DnnTensorND(TensorLayout& layout_, std::shared_ptr<dt_byte> ref_ptr, size_t offset) | |||
: megdnn::TensorND(layout_, {ref_ptr.get(), offset}) { | |||
m_reference = ref_ptr; | |||
DnnTensorND( | |||
const TensorLayout& layout_, std::shared_ptr<dt_byte> ptr, size_t offset) | |||
: megdnn::TensorND(layout_, {ptr.get(), offset}) { | |||
reference = std::move(ptr); | |||
} | |||
}; | |||
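The lifetime trick in `DnnTensorND` is that the view copies the `shared_ptr` owning the storage, so the buffer cannot be freed (or relocated by the defragmenter) while any view handed to a dnn kernel is still alive. A generic sketch of the same idiom with plain standard-library types (the `BufferView` name is illustrative):

    #include <cstddef>
    #include <memory>

    struct BufferView {
        std::byte* ptr;
        size_t size;
    private:
        std::shared_ptr<std::byte> keep_alive;   // extra reference, never dereferenced
    public:
        BufferView(std::shared_ptr<std::byte> storage, size_t offset, size_t size_)
                : ptr(storage.get() + offset),
                  size(size_),
                  keep_alive(std::move(storage)) {}
    };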
class Tensor : public NonCopyableObj { | |||
public: | |||
Tensor() = default; | |||
Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0, | |||
const HostTensorND& hv = {}); | |||
Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {}) | |||
@@ -154,7 +149,9 @@ public: | |||
void assign_from_dev_tensor(DeviceTensorND); | |||
megdnn::TensorND dnn_tensor(); | |||
DnnTensorND dnn_tensor(); | |||
DnnTensorND dnn_tensor(TensorShape new_shape); | |||
static TensorPtr make_scalar(DTypeScalar value, CompNode cn); | |||
@@ -3,6 +3,7 @@ | |||
#include <iomanip> | |||
#include <memory> | |||
#include <mutex> | |||
#include <optional> | |||
#include <sstream> | |||
#include "megbrain/utils/metahelper.h" | |||
@@ -14,11 +15,28 @@ namespace imperative { | |||
template <typename T = std::function<void()>> | |||
class CleanupGuard : public NonCopyableObj { | |||
private: | |||
T m_callback; | |||
std::optional<T> m_callback; | |||
public: | |||
CleanupGuard() = default; | |||
explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {} | |||
~CleanupGuard() { m_callback(); } | |||
~CleanupGuard() { reset(); } | |||
CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) { | |||
rhs.m_callback.reset(); | |||
} | |||
CleanupGuard& operator=(CleanupGuard&& rhs) { | |||
swap(m_callback, rhs.m_callback); | |||
rhs.reset(); | |||
return *this; | |||
} | |||
public: | |||
void reset() { | |||
if (m_callback) { | |||
(*m_callback)(); | |||
m_callback.reset(); | |||
} | |||
} | |||
}; | |||
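Wrapping the callback in `std::optional` makes "no callback" representable, so a moved-from or `reset()` guard does nothing in its destructor instead of invoking an empty `std::function`. A standalone sketch of the same guard (simplified: no move assignment, names illustrative):

    #include <cstdio>
    #include <functional>
    #include <optional>
    #include <utility>

    template <typename F = std::function<void()>>
    class ScopeGuard {
        std::optional<F> m_cb;
    public:
        ScopeGuard() = default;
        explicit ScopeGuard(F cb) : m_cb(std::move(cb)) {}
        ScopeGuard(ScopeGuard&& rhs) noexcept : m_cb(std::move(rhs.m_cb)) { rhs.m_cb.reset(); }
        ~ScopeGuard() { reset(); }
        void reset() {
            if (m_cb) {
                (*m_cb)();      // run the cleanup exactly once
                m_cb.reset();
            }
        }
    };

    int main() {
        ScopeGuard guard{[] { std::puts("cleanup runs exactly once"); }};
        ScopeGuard moved{std::move(guard)};   // ownership transfers; guard is now empty
    }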
inline std::string quoted(std::string str) { | |||
@@ -33,6 +51,19 @@ inline std::string quoted(std::string str) { | |||
std::call_once(_once_flag, [&] { __VA_ARGS__; }); \ | |||
} while (false) | |||
template <typename T> | |||
struct is_small_vector { | |||
static constexpr bool value = false; | |||
}; | |||
template <typename T> | |||
struct is_small_vector<SmallVector<T>> { | |||
static constexpr bool value = true; | |||
}; | |||
template <typename T> | |||
static constexpr bool is_small_vector_v = is_small_vector<T>::value; | |||
} // namespace imperative | |||
} // namespace mgb |
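`is_small_vector` above is the usual "is this type a specialization of template X" idiom. The same shape works for any class template; a self-contained sketch against `std::vector` (names here are illustrative, not part of the patch):

    #include <vector>

    template <typename T>
    struct is_std_vector { static constexpr bool value = false; };

    template <typename T, typename A>
    struct is_std_vector<std::vector<T, A>> { static constexpr bool value = true; };

    template <typename T>
    inline constexpr bool is_std_vector_v = is_std_vector<T>::value;

    static_assert(is_std_vector_v<std::vector<int>>);
    static_assert(!is_std_vector_v<int>);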
@@ -6,4 +6,10 @@ namespace mgb::imperative { | |||
std::string demangle(std::string mangled); | |||
template <typename T> | |||
const char* demangled_typename() { | |||
static auto name = demangle(typeid(T).name()); | |||
return name.c_str(); | |||
} | |||
} // namespace mgb::imperative |
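`demangled_typename<T>()` caches the demangled string in a function-local static, so the returned C string stays valid for the program's lifetime. One plausible implementation of the declared `demangle()` for the Itanium ABI used by GCC/Clang; this is a hedged sketch (`demangle_sketch`), not necessarily what MegEngine ships:

    #include <cxxabi.h>   // GCC/Clang specific

    #include <cstdlib>
    #include <memory>
    #include <string>

    std::string demangle_sketch(const std::string& mangled) {
        int status = 0;
        // __cxa_demangle returns a malloc'd buffer that must be freed by the caller.
        std::unique_ptr<char, void (*)(void*)> buf(
                abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status), std::free);
        return status == 0 ? std::string(buf.get()) : mangled;
    }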
@@ -314,7 +314,8 @@ void CondTake::init_output_static_infer_desc() { | |||
auto dtype = input(0)->dtype(); | |||
TensorLayout ily(iv.val[0].shape(), dtype); | |||
dest.ndim = 1; | |||
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily); | |||
TensorLayout mly(iv.val[0].shape(), dtype::Int32()); | |||
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily, mly); | |||
return true; | |||
}; | |||
owner_graph()->static_infer_manager().register_shape_infer( | |||
@@ -548,9 +549,9 @@ void CheckNonFinite::init_output_static_infer_desc() { | |||
auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | |||
dest.ndim = 1; | |||
megdnn::TensorNDArray inp_arr(input().size()); | |||
SmallVector<megdnn::TensorLayout> inp_arr(input().size()); | |||
for (size_t i = 0; i < input().size(); ++i) { | |||
inp_arr[i] = {NULL, {inp.val.at(i).shape(), input(0)->dtype()}}; | |||
inp_arr[i] = {inp.val.at(i).shape(), input(0)->dtype()}; | |||
} | |||
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes( | |||
inp_arr, {output(input().size() + 1)->shape(), | |||
@@ -1447,11 +1447,8 @@ void ParamPackConcat::init_output_static_infer_desc() { | |||
auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | |||
TensorShapeArray shapes; | |||
auto vals = inp.val; | |||
shapes.reserve(vals.size() - 1); | |||
for (size_t i = 0; i < vals.size() - 1; i++) { | |||
shapes.push_back(vals[i].shape()); | |||
} | |||
dest = {m_opr->get_workspace_in_bytes(shapes, vals.back().shape(), dest)}; | |||
size_t nr_params = vals.size() - 1; | |||
dest = {m_opr->get_workspace_in_bytes({nr_params}, vals.back().shape(), dest)}; | |||
return true; | |||
}; | |||
mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out}); | |||
@@ -970,8 +970,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
if (!policy.algo.valid()) | |||
continue; | |||
size_t workspace_needed = get_workspace_size_bytes(policy); | |||
if (m_inputs != nullptr) | |||
if (m_inputs == nullptr) { | |||
workspace_needed += data_size; | |||
} | |||
if (workspace_needed > | |||
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { | |||
continue; | |||
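The flipped condition makes the accounting match its intent: when the profiler has no real input tensors (`m_inputs == nullptr`) it must also allocate dummy input data, so the data size counts against the workspace limit; with user-provided inputs only the algorithm's own workspace does. A tiny illustrative sketch of that check (names are not MegEngine's):

    #include <cstddef>

    bool fits_in_limit(size_t algo_workspace, size_t data_size, bool has_real_inputs,
                       size_t workspace_limit) {
        size_t needed = algo_workspace + (has_real_inputs ? 0 : data_size);
        return needed <= workspace_limit;
    }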
@@ -18,7 +18,8 @@ failed_files = Manager().list() | |||
def process_file(file, clang_format, write): | |||
source = open(file, "r").read() | |||
original_source = open(file, "r").read() | |||
source = original_source | |||
source = re.sub(r"MGB_DEFINE(?P<r>([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g<r>{", source) | |||
source, count = re.subn(r"(?<!#define )MGB_DEFINE(.*) +\\", r"class MGB_DEFINE\1{\\", source) | |||
@@ -38,7 +39,7 @@ def process_file(file, clang_format, write): | |||
result = re.sub(r"class MGB_DEFINE(.*){( *)\\", r"MGB_DEFINE\1\2 \\", result) | |||
result = re.sub(r"class MGB_DEFINE((.|\n)*?){", r"MGB_DEFINE\1// {", result) | |||
if write: | |||
if write and original_source != result: | |||
with tempfile.NamedTemporaryFile( | |||
dir=os.path.dirname(file), delete=False | |||
) as tmp_file: | |||