
refactor(imperative/ops): extends DnnOprCaller with template

GitOrigin-RevId: 402cba209a
HuaHua404-patch-4
Megvii Engine Team · 2 years ago
commit c49d3070ba
52 changed files with 912 additions and 1171 deletions
  1. +11 -8  dnn/include/megdnn/oprs/general.h
  2. +4 -2  dnn/include/megdnn/oprs/linalg.h
  3. +19 -14  dnn/include/megdnn/oprs/nn.h
  4. +5 -1  dnn/src/common/check_non_finite.cpp
  5. +1 -1  dnn/src/common/cond_take/opr_impl.cpp
  6. +3 -3  dnn/src/common/lamb.cpp
  7. +4 -4  dnn/src/cuda/check_non_finite/opr_impl.cpp
  8. +1 -1  dnn/src/cuda/check_non_finite/opr_impl.h
  9. +2 -1  dnn/src/cuda/cond_take/opr_impl.cpp
  10. +2 -1  dnn/src/cuda/cond_take/opr_impl.h
  11. +2 -2  dnn/src/cuda/param_pack/opr_impl.cpp
  12. +1 -1  dnn/src/cuda/param_pack/opr_impl.h
  13. +2 -1  dnn/src/naive/check_non_finite/opr_impl.h
  14. +2 -1  dnn/src/naive/cond_take/opr_impl.cpp
  15. +2 -1  dnn/src/naive/cond_take/opr_impl.h
  16. +1 -1  dnn/src/naive/param_pack/opr_impl.h
  17. +2 -2  dnn/src/rocm/param_pack/opr_impl.cpp
  18. +1 -1  dnn/src/rocm/param_pack/opr_impl.h
  19. +1 -1  dnn/test/common/cond_take.cpp
  20. +6 -1  dnn/test/common/opr_proxy.h
  21. +1 -1  dnn/test/cuda/param_pack.cpp
  22. +1 -1  dnn/test/rocm/param_pack.cpp
  23. +12 -47  imperative/src/impl/blob_manager_impl.cpp
  24. +3 -9  imperative/src/impl/blob_manager_impl.h
  25. +297 -45  imperative/src/impl/dnn_op_helper.h
  26. +10 -7  imperative/src/impl/interpreter/interpreter_impl.cpp
  27. +18 -22  imperative/src/impl/ops/adaptive_pooling.cpp
  28. +25 -62  imperative/src/impl/ops/batch_norm.cpp
  29. +10 -17  imperative/src/impl/ops/cond_take.cpp
  30. +95 -443  imperative/src/impl/ops/convolution.cpp
  31. +36 -47  imperative/src/impl/ops/elemwise.cpp
  32. +11 -30  imperative/src/impl/ops/indexing.cpp
  33. +5 -6  imperative/src/impl/ops/io_remote.cpp
  34. +10 -19  imperative/src/impl/ops/lamb.cpp
  35. +16 -27  imperative/src/impl/ops/layer_norm.cpp
  36. +12 -34  imperative/src/impl/ops/matmul.cpp
  37. +14 -21  imperative/src/impl/ops/misc.cpp
  38. +16 -25  imperative/src/impl/ops/padding.cpp
  39. +15 -33  imperative/src/impl/ops/pooling.cpp
  40. +115 -121  imperative/src/impl/ops/reduce.cpp
  41. +10 -22  imperative/src/impl/ops/tensor_manip.cpp
  42. +22 -52  imperative/src/impl/ops/vision.cpp
  43. +7 -1  imperative/src/impl/physical_tensor.cpp
  44. +10 -3  imperative/src/impl/proxy_graph/mini_graph.h
  45. +10 -3  imperative/src/include/megbrain/imperative/blob_manager.h
  46. +9 -12  imperative/src/include/megbrain/imperative/physical_tensor.h
  47. +33 -2  imperative/src/include/megbrain/imperative/utils/helper.h
  48. +6 -0  imperative/src/include/megbrain/imperative/utils/platform.h
  49. +4 -3  src/opr/impl/misc.cpp
  50. +2 -5  src/opr/impl/tensor_manip.cpp
  51. +2 -1  src/rdnn/impl/algo_chooser.cpp
  52. +3 -2  tools/format.py

+11 -8  dnn/include/megdnn/oprs/general.h

@@ -397,7 +397,8 @@ public:

OutputDType infer_dtype(DType data, DType mask);

virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0;
virtual size_t get_workspace_in_bytes(
const TensorLayout& data, const TensorLayout& mask) = 0;

virtual Output exec(
_megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
@@ -512,7 +513,8 @@ public:
virtual void exec(
_megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayoutArray& srcs, TensorLayout& dst);
virtual size_t get_workspace_in_bytes(
const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;

@@ -596,7 +598,7 @@ public:
_megdnn_workspace workspace) = 0;

virtual size_t get_workspace_in_bytes(
const TensorShapeArray& srcs, const TensorShape& offsets,
const TensorShape& srcs, const TensorShape& offsets,
const TensorShape& dst) = 0;
};

@@ -1145,7 +1147,7 @@ protected:
/*!
* \return axis on dst used by indexer (i.e. ExecInfo::idx_axis)
*/
static size_t deduce_layout_fwd(
MGE_WIN_DECLSPEC_FUC static size_t deduce_layout_fwd(
const TensorLayout& data, const IndexDescLayoutOnly& index,
TensorLayout& dst);

@@ -1362,9 +1364,10 @@ class CheckNonFinite : public OperatorBase {

public:
virtual size_t get_workspace_in_bytes(
const TensorNDArray& srcs, const TensorLayout& dst) = 0;
const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;

void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayoutArray& srcs, TensorLayout& dst);

virtual void exec(
_megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
@@ -1420,7 +1423,7 @@ public:
}
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) = 0;
void deduce_layout(const TensorLayout& src, TensorLayout& dst);
MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);

MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
const TensorLayout& src, TensorLayout& dst, const Param& p);
@@ -1464,7 +1467,7 @@ public:
const TensorLayout& m_t, const TensorLayout& v_t,
const TensorLayout& new_param) = 0;

void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& m_t_1, const TensorLayout& v_t_1,
const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t,
TensorLayout& v_t, TensorLayout& new_param);


+4 -2  dnn/include/megdnn/oprs/linalg.h

@@ -27,7 +27,8 @@ public:
_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
_megdnn_workspace workspace) = 0;
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C);
void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
virtual size_t get_workspace_in_bytes(
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0;

@@ -64,7 +65,8 @@ public:
_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
_megdnn_workspace workspace) = 0;
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C);
void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
virtual size_t get_workspace_in_bytes(
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0;



+19 -14  dnn/include/megdnn/oprs/nn.h

@@ -224,9 +224,9 @@ public:
const TensorLayout& src_layout, _megdnn_tensor_in filter,
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;
void deduce_dtype(DType src, DType filter, DType& dst);
MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType src, DType filter, DType& dst);

void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst);

/**
@@ -300,7 +300,7 @@ public:
const TensorLayout& grad) = 0;

MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType filter, DType diff, DType& grad);
void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad);

static Algorithm::OprType get_opr_type() {
@@ -378,6 +378,12 @@ public:
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;

MGE_WIN_DECLSPEC_FUC void exec(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
_megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
exec(src, filter, bias, z, dst, nullptr, workspace);
}

/**
* \brief execute weight preprocessing, read weights form filter and bias,
* write to preprocessed_filter after preprocessed.
@@ -390,8 +396,9 @@ public:
_megdnn_tensor_in bias, const TensorLayout& z_layout,
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;
void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst);
void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_dtype(
DType src, DType filter, DType bias, DType z, DType& dst);
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z, TensorLayout& dst);

@@ -775,7 +782,7 @@ protected:
void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);

public:
MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
static void deduce_layout_impl(
const TensorLayout& src, const Param& param, TensorLayout& dst);
};

@@ -791,7 +798,7 @@ public:
virtual void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
void deduce_layout(const TensorLayout& src, TensorLayout& dst);
MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) = 0;

@@ -1253,7 +1260,7 @@ public:
virtual void exec(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst);
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
@@ -1281,18 +1288,16 @@ public:
* \param[in] diff (n, oc, od, oh, ow)
* \param[out] grad (n, ic, id, ih, iw)
*/
MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
static void deduce_layout_impl(
const TensorLayout& filter, const TensorLayout& diff, const Param& param,
TensorLayout& grad);

virtual void exec(
_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace) = 0;
virtual size_t get_workspace_in_bytes(
const TensorLayout& filter, const TensorLayout& diff,
const TensorLayout& grad) = 0;

void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad);

static Algorithm::OprType get_opr_type() {
@@ -1472,7 +1477,7 @@ public:
virtual void exec(
_megdnn_tensor_in src, _megdnn_tensor_in rois, _megdnn_tensor_out dst,
_megdnn_tensor_out index, _megdnn_workspace workspace) = 0;
void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& src, const TensorLayout& rois, TensorLayout& dst,
TensorLayout& index);
virtual size_t get_workspace_in_bytes(
@@ -1963,7 +1968,7 @@ public:
_megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias,
_megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd,
_megdnn_workspace workspace) = 0;
void deduce_layout(
MGE_WIN_DECLSPEC_FUC void deduce_layout(
const TensorLayout& data, const TensorLayout& weight,
const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean,
TensorLayout& rstd);


+5 -1  dnn/src/common/check_non_finite.cpp

@@ -7,7 +7,11 @@ void CheckNonFinite::check_exec(
const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes) {
megdnn_assert_contiguous(dst.layout);
megdnn_assert(srcs.size() > 0);
auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst.layout);
TensorLayoutArray src_layouts;
for (auto&& src : srcs) {
src_layouts.push_back(src.layout);
}
auto required_workspace_in_bytes = get_workspace_in_bytes(src_layouts, dst.layout);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
}



+1 -1  dnn/src/common/cond_take/opr_impl.cpp

@@ -11,7 +11,7 @@ size_t CondTake::check_exec_get_size(
mask.TensorShape::to_string().c_str());
megdnn_assert(data.is_physical_contiguous() && mask.is_physical_contiguous());
megdnn_assert(m_param.eps > 0, "eps must be non-negative; got: %g", m_param.eps);
megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data));
megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data, mask));
return data.total_nr_elems();
}



+3 -3  dnn/src/common/lamb.cpp

@@ -7,9 +7,9 @@ void LAMBUpdate::deduce_layout(
const TensorLayout& m_t_1, const TensorLayout& v_t_1,
const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t,
TensorLayout& v_t, TensorLayout& new_param) {
m_t = TensorLayout(m_t_1);
v_t = TensorLayout(v_t_1);
new_param = TensorLayout(lamb_param);
m_t = m_t_1;
v_t = v_t_1;
new_param = lamb_param;
MEGDNN_MARK_USED_VAR(grad);
}



+4 -4  dnn/src/cuda/check_non_finite/opr_impl.cpp

@@ -26,14 +26,14 @@ size_t CheckNonFiniteImpl::_get_workspace_in_bytes() {
}

size_t CheckNonFiniteImpl::get_workspace_in_bytes(
const TensorNDArray& srcs, const TensorLayout&) {
const TensorLayoutArray& srcs, const TensorLayout&) {
m_size = 0;
for (const auto& src : srcs) {
m_size += DIVUP(src.layout.total_nr_elems(), total_nr_elems_max);
m_size += DIVUP(src.total_nr_elems(), total_nr_elems_max);
}
if (srcs.begin()->layout.dtype == dtype::Float32()) {
if (srcs.begin()->dtype == dtype::Float32()) {
return _get_workspace_in_bytes<dt_float32>();
} else if (srcs.begin()->layout.dtype == dtype::Float16()) {
} else if (srcs.begin()->dtype == dtype::Float16()) {
return _get_workspace_in_bytes<dt_float16>();
} else {
megdnn_log_warn("only support fp16 and fp32, fallback to fp32");


+1 -1  dnn/src/cuda/check_non_finite/opr_impl.h

@@ -19,7 +19,7 @@ public:
using CheckNonFinite::CheckNonFinite;

size_t get_workspace_in_bytes(
const TensorNDArray& srcs, const TensorLayout& dst) override;
const TensorLayoutArray& srcs, const TensorLayout& dst) override;

bool is_thread_safe() const override { return true; }



+2 -1  dnn/src/cuda/cond_take/opr_impl.cpp

@@ -20,7 +20,8 @@ WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) {
handle()->alignment_requirement()};
}

size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) {
size_t CondTakeImpl::get_workspace_in_bytes(
const TensorLayout& data, const TensorLayout&) {
return make_bundle(data.total_nr_elems()).total_size_in_bytes();
}



+2 -1  dnn/src/cuda/cond_take/opr_impl.h

@@ -15,7 +15,8 @@ public:
_megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
DynOutMallocPolicyCall malloc_policy) override;

size_t get_workspace_in_bytes(const TensorLayout& data) override;
size_t get_workspace_in_bytes(
const TensorLayout& data, const TensorLayout& mask) override;
};

} // namespace cuda


+2 -2  dnn/src/cuda/param_pack/opr_impl.cpp

@@ -6,8 +6,8 @@ namespace megdnn {
namespace cuda {

size_t ParamPackConcatImpl::get_workspace_in_bytes(
const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) {
return sizeof(size_t) * srcs.size();
const TensorShape&, const TensorShape& offsets, const TensorShape&) {
return sizeof(size_t) * (offsets.shape[0] / 2);
}

template <typename T>
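
A side note on the new workspace formula (illustrative, not part of the diff): the offsets tensor handed to ParamPackConcat presumably stores a begin/end pair per packed parameter, so offsets.shape[0] / 2 recovers the same parameter count that the removed srcs.size() provided, which is also why the tests below can pass {nr_params} as the srcs shape. A minimal sketch of the arithmetic with hypothetical values:

#include <cstddef>
#include <cstdio>

int main() {
    // For N packed parameters the offsets tensor holds 2 * N entries
    // (begin and end offset of each parameter in the flat buffer), so the
    // workspace needs one size_t slot per parameter.
    const std::size_t nr_params = 3;                // hypothetical
    const std::size_t offsets_len = 2 * nr_params;  // offsets.shape[0]
    const std::size_t workspace = sizeof(std::size_t) * (offsets_len / 2);
    std::printf("workspace bytes: %zu\n", workspace);  // 3 * sizeof(size_t)
    return 0;
}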


+1 -1  dnn/src/cuda/param_pack/opr_impl.h

@@ -12,7 +12,7 @@ public:
_megdnn_workspace workspace) override;

size_t get_workspace_in_bytes(
const TensorShapeArray& srcs, const TensorShape& table,
const TensorShape& srcs, const TensorShape& table,
const TensorShape& dst) override;

private:


+2 -1  dnn/src/naive/check_non_finite/opr_impl.h

@@ -13,7 +13,8 @@ public:

bool is_thread_safe() const override { return true; }

size_t get_workspace_in_bytes(const TensorNDArray&, const TensorLayout&) override {
size_t get_workspace_in_bytes(
const TensorLayoutArray&, const TensorLayout&) override {
m_size = 0;
return _get_workspace_in_bytes();
}


+2 -1  dnn/src/naive/cond_take/opr_impl.cpp

@@ -38,7 +38,8 @@ void copy_data(

} // anonymous namespace

size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) {
size_t CondTakeImpl::get_workspace_in_bytes(
const TensorLayout& data, const TensorLayout&) {
return (data.total_nr_elems() + 1) * sizeof(dt_int32);
}



+2 -1  dnn/src/naive/cond_take/opr_impl.h

@@ -11,7 +11,8 @@ class CondTakeImpl : public CondTake {
public:
using CondTake::CondTake;

size_t get_workspace_in_bytes(const TensorLayout& data) override;
size_t get_workspace_in_bytes(
const TensorLayout& data, const TensorLayout& mask) override;

Output exec(
_megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,


+1 -1  dnn/src/naive/param_pack/opr_impl.h

@@ -11,7 +11,7 @@ public:
_megdnn_workspace workspace) override;

size_t get_workspace_in_bytes(
const TensorShapeArray&, const TensorShape&, const TensorShape&) override {
const TensorShape&, const TensorShape&, const TensorShape&) override {
return 0;
}
};


+2 -2  dnn/src/rocm/param_pack/opr_impl.cpp

@@ -7,8 +7,8 @@ namespace megdnn {
namespace rocm {

size_t ParamPackConcatImpl::get_workspace_in_bytes(
const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) {
return sizeof(size_t) * srcs.size();
const TensorShape&, const TensorShape& offsets, const TensorShape&) {
return sizeof(size_t) * (offsets.shape[0] / 2);
}

template <typename T>


+1 -1  dnn/src/rocm/param_pack/opr_impl.h

@@ -12,7 +12,7 @@ public:
_megdnn_workspace workspace) override;

size_t get_workspace_in_bytes(
const TensorShapeArray& srcs, const TensorShape& table,
const TensorShape& srcs, const TensorShape& table,
const TensorShape& dst) override;

private:


+1 -1  dnn/test/common/cond_take.cpp

@@ -71,7 +71,7 @@ CondTakeTestcase::Result CondTakeTestcase::run(CondTake* opr) {
opr->param() = m_param;

DynOutMallocPolicyImpl malloc_policy(handle);
auto workspace_size = opr->get_workspace_in_bytes(data->layout);
auto workspace_size = opr->get_workspace_in_bytes(data->layout, mask->layout);
auto workspace_ptr = malloc_policy.alloc_workspace(workspace_size, nullptr);
auto result = opr->exec(
*data, *mask, {(dt_byte*)workspace_ptr, workspace_size}, &malloc_policy);


+6 -1  dnn/test/common/opr_proxy.h

@@ -205,9 +205,14 @@ struct OprProxy<CheckNonFinite> {
auto inps = tensors;
inps.pop_back();

TensorLayoutArray inp_layouts(inps.size());
std::transform(
inps.begin(), inps.end(), inp_layouts.begin(),
[](const TensorND& tensor) { return tensor.layout; });

WorkspaceWrapper W(
opr->handle(),
opr->get_workspace_in_bytes(inps, tensors.back().layout));
opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout));
opr->exec(inps, tensors.back(), W.workspace());
}
};


+1 -1  dnn/test/cuda/param_pack.cpp

@@ -95,7 +95,7 @@ void test_param_pack_concat(

test::WorkspaceWrapper workspace(
handle,
concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size}));
concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size}));
TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32()));

concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace());


+1 -1  dnn/test/rocm/param_pack.cpp

@@ -97,7 +97,7 @@ void test_param_pack_concat(

test::WorkspaceWrapper workspace(
handle,
concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size}));
concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size}));
TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32()));

concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace());


+12 -47  imperative/src/impl/blob_manager_impl.cpp

@@ -9,11 +9,8 @@ BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) {
blob = in_blob;
DeviceTensorStorage d_storage;
d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage);

h_storage = HostTensorStorage(blob->m_comp_node);

h_storage.ensure_size(blob->m_size);

h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size);
}

@@ -30,65 +27,36 @@ void BlobManagerImpl::unregister_blob(OwnedBlob* blob) {
}

void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) {
if (custom_allocator) {
blob->m_storage = custom_allocator(blob->m_comp_node, size);
if (m_custom_allocator) {
blob->m_storage = m_custom_allocator(blob->m_comp_node, size);
return;
}
// try alloc
MGB_TRY { alloc_direct(blob, size); }
// if fail, try defrag, alloc again
MGB_CATCH(MemAllocError&, {
if (!try_alloc_direct(blob, size)) {
mgb_log_warn("memory allocation failed for blob; try defragmenting");
defrag(blob->m_comp_node);
alloc_direct(blob, size);
});
}
}

void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) {
DeviceTensorStorage storage(blob->m_comp_node);
mgb_assert(blob->m_comp_node.valid());
DeviceTensorStorage storage(blob->m_comp_node);
storage.ensure_size(size);
blob->m_storage = storage.raw_storage();
}

DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag(
CompNode cn, TensorLayout& layout) {
DeviceTensorND dev_tensor;
if (custom_allocator) {
DeviceTensorStorage storage(cn);
size_t sz = layout.dtype.size(layout.total_nr_elems());
storage.reset(cn, sz, custom_allocator(cn, sz));
dev_tensor.reset(storage, layout);
return dev_tensor;
}
MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
MGB_CATCH(MemAllocError&, {
mgb_log_warn("memory allocation failed for workspace; try defragmenting");
defrag(cn);
dev_tensor = alloc_workspace(cn, layout);
});
return dev_tensor;
};

DeviceTensorND BlobManagerImpl::alloc_workspace(CompNode cn, TensorLayout layout) {
DeviceTensorStorage storage(cn);
storage.ensure_size(layout.dtype.size(layout.total_nr_elems()));
DeviceTensorND dev_tensor;
dev_tensor.reset(storage, layout);
return dev_tensor;
}

void BlobManagerImpl::set_allocator(allocator_t allocator) {
custom_allocator = allocator;
m_custom_allocator = allocator;
}

void BlobManagerImpl::defrag(const CompNode& cn) {
BlobSetWithMux* blobs_set_ptr;
{
auto& blobs_set_ptr = ([&]() -> auto& {
MGB_LOCK_GUARD(m_mtx);
blobs_set_ptr = &m_comp2blobs_map[cn];
}
MGB_LOCK_GUARD(blobs_set_ptr->mtx);
return m_comp2blobs_map[cn];
})();
MGB_LOCK_GUARD(blobs_set_ptr.mtx);
std::vector<BlobData> blob_data_arrary;
std::set<Blob::RawStorage> storage_set;

@@ -96,7 +64,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
size_t tot_sz = 0;

// copy to HostTensorStorage, and release
for (auto i : blobs_set_ptr->blobs_set) {
for (auto i : blobs_set_ptr.blobs_set) {
// skip if blob do not have m_storage
if (!i->m_storage)
continue;
@@ -153,9 +121,6 @@ struct BlobManagerStub : BlobManager {
void alloc_with_defrag(OwnedBlob* blob, size_t size) {
mgb_assert(0, "prohibited after global variable destruction");
};
DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) {
mgb_assert(0, "prohibited after global variable destruction");
};
void register_blob(OwnedBlob* blob) {
mgb_assert(0, "prohibited after global variable destruction");
};
@@ -163,7 +128,7 @@ struct BlobManagerStub : BlobManager {
void defrag(const CompNode& cn) {
mgb_assert(0, "prohibited after global variable destruction");
};
virtual void set_allocator(allocator_t allocator) {
void set_allocator(allocator_t allocator) {
mgb_assert(0, "prohibited after global variable destruction");
};
};
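
Two idioms in the rewritten blob_manager_impl.cpp deserve a note. First, try_alloc_direct (declared in blob_manager.h, whose diff is not shown on this page) presumably wraps alloc_direct in a catch of MemAllocError and returns whether the allocation succeeded, which is what lets the MGB_TRY / MGB_CATCH blocks collapse into plain if statements. Second, defrag() now looks up the per-comp-node blob set through an immediately-invoked lambda so the map access happens under m_mtx while the result is still returned by reference. A standalone sketch of that locking pattern, with illustrative names rather than MegEngine API:

#include <map>
#include <mutex>
#include <set>
#include <string>

struct BlobSet {
    std::mutex mtx;
    std::set<int> blobs;
};

std::mutex g_map_mtx;
std::map<std::string, BlobSet> g_per_device;  // hypothetical registry

void defrag(const std::string& device) {
    // Look up the per-device set while holding the map lock; the
    // immediately-invoked lambda returns a reference, so no raw pointer
    // has to be kept alive past the lock guard's scope.
    auto& blob_set = ([&]() -> auto& {
        std::lock_guard<std::mutex> lock(g_map_mtx);
        return g_per_device[device];
    })();
    std::lock_guard<std::mutex> lock(blob_set.mtx);
    // ... walk blob_set.blobs and compact their storage ...
}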


+3 -9  imperative/src/impl/blob_manager_impl.h

@@ -27,27 +27,21 @@ class BlobManagerImpl final : public BlobManager {

std::mutex m_mtx;
CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map;

void defrag(const CompNode& cn) override;
BlobManager::allocator_t m_custom_allocator;

void alloc_direct(OwnedBlob* blob, size_t size) override;

DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout);

BlobManager::allocator_t custom_allocator;

public:
static BlobManager* inst();

void alloc_with_defrag(OwnedBlob* blob, size_t size) override;

DeviceTensorND alloc_workspace_with_defrag(
CompNode cn, TensorLayout& layout) override;

void register_blob(OwnedBlob* blob) override;

void unregister_blob(OwnedBlob* blob) override;

void defrag(const CompNode& cn) override;

void set_allocator(allocator_t allocator) override;
};



+297 -45  imperative/src/impl/dnn_op_helper.h

@@ -1,79 +1,331 @@
#pragma once
#include <optional>
#include <type_traits>

#include "algo_chooser.h"
#include "megbrain/comp_node.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/imperative/blob_manager.h"
#include "megbrain/imperative/physical_tensor.h"
#include "megbrain/imperative/utils/helper.h"
#include "megbrain/imperative/utils/platform.h"
#include "megbrain/rdnn/management.h"

using namespace megdnn;
#include "megdnn/basic_types.h"

namespace mgb {
namespace imperative {

/*!
* \brief A struct for safely calling DNN oprs
* In some cases, op may be released before the complete of the execution
* This destructor will prevent this
* \brief Helps deduce layout and dtype
*/
template <typename Opr>
struct DnnOprCaller {
CompNode cn;
DeviceTensorND dev_tensor;
Workspace workspace;
mgb::opr::intl::UniqPtrWithCN<Opr> op;
class DnnOprDeducer {
private:
Opr* m_opr;

DnnOprCaller(CompNode cn) : cn(cn), op(std::move(create_operator(cn))) {}
public:
DnnOprDeducer(Opr* opr) : m_opr(opr) { mgb_assert(opr); }

static mgb::opr::intl::UniqPtrWithCN<Opr> create_operator(CompNode cn) {
return mgb::opr::intl::create_megdnn_opr<Opr>(cn);
// FIXME: maybe in-place style deduction works better
template <typename... TArgs>
TensorLayout deduce_layout(TArgs&&... args) {
static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
TensorLayout output_layout;
m_opr->deduce_layout(args..., output_layout);
return output_layout;
}

Workspace create_workspace(size_t sz) {
if (workspace.raw_ptr) {
mgb_throw(MegBrainError, "workspace should not be applicated many times");
}
if (sz) {
TensorLayout layout({sz}, dtype::Byte());
dev_tensor = Tensor::make(layout, cn)->dev_tensor();
workspace = megdnn::Workspace(
dev_tensor.raw_ptr(), dev_tensor.storage().size());
template <typename... TArgs>
TensorLayout deduce_layout_fallible(TArgs&&... args) {
static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
TensorLayout output_layout;
bool success = (args.ndim * ...) > 0;
if (success) {
m_opr->deduce_layout(args..., output_layout);
} else {
m_opr->deduce_dtype(args.dtype..., output_layout.dtype);
}
return workspace;
return output_layout;
}

~DnnOprCaller() {
template <size_t nr_outputs, typename... TArgs>
std::array<TensorLayout, nr_outputs> deduce_layouts(TArgs&&... args) {
static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
std::array<TensorLayout, nr_outputs> layouts;
std::apply(
[&](auto&&... outputs) { m_opr->deduce_layout(args..., outputs...); },
layouts);
return layouts;
}
};

/*!
* \brief Declare an abstract operator and initialize its param
*/
template <typename Opr>
class DnnOprStub {
private:
// TODO: make opr concrete
std::aligned_storage_t<sizeof(Opr), alignof(Opr)> m_storage;

using Param = typename Opr::Param;

private:
DnnOprStub() { new (&param()) Param(); }

public:
DnnOprStub(const Param& param) { this->param() = param; }

// undefined behavior
Opr& opr() { return *reinterpret_cast<Opr*>(&m_storage); }

auto& param() { return opr().param(); }

auto& param() const { return opr().param(); }

~DnnOprStub() { param().~Param(); }
};

/*!
* \brief Deduce layout without creating a concrete opr
*/
template <typename Opr>
class DnnOprHelper : public DnnOprStub<Opr>, public DnnOprDeducer<Opr> {
private:
using Stub = DnnOprStub<Opr>;
using Deducer = DnnOprDeducer<Opr>;

public:
DnnOprHelper(const typename Opr::Param& param)
: Stub(param), Deducer(&Stub::opr()) {}
};

// hold a concrete operator in given comp_node
template <typename Opr>
class DnnOprHolder {
private:
CompNode m_comp_node;
opr::intl::UniqPtrWithCN<Opr> m_opr =
opr::intl::create_megdnn_opr<Opr>(m_comp_node);

public:
DnnOprHolder(CompNode comp_node) : m_comp_node(comp_node) {}

auto& op() { return m_opr; }

auto comp_node() { return m_comp_node; }

auto& param() { return m_opr->param(); }

auto& param() const { return m_opr->param(); }

~DnnOprHolder() {
using DT = CompNode::DeviceType;
if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) {
CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
[p = op.release()] { delete p; });

if (m_comp_node.device_type() == DT::CPU &&
m_comp_node != CompNode::default_cpu()) {
CompNodeEnv::from_comp_node(m_comp_node)
.cpu_env()
.dispatch([p = m_opr.release()] { delete p; });
}
}
};

/*!
* \brief Prevent binary bloat
*/
class DnnOprCallerBase {
protected:
static auto&& get_layout(const megdnn::TensorND& tensor) { return tensor.layout; }

static auto get_layout(const megdnn::TensorNDArray& tensors) {
SmallVector<TensorLayout> layouts;
for (auto&& tensor : tensors) {
layouts.push_back(tensor.layout);
}
return layouts;
}
};

template <size_t OSize>
class MegDNNDynOutMallocImpl final : public megdnn::DynOutMallocPolicy {
using Output = std::array<TensorPtr, OSize>;
/*!
* \brief A struct for safely calling DNN oprs
*
* In some cases, the op may be released before the execution completes;
* this destructor prevents that
*/
template <typename Opr>
class DnnOprCaller final : public DnnOprHolder<Opr>,
public DnnOprDeducer<Opr>,
public DnnOprCallerBase {
private:
using Holder = DnnOprHolder<Opr>;
using Deducer = DnnOprDeducer<Opr>;
using Base = DnnOprCallerBase;

std::optional<DnnTensorND> m_workspace;
std::optional<megdnn::param::ExecutionPolicy> m_policy;

CompNode m_cn;
Output m_out;
megdnn::Workspace create_workspace(size_t sz) {
mgb_assert(
!m_workspace, "workspace asked more than once by op: %s",
demangled_typename<Opr>());
dt_byte* ptr = nullptr;
if (sz) {
TensorLayout layout({sz}, dtype::Byte());
m_workspace.emplace(
Tensor::make(layout, Holder::comp_node())->dnn_tensor());
ptr = reinterpret_cast<dt_byte*>(m_workspace->raw_ptr());
}
return {ptr, sz};
}

public:
MegDNNDynOutMallocImpl(CompNode cn) : m_cn{cn} {}

megdnn::TensorND alloc_output(
size_t id, DType dtype, const TensorShape& shape,
void* user_data) override {
TensorLayout m_layout(shape, dtype);
m_out[id] = Tensor::make(m_layout, m_cn);
return m_out[id]->dev_tensor().as_megdnn();
using Param = typename Opr::Param;

DnnOprCaller(CompNode cn) : Holder(cn), Deducer(Holder::op().get()) {}

DnnOprCaller(CompNode cn, const Param& param) : DnnOprCaller(cn) {
Holder::param() = param;
}

DnnOprCaller(CompNode cn, const Param& param, megdnn::param::ExecutionPolicy policy)
: DnnOprCaller(cn, param) {
m_policy.emplace(policy);
}

void* alloc_workspace(size_t sz, void* user_data) override {
return m_cn.alloc_device(sz);
/**
* \brief Convert TensorPtr args to megdnn::TensorND and call f
*
*/
template <typename TFunctor, typename... TArgs>
auto call_dnn(TFunctor&& f, TArgs&&... args) {
std::optional<SmallVector<std::shared_ptr<dt_byte>>> input_ptrs;
// recursive convert:
// 1. TensorPtr to DnnTensorND (subclass of megdnn::TensorND) ;
// 2. DeviceTensorND, HostTensorND to megdnn::TensorND ;
// 3. SmallVector of above to SmallVector<megdnn::TensorND> .
auto to_dnn = [&](auto&& arg, auto&& to_dnn) {
using T = decltype(arg);
if constexpr (std::is_convertible_v<T, TensorPtr>) {
return arg->dnn_tensor();
} else if constexpr (
std::is_convertible_v<T, DeviceTensorND> ||
std::is_convertible_v<T, HostTensorND>) {
return arg.as_megdnn();
} else if constexpr (
std::is_convertible_v<T, megdnn::TensorND> ||
std::is_convertible_v<T, SmallVector<megdnn::TensorND>>) {
return std::forward<T>(arg);
} else if constexpr (is_small_vector_v<std::decay_t<T>>) {
using TItem = std::decay_t<decltype(to_dnn(arg[0], to_dnn))>;
SmallVector<megdnn::TensorND> dnn_tensors;
for (auto&& tensor : arg) {
if constexpr (std::is_same_v<TItem, DnnTensorND>) {
if (!input_ptrs) {
input_ptrs.emplace();
}
auto dnn_tensor = to_dnn(tensor, to_dnn);
input_ptrs->push_back(std::move(dnn_tensor.reference));
dnn_tensors.push_back(std::move(dnn_tensor));
} else if constexpr (std::is_same_v<TItem, megdnn::TensorND>) {
dnn_tensors.push_back(to_dnn(tensor, to_dnn));
} else {
static_assert(!std::is_same_v<TItem, TItem>);
}
}
return dnn_tensors;
} else {
static_assert(!std::is_same_v<T, T>);
}
};
return f(to_dnn(std::forward<TArgs>(args), to_dnn)...);
}

void free_workspace(void* ptr, void* user_data) override { m_cn.free_device(ptr); }
// common execution (opr->exec(inputs..., outputs...))
template <typename... TArgs>
void exec(TArgs&&... args) {
call_dnn(
[this](auto&&... args) {
Holder::op()->exec(std::forward<decltype(args)>(args)...);
},
std::forward<TArgs>(args)...);
}

// execution fastrun opr
// (opr->exec(inputs..., outputs..., create_ws(setup_algo(...))))
template <typename... TArgs>
void exec_fastrun(TArgs&&... args) {
call_dnn(
[&](auto&&... args) {
using FixedTensorLayouts =
typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts;
SmallVector<megdnn::TensorND> dnn_inputs = {args...};
mgb_assert(m_policy, "policy not set");
size_t workspace_size = setup_algo<Opr>(
FixedTensorLayouts{args.layout...}, Holder::op().get(), 0,
false, false, Holder::comp_node(), *m_policy, false,
&dnn_inputs);
Holder::op()->exec(
std::forward<decltype(args)>(args)...,
create_workspace(workspace_size));
},
std::forward<TArgs>(args)...);
}

// execute with fixed workspace
// (opr->exec(input..., outputs..., create_ws(get_workspace_in_bytes(...))))
template <typename... TArgs>
void exec_with_ws(TArgs&&... args) {
call_dnn(
[&](auto&&... args) {
size_t workspace_size =
Holder::op()->get_workspace_in_bytes(get_layout(args)...);
Holder::op()->exec(
std::forward<decltype(args)>(args)...,
create_workspace(workspace_size));
},
std::forward<TArgs>(args)...);
}

TensorPtr at(size_t id) { return m_out[id]; }
// execute dynamic out opr
// (opr->exec(inputs..., outputs... create_ws(get_workspace_in_bytes(...)), alloc))
template <size_t nr_out, typename... TArgs>
auto exec_dynout(TArgs&&... args) {
struct Alloc final : public megdnn::DynOutMallocPolicy {
CompNode comp_node;
std::array<TensorPtr, nr_out> output_tensors;
std::array<std::optional<DnnTensorND>, nr_out> output_dnn_tensors;

public:
Alloc(CompNode comp_node) : comp_node(comp_node) {}
megdnn::TensorND alloc_output(
size_t id, DType dtype, const TensorShape& shape,
void* user_data) override {
TensorLayout layout(shape, dtype);
output_tensors[id] = Tensor::make(layout, comp_node);
output_dnn_tensors[id].emplace(
output_tensors[id]->dnn_tensor()); // pin output
return *output_dnn_tensors[id];
}

void* alloc_workspace(size_t sz, void* user_data) override {
mgb_assert(false);
}

void free_workspace(void* ptr, void* user_data) override {
mgb_assert(false);
}
} alloc{Holder::comp_node()};
call_dnn(
[&](auto&&... args) {
size_t workspace_size =
Holder::op()->get_workspace_in_bytes(get_layout(args)...);
Holder::op()->exec(
std::forward<decltype(args)>(args)...,
create_workspace(workspace_size), &alloc);
},
std::forward<TArgs>(args)...);
return alloc.output_tensors;
}
};

} // namespace imperative
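
The deduce_layouts helper above leans on std::apply to unpack a std::array of output layouts into the trailing reference parameters of deduce_layout, and call_dnn uses the matching C++17 tools (a generic lambda passed to itself for recursion plus if constexpr per argument type) to convert arbitrary tensor arguments. The std::apply trick works with any function that takes output references; a self-contained sketch in plain C++17, with no MegEngine types:

#include <array>
#include <cstddef>
#include <cstdio>
#include <tuple>

// Stand-in for an opr's deduce_layout(inputs..., outputs&...) signature.
void deduce(int src, int& mean, int& rstd) {
    mean = src / 2;
    rstd = src * 2;
}

// Mirrors DnnOprDeducer::deduce_layouts<nr_outputs>: build an array of
// default-constructed outputs and let std::apply pass each element by
// reference so deduce() can fill them in.
template <std::size_t NrOutputs, typename... Args>
std::array<int, NrOutputs> deduce_all(Args... args) {
    std::array<int, NrOutputs> outputs{};
    std::apply([&](auto&... out) { deduce(args..., out...); }, outputs);
    return outputs;
}

int main() {
    auto [mean, rstd] = deduce_all<2>(8);
    std::printf("%d %d\n", mean, rstd);  // prints: 4 16
    return 0;
}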


+10 -7  imperative/src/impl/interpreter/interpreter_impl.cpp

@@ -605,6 +605,7 @@ TensorInfo* ChannelImpl::alloc() {
void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc&& desc) {
m_valid_handle.insert(reinterpret_cast<Handle>(info));
MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name);
mgb_assert(desc.comp_node.valid(), "comp_node invalid");
info->status = TensorInfo::Allocated;
info->desc = std::move(desc);
}
@@ -831,6 +832,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
output_descs.push_back(i->desc);
}
} else {
// i may be null
validated = false;
}
// Here std::move is REQUIRED for removing duplicated references.
@@ -1064,17 +1066,16 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
if (in_worker) {
reserve_size(x->size());
}
MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); }
MGB_CATCH(MemAllocError&, {
if (!BlobManager::inst()->try_alloc_direct(x, x->size())) {
bool suc = false;
if (in_worker) {
while (!suc) {
if (!auto_evict(1)) {
break;
}
MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); }
MGB_CATCH(MemAllocError&, { continue; });
suc = true;
if (BlobManager::inst()->try_alloc_direct(x, x->size())) {
suc = true;
}
}
}
if (!suc) {
@@ -1086,9 +1087,11 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
imperative_log_profile_begin("defrag");
BlobManager::inst()->defrag(x->comp_node());
imperative_log_profile_end("defrag");
BlobManager::inst()->alloc_direct(x, x->size());
mgb_assert(
BlobManager::inst()->try_alloc_direct(x, x->size()),
"allocation failed after defrag");
}
});
}
set_log_level(pre_level);
}
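
Read past the interleaved old/new lines above and the retry logic is a straight loop: attempt the allocation, evict one tensor at a time while eviction still makes progress, and only then fall back to defragmentation with a hard assert. A compact, self-contained sketch of that control flow (toy stand-ins for try_alloc_direct and auto_evict, not MegEngine code):

#include <cassert>
#include <cstddef>
#include <vector>

static std::size_t free_bytes = 64;
static std::vector<std::size_t> evictable = {32, 32};  // toy evictable blobs

bool try_alloc(std::size_t size) {  // stands in for try_alloc_direct
    if (size > free_bytes)
        return false;
    free_bytes -= size;
    return true;
}

bool evict_one() {  // stands in for auto_evict(1)
    if (evictable.empty())
        return false;
    free_bytes += evictable.back();
    evictable.pop_back();
    return true;
}

void alloc_with_evict(std::size_t size, bool in_worker) {
    bool ok = try_alloc(size);
    if (!ok && in_worker) {
        while (!ok && evict_one()) {  // keep evicting while it helps
            ok = try_alloc(size);
        }
    }
    // the real code then defragments and asserts the retry succeeds
    assert(ok && "defrag fallback omitted in this sketch");
}

int main() {
    alloc_with_evict(96, true);  // succeeds after one eviction
    return 0;
}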



+18 -22  imperative/src/impl/ops/adaptive_pooling.cpp

@@ -75,13 +75,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& pool = static_cast<const AdaptivePooling&>(def);
auto&& pooling = def.cast_final_safe<AdaptivePooling>();
auto&& cn = inputs[0]->comp_node();

using TensorND = megdnn::TensorND;
auto&& src_layout = inputs[0]->layout();
TensorLayout dst_layout = output_descs[0].layout;
auto param_format = pool.format;
TensorLayout dst_layout{inputs[0]->dtype()};
auto param_format = pooling.format;
if (!validated) {
dst_layout.ndim = src_layout.ndim;
const dt_int32* oshp2d = nullptr;
@@ -91,7 +90,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
tshp1n = inputs[1]->layout().total_nr_elems() == 1;
oshp2d = tshp_nd->get_value().proxy_to_default_cpu().ptr<dt_int32>();
} else {
oshp2d = pool.shape.data();
oshp2d = pooling.shape.data();
}
if (param_format == opr::AdaptivePooling::Param::Format::NCHW) {
dst_layout[0] = src_layout[0];
@@ -108,15 +107,17 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
MegBrainError, "AdaptivePooling only support NCHW or NHWC format");
}
dst_layout.init_contiguous_stride();
} else {
dst_layout = output_descs[0].layout;
}

size_t IH, IW, OH, OW;
if (param_format == param::AdaptivePooling::Format::NCHW) {
if (param_format == megdnn::param::AdaptivePooling::Format::NCHW) {
IH = src_layout[2];
IW = src_layout[3];
OH = dst_layout[2];
OW = dst_layout[3];
} else if (param_format == param::AdaptivePooling::Format::NHWC) {
} else if (param_format == megdnn::param::AdaptivePooling::Format::NHWC) {
IH = src_layout[1];
IW = src_layout[2];
OH = dst_layout[1];
@@ -124,26 +125,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
} else {
mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format");
}
DnnOprCaller<megdnn::Pooling> dnn_opr(cn);
auto&& param = dnn_opr.op->param();
param.mode = pool.mode;
param.format = pool.format;

// adaptive_pooling param to pooling
auto&& param = megdnn::Pooling::Param();
param.mode = pooling.mode;
param.format = pooling.format;
param.pad_h = param.pad_w = 0;
param.stride_h = floor(IH / OH);
param.stride_w = floor(IW / OW);
param.stride_h = IH / OH;
param.stride_w = IW / OW;
param.window_h = IH - (OH - 1) * param.stride_h;
param.window_w = IW - (OW - 1) * param.stride_w;

TensorND src = inputs[0]->dnn_tensor();
DnnOprCaller<megdnn::Pooling> dnn_opr(cn, param, megdnn::param::ExecutionPolicy{});
auto src = inputs[0];
auto dst = Tensor::make(dst_layout, cn);

size_t sz = setup_algo<megdnn::Pooling>(
{src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
::megdnn::param::ExecutionPolicy{}, false);

auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(src, dst->dnn_tensor(), dnn_wk);

dnn_opr.exec_fastrun(inputs[0], dst);
return {dst};
}
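
The stride/window arithmetic above is how AdaptivePooling is lowered onto the plain Pooling kernel: stride = floor(I / O), now written as plain integer division instead of the float floor, and window = I - (O - 1) * stride, which guarantees the last window ends exactly at the input border. A quick worked check with illustrative sizes:

#include <cstdio>

int main() {
    // Hypothetical spatial sizes: input 7, requested adaptive output 3.
    const int IH = 7, OH = 3;
    const int stride_h = IH / OH;                   // floor(7 / 3) = 2
    const int window_h = IH - (OH - 1) * stride_h;  // 7 - 2 * 2 = 3
    // Last window starts at (OH - 1) * stride_h = 4 and covers [4, 7).
    std::printf("stride=%d window=%d\n", stride_h, window_h);
    return 0;
}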



+25 -62  imperative/src/impl/ops/batch_norm.cpp

@@ -145,79 +145,44 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
auto&& op_def = def.cast_final_safe<BatchNorm>();
auto&& comp_node = inputs[0]->comp_node();

using TensorND = megdnn::TensorND;
DnnOprCaller<megdnn::BN> dnn_opr(comp_node, op_def.param());

SmallVector<TensorND> inp_tensornds(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
inp_tensornds[i] = inputs[i]->dnn_tensor();
}

DnnOprCaller<megdnn::BN> dnn_opr(comp_node);
dnn_opr.op->param() = op_def.param();

TensorLayout src_layout = inputs[0]->layout();
TensorLayout scale_layout = inputs[1]->layout();
auto src_layout = inputs[0]->layout();
auto scale_layout = inputs[1]->layout();
bool empty_input = src_layout.is_empty();
size_t nr_inp = inputs.size();

size_t sz = 0, rsz = 0;

TensorLayout r_layout({rsz}, dtype::Byte());

if (!empty_input) {
sz = dnn_opr.op->get_workspace_in_bytes(
src_layout, src_layout, src_layout, src_layout, src_layout, src_layout,
src_layout, src_layout, src_layout);
rsz = dnn_opr.op->get_reserve_in_bytes(src_layout);

r_layout = TensorLayout({rsz}, dtype::Byte());
}
auto dnn_wk = dnn_opr.create_workspace(sz);
auto reserve = Tensor::make(r_layout, comp_node);
// size_t ws_size = 0, reserve_size = 0;
size_t reserve_size =
empty_input ? (size_t)0 : dnn_opr.op()->get_reserve_in_bytes(src_layout);

// alloc memory
// alloc outputs
auto y = Tensor::make(src_layout, comp_node);

auto save_mean = Tensor::make(scale_layout, comp_node);

auto save_variance = Tensor::make(scale_layout, comp_node);
auto reserve = Tensor::make(TensorLayout{{reserve_size}, dtype::Byte()}, comp_node);

if (op_def.fwd_mode == ::megdnn::param::BN::FwdMode::INFERENCE) {
if (!empty_input)
dnn_opr.op->exec(
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
inp_tensornds[3], inp_tensornds[4], save_mean->dnn_tensor(),
save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(),
dnn_wk);
if (!empty_input) {
dnn_opr.exec_with_ws(
inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], save_mean,
save_variance, reserve, y);
}
return {inputs[3], inputs[4], reserve, y};
} else {
if (nr_inp == 5) {
auto mean = Tensor::make(scale_layout, comp_node);

auto variance = Tensor::make(scale_layout, comp_node);

megdnn::RefPtr src_ptr1(
inp_tensornds[3].get_ref_ptr().get_ptr(), inputs[3]->offset());
megdnn::RefPtr dst_ptr1(
mean->dev_tensor().storage().get_ref_ptr(),
mean->dev_tensor().storage().offset(), false);
comp_node.peer_copy_to_ref(
comp_node, dst_ptr1, src_ptr1, scale_layout.span().high_byte);

megdnn::RefPtr src_ptr2(
inp_tensornds[4].get_ref_ptr().get_ptr(), inputs[4]->offset());
megdnn::RefPtr dst_ptr2(
variance->dev_tensor().storage().get_ref_ptr(),
variance->dev_tensor().storage().offset(), false);
comp_node.peer_copy_to_ref(
comp_node, dst_ptr2, src_ptr2, scale_layout.span().high_byte);

if (!empty_input)
dnn_opr.op->exec(
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
mean->dnn_tensor(), variance->dnn_tensor(),
save_mean->dnn_tensor(), save_variance->dnn_tensor(),
reserve->dnn_tensor(), y->dnn_tensor(), dnn_wk);
// FIXME
mean->dev_tensor().copy_from(inputs[3]->dev_tensor());
variance->dev_tensor().copy_from(inputs[4]->dev_tensor());

if (!empty_input) {
dnn_opr.exec_with_ws(
inputs[0], inputs[1], inputs[2], mean, variance, save_mean,
save_variance, reserve, y);
}

return {mean, variance, save_mean, save_variance, reserve, y};
}
@@ -227,11 +192,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
auto variance = Tensor::make(m_layout, comp_node);

if (!empty_input) {
dnn_opr.op->exec(
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
mean->dnn_tensor(), variance->dnn_tensor(), save_mean->dnn_tensor(),
save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(),
dnn_wk);
dnn_opr.exec_with_ws(
inputs[0], inputs[1], inputs[2], mean, variance, save_mean,
save_variance, reserve, y);
}

return {save_mean, save_variance, reserve, y};


+10 -17  imperative/src/impl/ops/cond_take.cpp

@@ -28,33 +28,26 @@ SmallVector<TensorPtr> apply_on_physical_tensor(

auto&& inp = inputs[0];
auto&& msk = inputs[1];
SmallVector<TensorPtr> out;
mgb_assert(
inp->layout().eq_shape(msk->layout()),
"input shape does not match mask shape");
mgb_assert(
msk->get_value().dtype().enumv() == DTypeEnum::Bool,
"mask dtype must be bool");
MegDNNDynOutMallocImpl<2> policy{inp->comp_node()};
if (inp->layout().is_empty()) {
// empty tensor
policy.alloc_output(0, inp->layout().dtype, {0}, nullptr);
policy.alloc_output(1, dtype::Int32(), {0}, nullptr);
return {
Tensor::make(TensorLayout{{0}, inp->dtype()}, inp->comp_node()),
Tensor::make(TensorLayout{{0}, dtype::Int32()}, inp->comp_node()),
};
} else {
DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node());
dnn_op.op->param().val = 1;

size_t sz = dnn_op.op->get_workspace_in_bytes(inp->layout());

auto dnn_workspace = dnn_op.create_workspace(sz);

dnn_op.op->exec(
inp->dev_tensor().as_megdnn(), msk->dev_tensor().as_megdnn(),
dnn_workspace, &policy);
// maybe we need to split CondTake
megdnn::CondTake::Param param;
param.val = 1;
DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node(), param);
auto&& [out0, out1] = dnn_op.exec_dynout<2>(inp, msk);
return {out0, out1};
}
out.push_back(policy.at(0));
out.push_back(policy.at(1));
return out;
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(


+95 -443  imperative/src/impl/ops/convolution.cpp

@@ -8,14 +8,7 @@

namespace mgb {
namespace imperative {

namespace {

size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) {
mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt);
return (inp + 2 * pad - flt) / stride + 1;
}

namespace convolution {
std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
auto* node = &node_->cast_final_safe<opr::Convolution>();
@@ -29,131 +22,23 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
inputs[0], inputs[1], conv.param(), conv.policy(), config);
}

TensorLayout do_shape_infer(
const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) {
auto&& conv = static_cast<const Convolution&>(def);
using Param = ::megdnn::param::Convolution;

auto img_ndim = src_ndim - 2;
mgb_assert(
img_ndim == 2,
"only 2D convolution is supported, and input should be 4-dim; "
"got input dim = %zu",
src_ndim);
size_t group = 1;
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
if (conv.sparse == Param::Sparse::DENSE) {
mgb_assert(
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4,
"bad filter ndim for dense convolution: "
"spatial_ndim=%zu filter_ndim=%zu",
img_ndim, filter.ndim);
group = 1;
flt_start = 0;
} else { // Param::Sparse::GROUP
mgb_assert(
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5,
"bad filter ndim for group convolution: "
"spatial_ndim=%zu filter_ndim=%zu",
img_ndim, filter.ndim);
// grp, oc, ic, dims[]
group = filter[0];
flt_start = 1;
}

uint32_t ic_block_size = 1, oc_block_size = 1;
size_t src_or_dst_c_pos = 0;
size_t src_or_dst_spatial_start = 0;
if (conv.format == Param::Format::NCHW) {
// filter should be (oc, ic, fh, fw)
flt_spatial_start = 2;
ocpg_pos = 0;
icpg_pos = 1;
src_or_dst_c_pos = 1;
src_or_dst_spatial_start = 2;
} else { // Param::Format::NHWC
// filter should be (oc, fh, fw, ic)
flt_spatial_start = 1;
ocpg_pos = 0;
icpg_pos = 3;
src_or_dst_c_pos = 3;
src_or_dst_spatial_start = 1;
}
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size;
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size;
uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2];
dilation[0] = conv.dilate_h;
dilation[1] = conv.dilate_w;
stride[0] = conv.stride_h;
stride[1] = conv.stride_w;
padding[0] = conv.pad_h;
padding[1] = conv.pad_w;
for (size_t i = 0; i < img_ndim; ++i) {
mgb_assert(
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i,
dilation[i]);
dilated_spatial[i] =
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1;
}
mgb_assert(
icpg * group == src[src_or_dst_c_pos],
"group conv invalid: input channel of Conv expect %zu, but got %zu\n"
"hint: weight may be changed by mistake\n",
icpg * group, src[src_or_dst_c_pos]);
TensorLayout dst{src.dtype};
dst.ndim = src_ndim;
dst[0] = src[0];
dst[src_or_dst_c_pos] = ocpg * group;
for (size_t i = 0; i < img_ndim; ++i) {
dst[i + src_or_dst_spatial_start] = infer_conv_shape(
src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i],
padding[i]);
}
dst.init_contiguous_stride();
return dst;
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
SmallVector<LogicalTensorDesc> dests(1);
auto&& desc = dests[0];
desc.comp_node = inputs[0].comp_node;

TensorLayout src = inputs[0].layout;
TensorLayout filter = inputs[1].layout;
size_t src_ndim = src.ndim;
if (src_ndim == 0 || filter.ndim == 0) {
desc.layout = TensorLayout{{}, src.dtype};
return {dests, false};
auto&& conv = def.cast_final_safe<Convolution>();
DnnOprHelper<megdnn::ConvolutionForward> dnn_opr(conv.param());
auto&& data = inputs[0].layout;
auto&& filter = inputs[1].layout;
TensorLayout output_layout{data.dtype};
if (data.ndim && filter.ndim) {
// deduce_layout won't override existing dtype
dnn_opr.opr().deduce_layout(data, filter, output_layout);
}

desc.layout = do_shape_infer(def, src_ndim, src, filter);
return {dests, true};
return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0};
}

SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
// create megdnn opr
auto&& conv = static_cast<const Convolution&>(def);
CompNode cn = inputs[0]->comp_node();

TensorLayout out_layout = output_descs[0].layout;
if (!validated)
out_layout = do_shape_infer(
def, inputs[0]->layout().ndim, inputs[0]->layout(),
inputs[1]->layout());

using TensorND = megdnn::TensorND;
SmallVector<TensorND> inp_tensornds(inputs.size() + 2);
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
for (unsigned i = 0; i < inputs.size(); ++i) {
inp_tensornds[i] = inputs[i]->dnn_tensor();
inp_shapes[i] = inputs[i]->layout();
}
oup_shapes[0] = out_layout;
DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn);
auto&& param = dnn_opr.op->param();
// Convolution::Param -> ConvBias::Param
auto conv_bias_param_from_convolution(const Convolution& conv) {
megdnn::ConvBias::Param param;
param.pad_h = conv.pad_h;
param.pad_w = conv.pad_w;
param.stride_h = conv.stride_h;
@@ -163,30 +48,37 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
param.sparse = conv.sparse;
param.compute_mode = conv.compute_mode;
param.format = conv.format;
return param;
}

// shape infer
TensorLayout empty_shp({0}, inputs[0]->dtype());
empty_shp.ndim = 0;

auto empty_bias = Tensor::make(empty_shp, cn);

inp_tensornds[2] = empty_bias->dnn_tensor();
inp_tensornds[3] = empty_bias->dnn_tensor();

size_t sz = setup_algo<megdnn::ConvBiasForward>(
{inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]},
dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false,
&inp_tensornds);
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
// create megdnn opr
auto&& conv = def.cast_final_safe<Convolution>();
CompNode cn = inputs[0]->comp_node();
auto&& param = conv_bias_param_from_convolution(conv);
DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn, param, conv.policy());

megdnn::TensorND empty_bias;
empty_bias.layout.dtype = inputs[0]->dtype();
empty_bias.layout.ndim = 0;

auto out_layout = [&] {
if (validated) {
return output_descs[0].layout;
} else {
TensorLayout out_layout{inputs[0]->dtype()};
dnn_opr.op()->deduce_layout(
inputs[0]->layout(), inputs[1]->layout(), empty_bias.layout,
empty_bias.layout, out_layout);
return out_layout;
}
}();

// alloc memory
auto out = Tensor::make(out_layout, cn);

auto dnn_wk = dnn_opr.create_workspace(sz);

// exeucte
dnn_opr.op->exec(
inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], inp_tensornds[3],
out->dnn_tensor(), nullptr, dnn_wk);
dnn_opr.exec_fastrun(inputs[0], inputs[1], empty_bias, empty_bias, out);
return {out};
}

@@ -243,155 +135,41 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
}
}

TensorLayout convbwd_do_shape_infer(
const OpDef& def, size_t diff_ndim, TensorLayout filter, TensorLayout diff,
CompNode cn) {
auto&& bwd_conv = static_cast<const ConvolutionBackwardData&>(def);
DnnOprCaller<megdnn::ConvolutionBackwardData> caller(cn);
auto&& dnn_opr = caller.op;
using Param = ::megdnn::param::Convolution;
// using Param1 = ::megdnn::param::ConvolutionBackwardData;

auto img_ndim = diff_ndim - 2;
mgb_assert(
img_ndim == 2,
"only 2D convolution is supported, and input should be 4-dim; "
"got input dim = %zu",
diff_ndim);
size_t group = 1;
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
if (bwd_conv.sparse == Param::Sparse::DENSE) {
mgb_assert(
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4,
"bad filter ndim for dense convolution: "
"spatial_ndim=%zu filter_ndim=%zu",
img_ndim, filter.ndim);
group = 1;
flt_start = 0;
} else { // Param::Sparse::GROUP
mgb_assert(
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5,
"bad filter ndim for group convolution: "
"spatial_ndim=%zu filter_ndim=%zu",
img_ndim, filter.ndim);
// grp, oc, ic, dims[]
group = filter[0];
flt_start = 1;
}

uint32_t ic_block_size = 1, oc_block_size = 1;
size_t src_or_dst_c_pos = 0;
size_t src_or_dst_spatial_start = 0;
if (bwd_conv.format == Param::Format::NCHW) {
// filter should be (oc, ic, fh, fw)
flt_spatial_start = 2;
ocpg_pos = 0;
icpg_pos = 1;
src_or_dst_c_pos = 1;
src_or_dst_spatial_start = 2;
} else { // Param::Format::NHWC
// filter should be (oc, fh, fw, ic)
flt_spatial_start = 1;
ocpg_pos = 0;
icpg_pos = 3;
src_or_dst_c_pos = 3;
src_or_dst_spatial_start = 1;
}
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size;
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size;
uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2];
dilation[0] = bwd_conv.dilate_h;
dilation[1] = bwd_conv.dilate_w;
stride[0] = bwd_conv.stride_h;
stride[1] = bwd_conv.stride_w;
padding[0] = bwd_conv.pad_h;
padding[1] = bwd_conv.pad_w;
for (size_t i = 0; i < img_ndim; ++i) {
mgb_assert(
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i,
dilation[i]);
dilated_spatial[i] =
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1;
}
mgb_assert(
ocpg * group == diff[src_or_dst_c_pos],
"group conv invalid: input channel of Conv expect %zu, but got %zu\n"
"hint: weight may be changed by mistake\n",
ocpg * group, diff[src_or_dst_c_pos]);
auto deduce = [](size_t out, size_t filter, size_t stride, size_t pad) {
auto i = (out - 1) * stride + filter;
mgb_assert(i > pad * 2);
return i - pad * 2;
};

DType dst_dtype = bwd_conv.dtype;
dnn_opr->deduce_dtype(filter.dtype, diff.dtype, dst_dtype);
TensorLayout dst{dst_dtype};
dst.ndim = diff_ndim;
dst[0] = diff[0];
dst[src_or_dst_c_pos] = icpg * group;
for (size_t i = 0; i < img_ndim; ++i) {
dst[i + src_or_dst_spatial_start] =
deduce(diff[i + src_or_dst_spatial_start], dilated_spatial[i],
stride[i], padding[i]);
}
dst.init_contiguous_stride();
return dst;
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
SmallVector<LogicalTensorDesc> dests(1);
auto&& desc = dests[0];
desc.comp_node = inputs[0].comp_node;

TensorLayout filter = inputs[0].layout;
TensorLayout diff = inputs[1].layout;
size_t diff_ndim = diff.ndim;
if (diff_ndim == 0 || filter.ndim == 0) {
desc.layout = TensorLayout{{}, diff.dtype};
return {dests, false};
auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>();
DnnOprHelper<megdnn::ConvolutionBackwardData> dnn_opr(convbwd.param());
// force set dtype
auto&& filter = inputs[0].layout;
auto&& diff = inputs[1].layout;
TensorLayout output_layout{convbwd.dtype};
if (filter.ndim && diff.ndim) {
// deduce_layout won't override existing dtype
dnn_opr.opr().deduce_layout(filter, diff, output_layout);
}

desc.layout =
convbwd_do_shape_infer(def, diff_ndim, filter, diff, inputs[0].comp_node);
return {dests, true};
return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0};
}

SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
// create megdnn opr
auto&& convbwd = static_cast<const ConvolutionBackwardData&>(def);
auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>();
CompNode cn = inputs[0]->comp_node();

TensorLayout out_layout = output_descs[0].layout;
if (!validated)
out_layout = convbwd_do_shape_infer(
def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(),
cn);

DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(
cn, convbwd.param(), convbwd.policy());
auto out_layout = [&] {
if (validated) {
return output_descs[0].layout;
} else {
TensorLayout out_layout{inputs[0]->dtype()};
dnn_opr.op()->deduce_layout(
inputs[0]->layout(), inputs[1]->layout(), out_layout);
return out_layout;
}
}();
auto out = Tensor::make(out_layout, cn);

using TensorND = megdnn::TensorND;
SmallVector<TensorND> inp_tensornds(inputs.size());
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
for (unsigned i = 0; i < inputs.size(); ++i) {
inp_tensornds[i] = inputs[i]->dnn_tensor();
inp_shapes[i] = inputs[i]->layout();
}
oup_shapes[0] = out_layout;
DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(cn);
dnn_opr.op->param() = convbwd.param();

size_t sz = setup_algo<megdnn::ConvolutionBackwardData>(
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false,
false, cn, convbwd.policy(), false, &inp_tensornds);

auto dnn_wk = dnn_opr.create_workspace(sz);

    // execute
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inputs[0], inputs[1], out);
return {out};
}
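
To summarize the rewritten backward-data path: a single DnnOprCaller now carries the computing node, the operator param and the execution policy, and the former setup_algo / create_workspace / exec sequence collapses into one exec_fastrun call. A minimal sketch of the resulting call site (the helper itself lives in imperative/src/impl/dnn_op_helper.h, outside this hunk, so the comments describe assumed behaviour):

// sketch of the new call site; exec_fastrun is assumed to pick an algorithm
// according to the policy, size and allocate the workspace, then run exec
DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(
        cn, convbwd.param(), convbwd.policy());
TensorLayout out_layout{inputs[0]->dtype()};  // seed the output dtype
if (validated) {
    out_layout = output_descs[0].layout;
} else {
    dnn_opr.op()->deduce_layout(inputs[0]->layout(), inputs[1]->layout(), out_layout);
}
auto out = Tensor::make(out_layout, cn);
dnn_opr.exec_fastrun(inputs[0], inputs[1], out);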

@@ -415,149 +193,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy());
}

TensorLayout do_shape_infer(
const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) {
auto&& conv = static_cast<const Convolution3D&>(def);
using Param = ::megdnn::param::Convolution3D;
auto img_ndim = src_ndim - 2;
mgb_assert(
img_ndim == 3,
"only 3D convolution is supported, and input should be 5-dim; "
"got input dim = %zu",
src_ndim);

size_t group = 1;
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
if (conv.sparse == Param::Sparse::DENSE) {
mgb_assert(
filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4,
"bad filter ndim for dense convolution: "
"spatial_ndim=%zu filter_ndim=%zu",
img_ndim, filter.ndim);
group = 1;
flt_start = 0;
} else { // Param::Sparse::GROUP
mgb_assert(
filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5,
"bad filter ndim for group convolution: "
"spatial_ndim=%zu filter_ndim=%zu",
img_ndim, filter.ndim);

// grp, oc, ic, dims[]
group = filter[0];
flt_start = 1;
}

uint32_t ic_block_size = 1, oc_block_size = 1;
size_t src_or_dst_c_pos = 0;
size_t src_or_dst_spatial_start = 0;
if (conv.format == Param::Format::NCDHW) {
// filter should be (oc, ic, fd, fh, fw)
flt_spatial_start = 2;
ocpg_pos = 0;
icpg_pos = 1;
src_or_dst_c_pos = 1;
src_or_dst_spatial_start = 2;
} else { // Param::Format::NDHWC
// filter should be (oc, fd, fh, fw, ic)
flt_spatial_start = 1;
ocpg_pos = 0;
icpg_pos = 4;
src_or_dst_c_pos = 4;
src_or_dst_spatial_start = 1;
}
size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size;
size_t icpg = filter[flt_start + icpg_pos] * ic_block_size;
uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3];
dilation[0] = conv.dilate_d;
dilation[1] = conv.dilate_h;
dilation[2] = conv.dilate_w;
stride[0] = conv.stride_d;
stride[1] = conv.stride_h;
stride[2] = conv.stride_w;
padding[0] = conv.pad_d;
padding[1] = conv.pad_h;
padding[2] = conv.pad_w;
for (size_t i = 0; i < img_ndim; ++i) {
mgb_assert(
dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i,
dilation[i]);
dilated_spatial[i] =
(filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1;
}
mgb_assert(
icpg * group == src[src_or_dst_c_pos],
"group conv invalid: input channel of Conv expect %zu, but got %zu\n"
"hint: weight may be changed by mistake\n",
icpg * group, src[src_or_dst_c_pos]);
TensorLayout dst{src.dtype};
dst.ndim = src_ndim;
dst[0] = src[0];
dst[src_or_dst_c_pos] = ocpg * group;
for (size_t i = 0; i < img_ndim; ++i) {
dst[i + src_or_dst_spatial_start] = infer_conv_shape(
src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i],
padding[i]);
}
dst.init_contiguous_stride();

return dst;
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
SmallVector<LogicalTensorDesc> dests(1);
auto&& desc = dests[0];
desc.comp_node = inputs[0].comp_node;

auto&& conv = def.cast_final_safe<Convolution3D>();
TensorLayout src = inputs[0].layout;
TensorLayout filter = inputs[1].layout;
size_t src_ndim = src.ndim;
if (src_ndim == 0 || filter.ndim == 0) {
desc.layout = TensorLayout{{}, src.dtype};
return {dests, false};
if (src.ndim == 0 || filter.ndim == 0) {
return {{{TensorLayout{src.dtype}, inputs[0].comp_node}}, false};
}
desc.layout = do_shape_infer(def, src_ndim, src, filter);
return {dests, true};
DnnOprHelper<megdnn::Convolution3DForward> dnn_opr(conv.param());
auto output = dnn_opr.deduce_layout(src, filter);
return {{{output, inputs[0].comp_node}}, false};
}
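
The split between the two wrappers is visible here: DnnOprHelper wraps only a param and is used on the shape-inference path, where no computing node is available, while DnnOprCaller is bound to a CompNode and used where tensors are allocated and kernels run. A brief usage sketch; the division of labour is inferred from these call sites:

// shape inference only: param in, layout out, nothing touches a device
DnnOprHelper<megdnn::Convolution3DForward> helper(conv.param());
TensorLayout out = helper.deduce_layout(src, filter);  // returned by value

// execution path: bound to a comp node, can allocate workspaces and run kernels
DnnOprCaller<megdnn::Convolution3D> caller(cn, conv.param(), conv.policy());
auto dst = Tensor::make(out, cn);
caller.exec_fastrun(inputs[0], inputs[1], dst);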

SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
// create megdnn opr
auto&& conv = static_cast<const Convolution3D&>(def);

TensorLayout out_layout = output_descs[0].layout;
if (!validated)
out_layout = do_shape_infer(
def, inputs[0]->layout().ndim, inputs[0]->layout(),
inputs[1]->layout());

using TensorND = megdnn::TensorND;
auto&& conv = def.cast_final_safe<Convolution3D>();
CompNode cn = inputs[0]->comp_node();
SmallVector<TensorND> inp_tensornds(inputs.size());
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
for (unsigned i = 0; i < inputs.size(); ++i) {
inp_tensornds[i] = inputs[i]->dnn_tensor();
inp_shapes[i] = inputs[i]->layout();
}
oup_shapes[0] = out_layout;
DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn);
dnn_opr.op->param() = conv.param();

// shape infer
size_t sz = setup_algo<megdnn::Convolution3D>(
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false,
false, cn, conv.policy(), false, &inp_tensornds);

DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn, conv.param(), conv.policy());
auto out_layout = [&] {
if (validated) {
return output_descs[0].layout;
} else {
return dnn_opr.deduce_layout(inputs[0]->layout(), inputs[1]->layout());
}
}();
// alloc memory
auto out = Tensor::make(out_layout, cn);

auto dnn_wk = dnn_opr.create_workspace(sz);

    // execute
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inputs[0], inputs[1], out);
return {out};
}

@@ -579,51 +244,38 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
inputs.size() == 2,
"inputs num of conv_transpose3d should be 2 but you give %zu",
inputs.size());

auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>();
auto&& weight = inputs[0];
auto&& diff = inputs[1];
auto& cn = weight.comp_node;

if (weight.layout.ndim == 0 || diff.layout.ndim == 0) {
return {{{TensorLayout{weight.layout.dtype}, cn, {}}}, false};
if (!(weight.layout.ndim && diff.layout.ndim)) {
return {{{TensorLayout{weight.layout.dtype}, weight.comp_node}}, false};
}

TensorLayout oup_layout;
megdnn::Convolution3DBackwardData::deduce_layout_impl(
weight.layout, diff.layout, op_def.param(), oup_layout);
return {{{oup_layout, cn, {}}}, true};
DnnOprHelper<megdnn::Convolution3DBackwardData> dnn_opr(op_def.param());
auto oup_layout = dnn_opr.deduce_layout(weight.layout, diff.layout);
return {{{oup_layout, weight.comp_node}}, true};
}

SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>();
auto&& conv = def.cast_final_safe<Convolution3DBackwardData>();
auto cn = inputs[0]->comp_node();

auto&& wlayout = inputs[0]->layout();
auto&& dlayout = inputs[1]->layout();

DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn);
auto&& dnn_opr = caller.op;
dnn_opr->param() = op_def.param();
DnnOprCaller<megdnn::Convolution3DBackwardData> dnn_op(
cn, conv.param(), conv.policy());

TensorLayout& oup_layout = output_descs[0].layout;
if (!validated) {
megdnn::Convolution3DBackwardData::deduce_layout_impl(
wlayout, dlayout, op_def.param(), oup_layout);
}
auto oup_layout = [&] {
if (validated) {
return output_descs[0].layout;
} else {
return dnn_op.deduce_layout(wlayout, dlayout);
}
}();
auto oup = Tensor::make(oup_layout, cn);

SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
inp_tensornds[0] = inputs[0]->dnn_tensor();
inp_tensornds[1] = inputs[1]->dnn_tensor();
size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>(
{wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
op_def.policy(), false, &inp_tensornds);
auto dnn_wk = caller.create_workspace(wk_size);

dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup->dnn_tensor(), dnn_wk);
dnn_op.exec_fastrun(inputs[0], inputs[1], oup);
return {oup};
}



+ 36 - 47  imperative/src/impl/ops/elemwise.cpp

@@ -94,52 +94,44 @@ void apply_on_device_tensornd(
mgb_assert(
inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually",
trait.name, trait.arity, inputs.size());
DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node());
opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op);
DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node(), {op_def.mode});
opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op());
}

SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto comp_node = inputs[0]->comp_node();
auto dtype = inputs[0]->dtype();
using Mode = Elemwise::Mode;
using TensorND = megdnn::TensorND;
auto&& op_def = def.cast_final_safe<Elemwise>();
SmallVector<TensorND> inp_tensornds;
TensorShapeArray inp_shapes(inputs.size());
inp_tensornds.reserve(inputs.size());

TensorLayout layout{inputs[0]->layout().dtype};
bool is_empty = false;
for (unsigned i = 0; i < inputs.size(); ++i) {
if (inputs[i]->layout().is_empty()) {
is_empty = true;
}
inp_tensornds.push_back(inputs[i]->dnn_tensor());
inp_shapes[i] = inputs[i]->layout();
auto mode = op_def.mode;
TensorShapeArray input_shapes;
input_shapes.reserve(inputs.size());
for (auto&& input : inputs) {
input_shapes.push_back(input->shape());
}
megdnn::Elemwise::deduce_shape(inp_shapes, layout);
layout.init_contiguous_stride();

auto out = Tensor::make(layout, comp_node);

if (is_empty) {
return {out};
// deduce_shape is static and fast
TensorLayout output_layout{dtype};
// TODO: deduce_layout directly
megdnn::Elemwise::deduce_shape(input_shapes, output_layout);
output_layout.init_contiguous_stride();
auto output = Tensor::make(output_layout, comp_node);
if (output_layout.is_empty()) {
return {output};
}
DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node);

dnn_opr.op->param() = op_def.param();
if (dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD3 ||
dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD4 ||
(inp_tensornds.size() &&
inp_tensornds[0].layout.dtype.category() == DTypeCategory::QUANTIZED)) {
opr::Elemwise::perform_dnn(
comp_node, out->dnn_tensor(), inp_tensornds, dnn_opr.op);
DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node, op_def.param());
if (mode == Mode::FUSE_MUL_ADD3 || mode == Mode::FUSE_MUL_ADD4 ||
dtype.category() == DTypeCategory::QUANTIZED) {
dnn_opr.call_dnn(
[&](auto&& inputs, auto&& output) {
opr::Elemwise::perform_dnn(comp_node, output, inputs, dnn_opr.op());
},
inputs, output);
} else {
dnn_opr.op->exec(inp_tensornds, out->dnn_tensor());
dnn_opr.exec(inputs, output);
}

return {out};
return {output};
}
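
Two conveniences show up in this hunk: exec accepts the whole SmallVector of input tensors plus the output directly, and call_dnn hands the already-converted megdnn tensors to a lambda for the cases (FUSE_MUL_ADD3/4 and quantized dtypes) that still route through opr::Elemwise::perform_dnn. The conversion is assumed to mirror what the removed lines did by hand:

// assumed expansion of dnn_opr.exec(inputs, output) for Elemwise
SmallVector<megdnn::TensorND> dnn_inputs;
dnn_inputs.reserve(inputs.size());
for (auto&& input : inputs) {
    dnn_inputs.push_back(input->dnn_tensor());  // DnnTensorND keeps storage alive
}
dnn_opr.op()->exec(dnn_inputs, output->dnn_tensor());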

MGB_DEFINE_OPR_CLASS(
@@ -179,7 +171,7 @@ protected:
return ret;
}
void create_megdnn_opr() override {
auto opr = DnnOprCaller<megdnn::Elemwise>::create_operator(comp_node());
auto opr = mgb::opr::intl::create_megdnn_opr<megdnn::Elemwise>(comp_node());
opr->param().mode = m_param.mode;
set_megdnn_opr(std::move(opr));
}
@@ -243,22 +235,19 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor(
"This inplace modification may change the elements of other tensors. "
"Fallback to non-inplace update.");

DeviceTensorStorage storage;
storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage());
storage = storage.sub(dest->offset());
DeviceTensorND dv;
dv.reset(storage, dest->layout());

DeviceTensorND dv_new;
dv_new.copy_from(dv);
dest = Tensor::make(dv_new);
auto dest_layout = inputs[0]->layout();
dest_layout.init_contiguous_stride();
auto new_dest = Tensor::make(dest_layout, inputs[0]->comp_node());
new_dest->dev_tensor().copy_from(dest->dev_tensor());
dest = new_dest;
}
auto tensor_to_scalar = [](const TensorPtr& tensor) -> float {
return *tensor->get_value().ptr<float>();
};
DnnOprCaller<megdnn::AddUpdate> caller{dest->comp_node()};
caller.op->param() = {tensor_to_scalar(alpha), tensor_to_scalar(beta)};
caller.op->exec(dest->dev_tensor().as_megdnn(), delta->dev_tensor().as_megdnn());
DnnOprCaller<megdnn::AddUpdate> caller{
dest->comp_node(), {tensor_to_scalar(alpha), tensor_to_scalar(beta)}};
caller.exec(dest, delta);
// FIXME: inplace update host value
return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())};
}



+ 11 - 30  imperative/src/impl/ops/indexing.cpp

@@ -67,10 +67,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
auto&& op = def.cast_final_safe<IndexingOneHot>();
auto&& inp = inputs[0];
auto&& index = inputs[1];
TensorLayout layout = inp->layout();
TensorLayout index_layout = index->layout();
DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node());
auto&& indexing_one_hot_param = dnn_op.op->param();
auto&& layout = inp->layout();
auto&& index_layout = index->layout();
int real_axis = static_cast<int>(op.axis);
if (real_axis < 0) {
real_axis += static_cast<int>(layout.ndim);
@@ -79,16 +77,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
0 <= real_axis && real_axis < static_cast<int>(layout.ndim),
"Dimension out of range (expected to be in range of [%d, %d], but got %d)",
0, static_cast<int>(layout.ndim) - 1, op.axis);
indexing_one_hot_param = real_axis;
TensorLayout tlayout;
dnn_op.op->deduce_layout(layout, index_layout, tlayout);
TensorPtr out = Tensor::make(tlayout, inp->comp_node());
megdnn::TensorND in = inp->dnn_tensor();
megdnn::TensorND ind = index->dnn_tensor();
size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout);

auto dnn_workspace = dnn_op.create_workspace(sz);
dnn_op.op->exec(in, ind, out->dnn_tensor(), dnn_workspace);
DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node(), real_axis);
auto tlayout = dnn_op.deduce_layout(layout, index_layout);
auto out = Tensor::make(tlayout, inp->comp_node());
dnn_op.exec_with_ws(inp, index, out);
return {out};
}
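
For fixed-arity operators that need scratch memory, exec_with_ws replaces the explicit get_workspace_in_bytes / create_workspace / exec triple removed here; the workspace size is assumed to be derived from the layouts of the arguments themselves. Roughly:

// assumed expansion of dnn_op.exec_with_ws(inp, index, out) for IndexingOneHot
auto in = inp->dnn_tensor();
auto ind = index->dnn_tensor();
auto dst = out->dnn_tensor();
size_t sz = dnn_op.op()->get_workspace_in_bytes(in.layout, ind.layout, dst.layout);
auto dnn_wk = dnn_op.create_workspace(sz);
dnn_op.op()->exec(in, ind, dst, dnn_wk);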

@@ -105,15 +97,14 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) {
mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs");
auto comp_node = input_descs[0].comp_node;
TensorLayout src = input_descs[0].layout, index = input_descs[1].layout;
auto&& src = input_descs[0].layout;
auto&& index = input_descs[1].layout;
mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32");

if (!src.ndim) {
return {{{{{}, src.dtype}, comp_node}}, false};
}
mgb_assert(src.is_contiguous(), "src should be contiguous");
return {{input_descs[0]}, true};
return {{{src, comp_node}}, true};
}

auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
@@ -136,25 +127,15 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
auto&& index = inputs[1];
auto&& sub = inputs[2];
TensorLayout layout = inp->layout();
TensorLayout index_layout = index->layout();
TensorLayout tlayout = sub->layout();
mgb_assert(layout.is_contiguous());
DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node());
auto&& indexing_one_hot_param = dnn_op.op->param();
int real_axis = static_cast<int>(op.axis);
if (real_axis < 0) {
real_axis += static_cast<int>(layout.ndim);
}
indexing_one_hot_param = real_axis;
DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node(), real_axis);
TensorPtr out = Tensor::make(layout, inp->comp_node());
out->dev_tensor().copy_from_fixlayout(inp->dev_tensor());
megdnn::TensorND in = inp->dnn_tensor();
megdnn::TensorND ind = index->dnn_tensor();
megdnn::TensorND su = sub->dnn_tensor();

size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout);
auto dnn_workspace = dnn_op.create_workspace(sz);
dnn_op.op->exec(out->dnn_tensor(), ind, su, dnn_workspace);
dnn_op.exec_with_ws(out, index, sub);
return {out};
}



+ 5 - 6  imperative/src/impl/ops/io_remote.cpp

@@ -54,14 +54,15 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv(
TensorPtr megray_recv_tensor(
std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout,
CompNode cn, uint32_t rank_from) {
DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, layout);
auto out = Tensor::make(layout, cn);
auto dnn_out = out->dnn_tensor();
auto megray_ctx = mgb::opr::get_megray_context(cn);
size_t data_size = layout.total_nr_elems();
auto status = megray_comm->recv(
out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype),
dnn_out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype),
rank_from, megray_ctx);
mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed");
return Tensor::make(out);
return out;
}

void megray_send_tensor(
@@ -105,9 +106,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor_remote_send(
mgb_assert(megray_comm != nullptr);
megray_send_tensor(megray_comm, inputs[0], op.rank_to);
TensorLayout layout({0}, inputs[0]->dtype());
DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(
inputs[0]->comp_node(), layout);
return {Tensor::make(out)};
return {Tensor::make(layout, inputs[0]->comp_node())};
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv(


+ 10 - 19  imperative/src/impl/ops/lamb.cpp

@@ -21,14 +21,17 @@ SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) {
mgb_assert(input_descs.size() == 4, "IndexingOneHot expects 4inputs");

auto comp_node = input_descs[0].comp_node;
auto comp_node1 = input_descs[1].comp_node;
auto comp_node2 = input_descs[2].comp_node;
TensorLayout m_t_1 = input_descs[0].layout, v_t_1 = input_descs[1].layout,
lamb_param = input_descs[2].layout, grad = input_descs[3].layout;

TensorLayout new_param = lamb_param, m_t = m_t_1, v_t = v_t_1;
auto&& m_t_1 = input_descs[0].layout;
auto&& v_t_1 = input_descs[1].layout;
auto&& lamb_param = input_descs[2].layout;
auto&& grad = input_descs[3].layout;
MGB_MARK_USED_VAR(grad);
auto&& new_param = lamb_param;
auto&& m_t = m_t_1;
auto&& v_t = v_t_1;
return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true};
}

@@ -46,23 +49,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout lamb_param_layout{lamb_param->layout()};

auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node());

auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node());

auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node());

DnnOprCaller<megdnn::LAMBUpdate> caller{lamb_param->comp_node()};
size_t sz = caller.op->get_workspace_in_bytes(
m_t_1->layout(), v_t_1->layout(), lamb_param->layout(), grad->layout(),
m_t->layout(), v_t->layout(), new_param->layout());

auto dnn_workspace = caller.create_workspace(sz);
caller.op->param() = op.param();
caller.op->exec(
m_t_1->dev_tensor().as_megdnn(), v_t_1->dev_tensor().as_megdnn(),
lamb_param->dev_tensor().as_megdnn(), grad->dev_tensor().as_megdnn(),
m_t->dnn_tensor(), v_t->dnn_tensor(), new_param->dnn_tensor(),
dnn_workspace);
DnnOprCaller<megdnn::LAMBUpdate> dnn_opr{lamb_param->comp_node(), op.param()};
dnn_opr.exec_with_ws(m_t_1, v_t_1, lamb_param, grad, m_t, v_t, new_param);
return {m_t, v_t, new_param};
}



+ 16 - 27  imperative/src/impl/ops/layer_norm.cpp

@@ -29,11 +29,11 @@ cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& in

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
auto&& op_def = def.cast_final_safe<LayerNorm>();
auto&& layer_norm = def.cast_final_safe<LayerNorm>();
size_t nr_inp = inputs.size();
auto p = op_def.param();
auto affine = layer_norm.affine;
mgb_assert(
(nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine),
(nr_inp == 3 && affine) || (nr_inp == 1 && !affine),
"num of inputs of pooling should be 1 or 3 but you give %zu",
inputs.size());

@@ -47,9 +47,9 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
false};
}

TensorLayout oup_layout, mean_layout, rstd_layout;
megdnn::LayerNorm::deduce_layout_fwd_impl(
inp.layout, p, oup_layout, mean_layout, rstd_layout);
DnnOprHelper<megdnn::LayerNorm> dnn_opr(layer_norm.param());
auto&& [oup_layout, mean_layout, rstd_layout] =
dnn_opr.deduce_layouts<3>(inp.layout, TensorLayout{}, TensorLayout{});
return {{{oup_layout, inp_cn, {}},
{mean_layout, inp_cn, {}},
{rstd_layout, inp_cn, {}}},
@@ -69,32 +69,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inputs.size());

auto cn = inputs[0]->comp_node();
DnnOprCaller<megdnn::LayerNorm> caller(cn);
auto&& dnn_opr = caller.op;
dnn_opr->param() = p;
DnnOprCaller<megdnn::LayerNorm> caller(cn, op_def.param());

TensorLayout oup_layout, mean_layout, rstd_layout;
megdnn::LayerNorm::deduce_layout_fwd_impl(
inputs[0]->dnn_tensor().layout, p, oup_layout, mean_layout, rstd_layout);
auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>(
inputs[0]->layout(), TensorLayout{}, TensorLayout{});

auto out = Tensor::make(oup_layout, cn);

auto mean = Tensor::make(mean_layout, cn);

auto rstd = Tensor::make(rstd_layout, cn);

auto wk_size = caller.op->get_workspace_in_bytes(
inputs[0]->dnn_tensor().layout,
p.affine ? inputs[1]->dnn_tensor().layout : TensorLayout(),
p.affine ? inputs[2]->dnn_tensor().layout : TensorLayout(), oup_layout,
mean_layout, rstd_layout);
auto dnn_wk = caller.create_workspace(wk_size);

caller.op->exec(
inputs[0]->dnn_tensor(),
p.affine ? inputs[1]->dnn_tensor() : megdnn::TensorND(),
p.affine ? inputs[2]->dnn_tensor() : megdnn::TensorND(), out->dnn_tensor(),
mean->dnn_tensor(), rstd->dnn_tensor(), dnn_wk);
if (p.affine) {
caller.exec_with_ws(inputs[0], inputs[1], inputs[2], out, mean, rstd);
} else {
megdnn::TensorND empty_dnn;
caller.exec_with_ws(inputs[0], empty_dnn, empty_dnn, out, mean, rstd);
}
return {out, mean, rstd};
}
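
LayerNorm has three outputs, so the helper also exposes deduce_layouts<N>, assumed to forward to the operator's deduce_layout with N output layouts as out-parameters and return them as a std::array that unpacks with structured bindings; the empty TensorLayout{} arguments stand in for weight and bias when affine is off. Usage as above:

// three output layouts at once; TensorLayout{} marks the optional weight/bias
auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>(
        inputs[0]->layout(), TensorLayout{}, TensorLayout{});
auto out = Tensor::make(oup_layout, cn);
auto mean = Tensor::make(mean_layout, cn);
auto rstd = Tensor::make(rstd_layout, cn);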

@@ -105,4 +94,4 @@ OP_TRAIT_REG(LayerNorm, LayerNorm)
.fallback();

} // namespace layer_norm
} // namespace mgb::imperative
} // namespace mgb::imperative

+ 12 - 34  imperative/src/impl/ops/matmul.cpp

@@ -24,7 +24,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
auto dim1 = matmul.dimA, dim2 = matmul.dimB;

auto cn = inputs[0]->comp_node();
using Desc = opr::AxisAddRemove::AxisDesc;
using IndexDesc = opr::Subtensor::IndexDesc;
OperatorNodeConfig config{matmul.make_name(), cn};

@@ -104,9 +103,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
dim1 = dim2 = 2;
}

DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node);
dnn_opr.op->param() = matmul.param();
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param());
dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);

if (dim1 == 0 || dim2 == 0) {
return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false};
@@ -143,8 +141,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
SmallVector<TensorND> inp_tensornds(inputs.size());
TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout();

DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn);
dnn_opr.op->param() = matmul.param();
DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn, matmul.param(), matmul.policy());

if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward
for (size_t i = 1; i + 1 < layout1.ndim; ++i) {
@@ -160,7 +157,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
}

DType dst_dtype;
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);

// only matters when layout1 has dim 2
if (matmul.transposeA)
@@ -229,13 +226,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inp_tensornds[0].layout = layout_a;
inp_tensornds[1].layout = layout_b;
}
size_t sz = setup_algo<megdnn::MatrixMul>(
{layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
matmul.policy(), false, &inp_tensornds);
auto out = Tensor::make(dst_layout, cn);
auto dnn_wk = dnn_opr.create_workspace(sz);

dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out);
return {out->sub(0, real_dst_layout)};
}

@@ -266,7 +258,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
auto dim1 = matmul.dimA, dim2 = matmul.dimB;

auto cn = inputs[0]->comp_node();
using Desc = opr::AxisAddRemove::AxisDesc;
using IndexDesc = opr::Subtensor::IndexDesc;
OperatorNodeConfig config{matmul.make_name(), cn};

@@ -343,9 +334,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(

DType dst_dtype;

DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node);
dnn_opr.op->param() = matmul.param();
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param());
dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);

if (dim1 == 0 || dim2 == 0) {
return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false};
@@ -386,10 +376,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout();
size_t dim1 = layout1.ndim, dim2 = layout2.ndim;

DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn);
dnn_opr.op->param() = matmul.param();
DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn, matmul.param(), matmul.policy());
DType dst_dtype;
dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);
dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype);

TensorShape tshp, batch_shp;
size_t j = 0;
@@ -473,14 +462,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inp_tensornds[1] = inp2->dnn_tensor();
inp_tensornds[1].layout = layout2;

size_t sz = setup_algo<megdnn::BatchedMatrixMul>(
{layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
matmul.policy(), false, &inp_tensornds);

auto out = Tensor::make(dst_layout, cn);

auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out);

shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2];
shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1];
@@ -533,7 +517,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout oup_layout{inputs[0]->dtype()};
auto inp1_tensor = inputs[0]->dnn_tensor();
auto inp2_tensor = inputs[1]->dnn_tensor();
dnn_opr.op->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout);
oup_layout = dnn_opr.deduce_layout(inp1_tensor.layout, inp2_tensor.layout);

if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) {
auto out = Tensor::make(oup_layout, comp_node);
@@ -543,14 +527,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
return {out};
}

auto sz = dnn_opr.op->get_workspace_in_bytes(
inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout);

auto out = Tensor::make(oup_layout, comp_node);

auto dnn_wk = dnn_opr.create_workspace(sz);

dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_with_ws(inp_tensornds[0], inp_tensornds[1], out);

return {out};
}


+ 14 - 21  imperative/src/impl/ops/misc.cpp

@@ -17,27 +17,18 @@ SymbolVarArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
size_t size = inputs.size();
auto&& op = def.cast_final_safe<CheckNonFinite>();
SmallVector<TensorPtr> outputs(size + 1);
outputs[size] = Tensor::make(
TensorLayout(TensorShape({1}), dtype::Int32()), inputs[0]->comp_node());

auto dest = outputs[size];
auto cn = dest->comp_node();
DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(cn);
SmallVector<megdnn::TensorND> srcs(size);
// copy an outputs to the dnn for inplace
for (size_t i = 0; i < size; ++i) {
outputs[i] = Tensor::make(inputs[i]->layout(), inputs[0]->comp_node());
outputs[i]->dev_tensor().copy_from_fixlayout(inputs[i]->dev_tensor());
srcs[i] = outputs[i]->dev_tensor().as_megdnn();
auto comp_node = inputs[0]->comp_node();
auto dest = Tensor::make(TensorLayout({1}, dtype::Int32()), comp_node);
SmallVector<TensorPtr> outputs;
outputs.reserve(inputs.size() + 1);
for (auto&& input : inputs) {
outputs.push_back(Tensor::make(input->layout(), comp_node));
outputs.back()->dev_tensor().copy_from_fixlayout(input->dev_tensor());
}
megdnn::CheckNonFinite::Param param({op.scale});
dnn_opr.op->param() = param;
size_t sz = dnn_opr.op->get_workspace_in_bytes(srcs, dest->layout());
auto dnn_wk = dnn_opr.create_workspace(sz);
dnn_opr.op->exec(srcs, dest->dnn_tensor(), dnn_wk);
DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(comp_node, {op.scale});
dnn_opr.exec_with_ws(outputs, dest);
outputs.push_back(dest);
return outputs;
}

@@ -45,13 +36,15 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
size_t size = inputs.size();
SmallVector<LogicalTensorDesc> dests(size + 1);
bool validated = true;
for (size_t i = 0; i < size; ++i) {
dests[i].comp_node = inputs[i].comp_node;
dests[i].layout = inputs[i].layout;
validated &= bool(dests[i].layout.ndim);
}
dests[size].comp_node = inputs[0].comp_node;
dests[size].layout = TensorLayout(TensorShape({1}), dtype::Int32());
return {dests, true};
dests[size].layout = TensorLayout({1}, dtype::Int32());
return {dests, validated};
}

OP_TRAIT_REG(CheckNonFinite, CheckNonFinite)


+ 16 - 25  imperative/src/impl/ops/padding.cpp

@@ -27,40 +27,31 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto comp_node = inputs[0]->comp_node();
auto&& op_def = def.cast_final_safe<Padding>();

DnnOprCaller<megdnn::Padding> dnn_op(comp_node);
dnn_op.op->param() = op_def.param();

TensorLayout dst = output_descs[0].layout;
if (!validated) {
megdnn::Padding::deduce_layout_impl(
inputs[0]->dnn_tensor().layout, dst, op_def.param());
}

DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(comp_node, dst);

dnn_op.op->exec(inputs[0]->dnn_tensor(), out.as_megdnn());

return {Tensor::make(out)};
DnnOprCaller<megdnn::Padding> dnn_op(comp_node, op_def.param());
auto dst = [&] {
if (validated) {
return output_descs[0].layout;
} else {
return dnn_op.deduce_layout(inputs[0]->layout());
}
}();
auto out = Tensor::make(dst, comp_node);
dnn_op.exec(inputs[0], out);
return {out};
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
auto&& op_def = def.cast_final_safe<Padding>();
size_t nr_inp = inputs.size();
auto p = op_def.param();

auto&& inp = inputs[0];
auto& inp_cn = inp.comp_node;

if (inp.layout.ndim == 0) {
return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false};
return {{{TensorLayout{inp.layout.dtype}, inp.comp_node, {}}}, false};
}

TensorLayout oup_layout;
megdnn::Padding::deduce_layout_impl(inp.layout, oup_layout, p);
return {{{oup_layout, inp_cn, {}}}, true};
DnnOprHelper<megdnn::Padding> dnn_op(op_def.param());
auto oup_layout = dnn_op.deduce_layout(inp.layout);
return {{{oup_layout, inp.comp_node}}, true};
}

OP_TRAIT_REG(Padding, Padding, opr::Padding)
@@ -74,4 +65,4 @@ OP_TRAIT_REG(Padding, Padding, opr::Padding)
} // namespace imperative
} // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 15 - 33  imperative/src/impl/ops/pooling.cpp

@@ -25,19 +25,13 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
mgb_assert(
inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu",
inputs.size());

auto&& op_def = def.cast_final_safe<Pooling>();
auto&& inp = inputs[0];
auto& inp_cn = inp.comp_node;

if (inp.layout.ndim == 0) {
return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false};
if (!inputs[0].layout.ndim) {
return {{{inputs[0].layout, inputs[0].comp_node}}, false};
}

TensorLayout oup_layout;
megdnn::Pooling::deduce_layout_impl(inp.layout, op_def.param(), oup_layout);

return {{{oup_layout, inp_cn, {}}}, true};
DnnOprHelper<megdnn::Pooling> dnn_opr(op_def.param());
auto oup_layout = dnn_opr.deduce_layout(inputs[0].layout);
return {{{oup_layout, inputs[0].comp_node}}, true};
}

SmallVector<TensorPtr> apply_on_physical_tensor(
@@ -47,30 +41,18 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu",
inputs.size());

auto&& op_def = def.cast_final_safe<Pooling>();
auto&& pooling = def.cast_final_safe<Pooling>();
auto cn = inputs[0]->comp_node();
DnnOprCaller<megdnn::Pooling> caller(cn);
auto&& dnn_opr = caller.op;
dnn_opr->param() = op_def.param();

SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
inp_tensornds[0] = inputs[0]->dnn_tensor();

TensorLayout& oup_layout = output_descs[0].layout;
if (!validated) {
megdnn::Pooling::deduce_layout_impl(
inp_tensornds[0].layout, op_def.param(), oup_layout);
}

size_t wk_size = setup_algo<megdnn::Pooling>(
{inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
op_def.policy(), false, &inp_tensornds);

DnnOprCaller<megdnn::Pooling> dnn_opr(cn, pooling.param(), pooling.policy());
auto oup_layout = [&] {
if (validated) {
return output_descs[0].layout;
} else {
return dnn_opr.deduce_layout(inputs[0]->layout());
}
}();
auto out = Tensor::make(oup_layout, cn);

auto dnn_wk = caller.create_workspace(wk_size);

caller.op->exec(inp_tensornds[0], out->dnn_tensor(), dnn_wk);
dnn_opr.exec_fastrun(inputs[0], out);
return {out};
}



+ 115 - 121  imperative/src/impl/ops/reduce.cpp

@@ -18,33 +18,31 @@ namespace reduce {
auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
auto&& reduce = static_cast<const Reduce&>(def);
auto comp_node = inputs[0]->comp_node();
OperatorNodeConfig config{reduce.make_name(), comp_node, inputs[0]->dtype()};
auto name = reduce.make_name();

if (inputs.size() > 1) {
return opr::Reduce::make(inputs[0], reduce.param(), inputs[1], config);
}

using Param = megdnn::param::Reduce;
auto param = reduce.param();
if (param.axis < 0) {
param.axis = inputs[0]->shape().ndim + param.axis;
auto axis = param.axis;
auto keepdim = reduce.keepdim;

if (inputs.size() == 2) {
return opr::Reduce::make(inputs[0], param, inputs[1], {name});
}
mgb_assert(inputs.size() == 1);

SymbolVar target_shape = (cg::VarNode*)nullptr;
if (param.axis == INT_MAX) {
DTypeScalar vi{1};
// auto graph = ComputingGraph::make();
if (axis == INT_MAX) {
// keepdim could be ignored when ndim == 1
auto graph = inputs[0]->owner_graph();
target_shape = opr::ImmutableTensor::make(*graph, vi, config);
auto scalar_shape =
opr::ImmutableTensor::make(*graph, DTypeScalar(1), {name, comp_node});
return opr::Reduce::make(inputs[0], param, scalar_shape, {name});
}
auto res = opr::Reduce::make(inputs[0], param, target_shape, config);
if (!reduce.keepdim && param.axis != INT_MAX) {
// mgb::opr::Reduce supports negative axis
auto res = opr::Reduce::make(inputs[0], param, {}, {name});
if (!keepdim) {
using Desc = opr::AxisAddRemove::AxisDesc;
std::vector<Desc> remove_param;
remove_param.push_back(Desc::make_remove(param.axis));
OperatorNodeConfig remove_config{
def.make_name(), comp_node, inputs[0]->dtype()};
return opr::AxisAddRemove::make(res, remove_param, remove_config);
std::vector<Desc> remove_axis_param;
remove_axis_param.push_back(Desc::make_remove(axis));
res = opr::AxisAddRemove::make(res, remove_axis_param, {name});
}
return res;
}
@@ -71,111 +69,104 @@ bool memory_forward_success(const OpDef& def, SmallVector<TensorPtr> inputs) {
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
// memory forward
if (memory_forward_success(def, inputs)) {
// maybe returns inputs[0] directly
return {Tensor::make(
inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())};
}

auto size = inputs.size();
if (size > 1) {
if (inputs.size() == 2) {
// reduce to target shape, fallback to proxy_graph
return proxy_graph_detail::apply_on_physical_tensor(
def, inputs, output_descs, validated);
}
mgb_assert(inputs.size() == 1);

auto comp_node = inputs[0]->comp_node();
using TensorND = megdnn::TensorND;
auto&& op_def = def.cast_final_safe<Reduce>();
SmallVector<TensorND> inp_tensornds;
inp_tensornds.reserve(inputs.size());
auto src = inputs[0]->layout();

DnnOprCaller<megdnn::Reduce> dnn_op(comp_node);
dnn_op.op->param() = op_def.param();
auto axis = op_def.param().axis;
DnnOprCaller<megdnn::Reduce> dnn_op(comp_node, op_def.param());
auto&& mode = dnn_op.param().mode;
auto& axis = dnn_op.param().axis;
auto keepdim = op_def.keepdim;

if (axis < 0) {
axis = inputs[0]->layout().ndim + axis;
}

dnn_op.op->param().axis = axis == INT_MAX ? 0 : axis;

if (axis == INT_MAX) {
src.shape[0] = src.total_nr_elems();
src.ndim = 1;
src.init_contiguous_stride();
}
TensorLayout layout{src.dtype};
dnn_op.op->deduce_layout(src, layout);

if (inputs[0]->layout().is_empty()) {
inputs[0]->dev_tensor().reset(inputs[0]->dev_tensor().storage(), src);

auto mode = op_def.param().mode;

if (!keepdim && src.ndim > 1) {
layout.remove_axis_inplace(axis);
layout.init_contiguous_stride();
DnnTensorND dnn_input = [&] {
if (axis == INT_MAX) { // reduce to scalar
axis = 0;
// flatten input
return inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()});
} else {
if (axis < 0) {
axis = inputs[0]->layout().ndim + axis;
}
mgb_assert(axis >= 0 && axis < inputs[0]->layout().ndim);
return inputs[0]->dnn_tensor();
}
auto out = Tensor::make(layout, comp_node);
}();
auto output_layout = dnn_op.deduce_layout(dnn_input.layout);
auto resolve_keepdim = [&] {
if (!keepdim) {
if (output_layout.ndim > 1) {
mgb_assert(output_layout.shape[axis] == 1);
output_layout.remove_axis_inplace(axis);
}
}
};

std::string err_msg;
TensorPtr output;
if (output_layout.is_empty()) {
// output empty, no computation
resolve_keepdim();
output = Tensor::make(output_layout, comp_node);
} else if (dnn_input.layout.is_empty()) {
// input empty but output not, do fill
resolve_keepdim();
output = Tensor::make(output_layout, comp_node);
auto on_bad_empty_reduce = [](const char* name) {
mgb_throw(
MegBrainError, "empty input is not allowed for reduce mode: %s",
name);
};
switch (mode) {
case Reduce::Mode::SUM:
if (!out->empty()) {
dev_tensor_memset(out->dev_tensor(), 0);
}
// fill 0
dev_tensor_memset(output->dev_tensor(), 0);
break;
case Reduce::Mode::PRODUCT:
if (!out->empty()) {
DnnOprCaller<megdnn::Fill> fill_op(comp_node);
fill_op.op->param() = 1;
fill_op.op->exec(out->dnn_tensor(), {});
}
case Reduce::Mode::PRODUCT: {
// fill 1
DnnOprCaller<megdnn::Fill> fill_op(comp_node, {1});
fill_op.exec_with_ws(output);
break;
}
case Reduce::Mode::MEAN:
err_msg = "mean";
on_bad_empty_reduce("mean");
break;
case Reduce::Mode::MIN:
err_msg = "min";
on_bad_empty_reduce("min");
break;
case Reduce::Mode::MAX:
err_msg = "max";
on_bad_empty_reduce("max");
break;
case Reduce::Mode::SUM_SQR:
err_msg = "sum_sqr";
on_bad_empty_reduce("sum_sqr");
break;
default:
mgb_throw(MegBrainError, "bad reduce mode");
}
if (!err_msg.empty()) {
mgb_throw(
MegBrainError, "empty input is not allowed for reduce mode: %s",
err_msg.c_str());
} else {
// common reduction
if (keepdim) {
output = Tensor::make(output_layout, comp_node);
dnn_op.exec_with_ws(dnn_input, output);
} else {
// used by megdnn::exec
auto output_layout_keepdim = output_layout;
resolve_keepdim();
output = Tensor::make(output_layout, comp_node);
dnn_op.exec_with_ws(dnn_input, output->dnn_tensor(output_layout_keepdim));
}
return {out};
}

auto dnn_ten = inputs[0]->dnn_tensor();
dnn_ten.layout = src;
inp_tensornds.push_back(dnn_ten);

auto wk_size = dnn_op.op->get_workspace_in_bytes(src, layout);
auto dnn_wk = dnn_op.create_workspace(wk_size);
TensorLayout ori_layout = layout;

if (!keepdim && src.ndim > 1) {
layout.remove_axis_inplace(axis);
layout.init_contiguous_stride();
}

auto out = Tensor::make(layout, comp_node);
auto dnn_out = out->dnn_tensor();
dnn_out.layout = ori_layout;

dnn_op.op->exec(inp_tensornds[0], dnn_out, dnn_wk);

return {out};
return {output};
}
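
The reduce-to-scalar case (axis == INT_MAX) no longer patches the source layout in place; it takes a flattened megdnn view of the input via the new Tensor::dnn_tensor(TensorShape) overload (added in physical_tensor.cpp below) and reduces over axis 0. A minimal sketch, assuming a contiguous input:

// view a contiguous (2, 3, 4) input as a (24,) tensor over the same storage
auto flat = inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()});
// dnn_op.param().axis has been set to 0, so the deduced output layout is {1}
auto output_layout = dnn_op.deduce_layout(flat.layout);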

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
@@ -184,16 +175,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
auto axis = op_def.param().axis;
auto keepdim = op_def.keepdim;

size_t size = inputs.size();
SmallVector<LogicalTensorDesc> dests(size);
mgb_assert(inputs.size() > 0);
auto&& comp_node = inputs[0].comp_node;
auto&& input_layout = inputs[0].layout;

for (size_t i = 0; i < size; i++) {
if (inputs[i].layout.ndim == 0) {
return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}},
false};
}
}
if (size > 1) {
if (inputs.size() == 2) {
// fallback to proxy_graph, matters on backward
auto [output_descs, validated] =
proxy_graph_detail::infer_output_attrs_fallible(def, inputs);
if (!inputs[1].value.empty()) {
@@ -203,30 +190,37 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
return {output_descs, validated};
}

mgb_assert(inputs.size() == 1);

if (axis == INT_MAX) {
// reduce to scalar
// ignore keepdim because ndim is 1
auto&& dtype = input_layout.dtype;
auto&& format = input_layout.format;
auto output_layout = TensorLayout{{1}, dtype, format};
return {{{output_layout, comp_node}}, true};
}

if (input_layout.ndim == 0) {
// shape incomplete
return {{{TensorLayout(input_layout.dtype, input_layout.format), comp_node}},
false};
}

if (axis < 0) {
axis = inputs[0].layout.ndim + axis;
axis = input_layout.ndim + axis;
}
mgb_assert(axis >= 0 && axis < input_layout.ndim);

if (axis == INT_MAX || inputs[0].layout.ndim == 1) {
TensorLayout layout{inputs[0].layout.dtype};
layout.shape[0] = 1;
layout.ndim = 1;
dests[0].layout = layout;
dests[0].comp_node = inputs[0].comp_node;
TensorLayout output_layout = input_layout;
bool remove_axis = (!keepdim) && input_layout.ndim > 1;
if (remove_axis) {
output_layout.remove_axis_inplace(axis);
} else {
for (size_t i = 0; i < size; ++i) {
dests[i].comp_node = inputs[i].comp_node;
dests[i].layout = inputs[i].layout;
if (!keepdim && dests[i].layout.ndim > 1) {
dests[i].layout.remove_axis_inplace(axis);
} else {
dests[i].layout.shape[axis] = 1;
}
dests[i].layout.init_contiguous_stride();
}
output_layout.shape[axis] = 1;
}
return {dests, true};
output_layout.init_contiguous_stride();
return {{{output_layout, comp_node}}, true};
}
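
The keepdim handling above is easiest to see with a concrete shape; a worked example for a (2, 3, 4) input reduced over axis 1:

TensorLayout in{{2, 3, 4}, dtype::Float32()};
TensorLayout keep = in;
keep.shape[1] = 1;              // keepdim == true  -> (2, 1, 4)
TensorLayout drop = in;
drop.remove_axis_inplace(1);    // keepdim == false -> (2, 4), only when ndim > 1
keep.init_contiguous_stride();
drop.init_contiguous_stride();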

SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(


+ 10 - 22  imperative/src/impl/ops/tensor_manip.cpp

@@ -230,31 +230,19 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor(
}
auto dest_layout = TensorLayout({nr_elems}, dtype);
auto output = Tensor::make(dest_layout, comp_node);
auto caller = DnnOprCaller<megdnn::ParamPackConcat>(comp_node);
size_t srcs_size = sizeof(void*) * nr_inputs;
void** srcs_raw_ptr = (void**)comp_node.alloc_host(srcs_size);
std::shared_ptr<dt_byte> srcs_ptr = {
(dt_byte*)srcs_raw_ptr,
[comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }};
// FIXME: add param to ParamPackConcat
DnnOprCaller<megdnn::ParamPackConcat> caller{comp_node};
HostTensorStorage srcs_storage{comp_node};
srcs_storage.ensure_size(sizeof(void*) * nr_inputs);
TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()};
size_t ws_size;
{
TensorShapeArray src_shapes;
for (size_t i = 0; i < nr_inputs; ++i) {
src_shapes.push_back(inputs[i]->shape());
}
ws_size = caller.op->get_workspace_in_bytes(
src_shapes, inputs.back()->shape(), TensorShape{});
}
HostTensorND srcs_tensornd;
srcs_tensornd.reset(srcs_storage, srcs_layout);
auto* srcs_raw_ptr = reinterpret_cast<void**>(srcs_storage.ptr());
for (size_t i = 0; i < nr_inputs; ++i) {
srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr();
}
HostTensorStorage srcs_storage;
srcs_storage.reset(comp_node, srcs_size, srcs_ptr);
caller.op->exec(
{srcs_raw_ptr, srcs_layout}, inputs.back()->dnn_tensor(),
output->dnn_tensor(), caller.create_workspace(ws_size));
async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage));
caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output);
async_release(srcs_tensornd);
return {output};
}
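
The subtle point in this rewrite is the lifetime of the pointer table: ParamPackConcat reads a host-side array of device pointers while the kernel runs asynchronously, so the HostTensorND that owns that array is handed to async_release, which is assumed to defer freeing it until the computing node has finished the queued work. In outline:

// build the host-side table of device pointers the kernel will read
auto* srcs_raw_ptr = reinterpret_cast<void**>(srcs_storage.ptr());
for (size_t i = 0; i < nr_inputs; ++i) {
    srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr();
}
caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output);
async_release(srcs_tensornd);  // keep the host table alive until the kernel ran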



+ 22 - 52  imperative/src/impl/ops/vision.cpp

@@ -33,69 +33,39 @@ VarNodeArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
auto&& op = static_cast<const ROIAlign&>(def);
if (inputs[0].layout.is_empty() || inputs[1].layout.is_empty()) {
return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node},
{TensorLayout(dtype::Int32()), inputs[1].comp_node}},
false};
}

SmallVector<LogicalTensorDesc> descs(2u);
size_t n = inputs[1].layout[0];
size_t c = inputs[0].layout[1];
descs[0].layout = TensorLayout(
{n, c, op.pooled_height, op.pooled_width}, inputs[0].layout.dtype);
descs[0].layout.init_contiguous_stride();
descs[0].comp_node = inputs[0].comp_node;

descs[1].layout =
TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32());
descs[1].layout.init_contiguous_stride();
descs[1].comp_node = descs[0].comp_node;

return {descs, true};
auto&& op = def.cast_final_safe<ROIAlign>();
DnnOprHelper<megdnn::ROIAlign> dnn_opr(op.param());
auto cn = inputs[0].comp_node;
auto&& [out_layout, ind_layout] =
dnn_opr.deduce_layouts<2>(inputs[0].layout, inputs[1].layout);
bool validated = out_layout.ndim == 0 && ind_layout.ndim == 0;
return {{{out_layout, cn}, {ind_layout, cn}}, validated};
}

SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& op = static_cast<const ROIAlign&>(def);
CompNode cn = inputs[0]->comp_node();
auto&& op = def.cast_final_safe<ROIAlign>();
auto cn = inputs[0]->comp_node();

TensorLayout out_layout = output_descs[0].layout;
TensorLayout ind_layout = output_descs[1].layout;
if (!validated) {
size_t n = inputs[1]->layout()[0];
size_t c = inputs[0]->layout()[1];
out_layout = TensorLayout(
{n, c, op.pooled_height, op.pooled_width}, inputs[0]->layout().dtype);
out_layout.init_contiguous_stride();
ind_layout =
TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32());
ind_layout.init_contiguous_stride();
}
DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn, op.param());
auto&& [out_layout, ind_layout] = [&]() -> std::array<TensorLayout, 2> {
if (validated) {
return {output_descs[0].layout, output_descs[1].layout};
} else {
return dnn_opr.deduce_layouts<2>(inputs[0]->layout(), inputs[1]->layout());
}
}();

DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
DeviceTensorND inds =
BlobManager::inst()->alloc_workspace_with_defrag(cn, ind_layout);
auto out = Tensor::make(out_layout, cn);
auto ind = Tensor::make(ind_layout, cn);

if (out_layout.is_empty() || ind_layout.is_empty()) {
return {Tensor::make(out), Tensor::make(inds)};
return {out, ind};
}

DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn);
dnn_opr.op->param() = op.param();

size_t sz = dnn_opr.op->get_workspace_in_bytes(
inputs[0]->layout(), inputs[1]->layout(), out_layout, ind_layout);

auto dnn_wk = dnn_opr.create_workspace(sz);

dnn_opr.op->exec(
inputs[0]->dnn_tensor(), inputs[1]->dnn_tensor(), out.as_megdnn(),
inds.as_megdnn(), dnn_wk);
return {Tensor::make(out), Tensor::make(inds)};
dnn_opr.exec_with_ws(inputs[0], inputs[1], out, ind);
return {out, ind};
}

SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(


+ 7 - 1  imperative/src/impl/physical_tensor.cpp

@@ -570,11 +570,17 @@ bool Tensor::empty() {
return !m_blob->size();
}

megdnn::TensorND Tensor::dnn_tensor() {
DnnTensorND Tensor::dnn_tensor() {
mgb_assert(m_blob, "uninitialized tensor.");
mgb_assert(m_layout.ndim, "dnn don't support scalar");
return DnnTensorND{m_layout, m_blob->storage(), m_offset};
}

DnnTensorND Tensor::dnn_tensor(TensorShape new_shape) {
mgb_assert(m_blob, "uninitialized tensor.");
return DnnTensorND{m_layout.reshape(new_shape), m_blob->storage(), m_offset};
}

void Tensor::fetch_value() {
MGB_LOCK_GUARD(m_value_mtx);
if (m_value.empty()) {


+ 10 - 3  imperative/src/impl/proxy_graph/mini_graph.h

@@ -334,9 +334,16 @@ public:
size_t j = 0;
for (auto&& var : m_opr->output()) {
if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) {
TensorLayout layout{var->shape(), var->dtype(), var->format()};
var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag(
var->comp_node(), layout);
auto comp_node = var->comp_node();
auto dtype = var->dtype();
auto&& shape = var->shape();
size_t size = dtype.size(shape.total_nr_elems());
mgb_assert(
var->format().is_default(), "non default format for workspace");
auto raw_storage = Blob::make(comp_node, size)->storage();
DeviceTensorStorage storage;
storage.reset(comp_node, size, raw_storage);
var->m_dev_tensor.reset(storage, {shape, dtype});
} else {
mgb_assert(j < outputs.size());
auto&& tensor = outputs[j];


+ 10 - 3  imperative/src/include/megbrain/imperative/blob_manager.h

@@ -1,6 +1,7 @@
#pragma once

#include "megbrain/imperative/physical_tensor.h"
#include "megbrain/imperative/utils/helper.h"

namespace mgb {
namespace imperative {
@@ -15,13 +16,19 @@ public:

virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0;

virtual bool try_alloc_direct(OwnedBlob* blob, size_t size) {
try {
alloc_direct(blob, size);
return true;
} catch (MemAllocError&) {
return false;
}
}

virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0;

virtual void set_allocator(allocator_t allocator) = 0;

virtual DeviceTensorND alloc_workspace_with_defrag(
CompNode cn, TensorLayout& layout) = 0;

virtual void register_blob(OwnedBlob* blob) = 0;

virtual void unregister_blob(OwnedBlob* blob) = 0;


+ 9 - 12  imperative/src/include/megbrain/imperative/physical_tensor.h

@@ -89,24 +89,19 @@ using EventPtr = std::unique_ptr<CompNode::Event, EventDeleter>;
class Tensor;
using TensorPtr = std::shared_ptr<Tensor>;

/*
using DnnTensorND to save the reference count of workspace
allocated by blobmanager to prevent invalidation
*/
struct DnnTensorND : megdnn::TensorND {
private:
std::shared_ptr<dt_byte> m_reference;
// hold extra reference to repvent defrag-in-use
std::shared_ptr<dt_byte> reference;

public:
DnnTensorND(TensorLayout& layout_, std::shared_ptr<dt_byte> ref_ptr, size_t offset)
: megdnn::TensorND(layout_, {ref_ptr.get(), offset}) {
m_reference = ref_ptr;
DnnTensorND(
const TensorLayout& layout_, std::shared_ptr<dt_byte> ptr, size_t offset)
: megdnn::TensorND(layout_, {ptr.get(), offset}) {
reference = std::move(ptr);
}
};
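
The extra shared_ptr matters because blob storage can be released or moved by the defragmenter while an asynchronous kernel is still reading it; keeping the reference inside the TensorND pins that exact allocation for the lifetime of the view. A usage sketch (some_tensor stands for any TensorPtr):

// take a megdnn view that pins the underlying blob storage
DnnTensorND view = some_tensor->dnn_tensor();
// `view` can be queued into an async megdnn kernel; even if `some_tensor`
// is released afterwards, the pinned allocation stays valid until `view` dies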

class Tensor : public NonCopyableObj {
public:
Tensor() = default;
Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0,
const HostTensorND& hv = {});
Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {})
@@ -154,7 +149,9 @@ public:

void assign_from_dev_tensor(DeviceTensorND);

megdnn::TensorND dnn_tensor();
DnnTensorND dnn_tensor();

DnnTensorND dnn_tensor(TensorShape new_shape);

static TensorPtr make_scalar(DTypeScalar value, CompNode cn);



+ 33 - 2  imperative/src/include/megbrain/imperative/utils/helper.h

@@ -3,6 +3,7 @@
#include <iomanip>
#include <memory>
#include <mutex>
#include <optional>
#include <sstream>

#include "megbrain/utils/metahelper.h"
@@ -14,11 +15,28 @@ namespace imperative {
template <typename T = std::function<void()>>
class CleanupGuard : public NonCopyableObj {
private:
T m_callback;
std::optional<T> m_callback;

public:
CleanupGuard() = default;
explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {}
~CleanupGuard() { m_callback(); }
~CleanupGuard() { reset(); }
CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) {
rhs.m_callback.reset();
}
CleanupGuard& operator=(CleanupGuard&& rhs) {
swap(m_callback, rhs.m_callback);
rhs.reset();
return *this;
}

public:
void reset() {
if (m_callback) {
(*m_callback)();
m_callback.reset();
}
}
};
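
With the callback wrapped in std::optional the guard becomes movable and can fire early; a small usage sketch (assumes <cstdio> for printf):

CleanupGuard<> guard{[] { printf("cleanup\n"); }};  // pending callback
auto moved = std::move(guard);  // `guard` gives up its callback, won't fire
moved.reset();                  // runs "cleanup" exactly once, right here
// destroying `moved` (or `guard`) later is now a no-op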

inline std::string quoted(std::string str) {
@@ -33,6 +51,19 @@ inline std::string quoted(std::string str) {
std::call_once(_once_flag, [&] { __VA_ARGS__; }); \
} while (false)

template <typename T>
struct is_small_vector {
static constexpr bool value = false;
};

template <typename T>
struct is_small_vector<SmallVector<T>> {
static constexpr bool value = true;
};

template <typename T>
static constexpr bool is_small_vector_v = is_small_vector<T>::value;
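
This trait presumably lets the templated caller treat a whole SmallVector of tensors and a single tensor argument uniformly; a hypothetical dispatch of that kind (append_tensors is a made-up name for illustration):

template <typename T>
void append_tensors(SmallVector<megdnn::TensorND>& dst, T&& arg) {
    if constexpr (is_small_vector_v<std::decay_t<T>>) {
        for (auto&& item : arg)
            dst.push_back(item->dnn_tensor());  // SmallVector<TensorPtr> argument
    } else {
        dst.push_back(arg->dnn_tensor());       // single TensorPtr argument
    }
}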

} // namespace imperative

} // namespace mgb

+ 6 - 0  imperative/src/include/megbrain/imperative/utils/platform.h

@@ -6,4 +6,10 @@ namespace mgb::imperative {

std::string demangle(std::string mangled);

template <typename T>
const char* demangled_typename() {
static auto name = demangle(typeid(T).name());
return name.c_str();
}
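
demangled_typename<T>() caches the demangled name per instantiation and hands back a C string, which is convenient in log and assert messages; for example (hypothetical call site):

mgb_assert(opr_ptr, "failed to create dnn operator %s",
           demangled_typename<megdnn::Convolution3D>());
// prints e.g. "megdnn::Convolution3D" instead of the raw mangled typeid name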

} // namespace mgb::imperative

+ 4 - 3  src/opr/impl/misc.cpp

@@ -314,7 +314,8 @@ void CondTake::init_output_static_infer_desc() {
auto dtype = input(0)->dtype();
TensorLayout ily(iv.val[0].shape(), dtype);
dest.ndim = 1;
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily);
TensorLayout mly(iv.val[0].shape(), dtype::Int32());
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily, mly);
return true;
};
owner_graph()->static_infer_manager().register_shape_infer(
@@ -548,9 +549,9 @@ void CheckNonFinite::init_output_static_infer_desc() {

auto infer_wk = [this](TensorShape& dest, const InpVal& inp) {
dest.ndim = 1;
megdnn::TensorNDArray inp_arr(input().size());
SmallVector<megdnn::TensorLayout> inp_arr(input().size());
for (size_t i = 0; i < input().size(); ++i) {
inp_arr[i] = {NULL, {inp.val.at(i).shape(), input(0)->dtype()}};
inp_arr[i] = {inp.val.at(i).shape(), input(0)->dtype()};
}
dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(
inp_arr, {output(input().size() + 1)->shape(),


+ 2 - 5  src/opr/impl/tensor_manip.cpp

@@ -1447,11 +1447,8 @@ void ParamPackConcat::init_output_static_infer_desc() {
auto infer_wk = [this](TensorShape& dest, const InpVal& inp) {
TensorShapeArray shapes;
auto vals = inp.val;
shapes.reserve(vals.size() - 1);
for (size_t i = 0; i < vals.size() - 1; i++) {
shapes.push_back(vals[i].shape());
}
dest = {m_opr->get_workspace_in_bytes(shapes, vals.back().shape(), dest)};
size_t nr_params = vals.size() - 1;
dest = {m_opr->get_workspace_in_bytes({nr_params}, vals.back().shape(), dest)};
return true;
};
mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out});


+ 2 - 1  src/rdnn/impl/algo_chooser.cpp

@@ -970,8 +970,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
if (!policy.algo.valid())
continue;
size_t workspace_needed = get_workspace_size_bytes(policy);
if (m_inputs != nullptr)
if (m_inputs == nullptr) {
workspace_needed += data_size;
}
if (workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue;


+ 3 - 2  tools/format.py

@@ -18,7 +18,8 @@ failed_files = Manager().list()


def process_file(file, clang_format, write):
source = open(file, "r").read()
original_source = open(file, "r").read()
source = original_source
source = re.sub(r"MGB_DEFINE(?P<r>([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g<r>{", source)
source, count = re.subn(r"(?<!#define )MGB_DEFINE(.*) +\\", r"class MGB_DEFINE\1{\\", source)

@@ -38,7 +39,7 @@ def process_file(file, clang_format, write):
result = re.sub(r"class MGB_DEFINE(.*){( *)\\", r"MGB_DEFINE\1\2 \\", result)
result = re.sub(r"class MGB_DEFINE((.|\n)*?){", r"MGB_DEFINE\1// {", result)

if write:
if write and original_source != result:
with tempfile.NamedTemporaryFile(
dir=os.path.dirname(file), delete=False
) as tmp_file:

