Browse Source

feat(dnn/naive): support weight preprocess interface in dnn

GitOrigin-RevId: 84791aacf9
tags/v0.5.0
Megvii Engine Team 5 years ago
parent
commit
0b32056828
30 changed files with 372 additions and 98 deletions
  1. +6
    -2
      dnn/include/megdnn/oprs/nn.h
  2. +1
    -1
      dnn/src/common/conv_bias.cpp
  3. +2
    -1
      dnn/src/common/convolution.cpp
  4. +1
    -1
      dnn/src/cuda/conv_bias/bfloat16.cpp
  5. +3
    -1
      dnn/src/cuda/conv_bias/opr_impl.cpp
  6. +27
    -2
      dnn/src/cuda/conv_bias/opr_impl.h
  7. +14
    -4
      dnn/src/cuda/convolution/opr_impl.cpp
  8. +26
    -7
      dnn/src/cuda/convolution/opr_impl.h
  9. +1
    -1
      dnn/src/cuda/mask_conv/opr_impl.cpp
  10. +1
    -1
      dnn/src/cuda/mask_conv/opr_impl.h
  11. +12
    -9
      dnn/src/fallback/conv_bias/opr_impl.cpp
  12. +4
    -2
      dnn/src/fallback/conv_bias/opr_impl.h
  13. +8
    -5
      dnn/src/fallback/convolution/opr_impl.cpp
  14. +4
    -2
      dnn/src/fallback/convolution/opr_impl.h
  15. +3
    -1
      dnn/src/naive/conv_bias/opr_impl.cpp
  16. +29
    -6
      dnn/src/naive/conv_bias/opr_impl.h
  17. +27
    -24
      dnn/src/naive/convolution/convolution.cpp
  18. +25
    -5
      dnn/src/naive/convolution/opr_impl.h
  19. +1
    -1
      dnn/src/naive/convpooling/conv_pooling.cpp
  20. +2
    -2
      dnn/src/naive/mask_conv/opr_impl.cpp
  21. +1
    -1
      dnn/src/naive/separable_conv/opr_impl.cpp
  22. +3
    -2
      dnn/test/arm_common/conv_bias_multi_thread.cpp
  23. +2
    -2
      dnn/test/common/conv_bias.cpp
  24. +115
    -3
      dnn/test/common/opr_proxy.h
  25. +2
    -2
      dnn/test/cpu/mask_conv.cpp
  26. +41
    -1
      dnn/test/cuda/chanwise_convolution.cpp
  27. +2
    -2
      dnn/test/dispatcher/null.cpp
  28. +2
    -2
      dnn/test/naive/conv_bias.cpp
  29. +3
    -2
      dnn/test/naive/convolution.cpp
  30. +4
    -3
      dnn/test/x86/conv_bias.cpp

+ 6
- 2
dnn/include/megdnn/oprs/nn.h View File

@@ -210,21 +210,25 @@ public:
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;

virtual void exec_preprocess(const TensorLayout& src_layout,
_megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;
void deduce_dtype(DType src, DType filter, DType& dst);

void deduce_layout(const TensorLayout& src, const TensorLayout& filter,
TensorLayout& dst);
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
PreprocessedFilter* preprocessed_filter) = 0;
const PreprocessedFilter* preprocessed_filter) = 0;

virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) = 0;

virtual size_t get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) = 0;
@@ -337,7 +341,7 @@ public:
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
PreprocessedFilter* preprocessed_filter) = 0;
const PreprocessedFilter* preprocessed_filter) = 0;
virtual size_t get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,


+ 1
- 1
dnn/src/common/conv_bias.cpp View File

@@ -76,7 +76,7 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
auto ret = check_layout_fwd(src, filter, dst);
megdnn_assert_contiguous(bias);
auto required_workspace_in_bytes =
get_workspace_in_bytes(src, filter, bias, z, dst);
get_workspace_in_bytes(src, filter, bias, z, dst, nullptr);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
if (bias.ndim != 0) {
//! bias.layout == dst.layout failed, no assert information


+ 2
- 1
dnn/src/common/convolution.cpp View File

@@ -981,7 +981,8 @@ ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, size_t workspace_in_bytes) {
auto ret = check_layout_fwd(src, filter, dst);
auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst);
auto required_workspace_in_bytes =
get_workspace_in_bytes(src, filter, dst, nullptr);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
return ret;
}


+ 1
- 1
dnn/src/cuda/conv_bias/bfloat16.cpp View File

@@ -112,7 +112,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
convbias_opr->execution_policy() = {m_impl};
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor,
fdst_tensor, cvter.workspace());
fdst_tensor, nullptr, cvter.workspace());
}
{ cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); }
}


+ 3
- 1
dnn/src/cuda/conv_bias/opr_impl.cpp View File

@@ -25,6 +25,7 @@ namespace cuda {
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
_megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size);
@@ -208,7 +209,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter*) {
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
return get_algorithm(this, src, filter, bias, z, dst)
->get_workspace_in_bytes(args);


+ 27
- 2
dnn/src/cuda/conv_bias/opr_impl.h View File

@@ -20,7 +20,9 @@ public:
using ConvBiasForward::ConvBiasForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
@@ -34,7 +36,30 @@ public:
bool reproducible) override;
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override;
const TensorLayout&,
const PreprocessedFilter*) override;

size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
const TensorLayout&,
const TensorLayout&,
const TensorLayout&,
const TensorLayout&) override {
return 0;
};
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&) override {
return {};
}
void exec_preprocess(const TensorLayout& ,
_megdnn_tensor_in ,
const TensorLayout& ,
const TensorLayout& ,
const TensorLayout& ,
PreprocessedFilter* ,
_megdnn_workspace ) override {
megdnn_throw("cuda conv_bias exec_preprocess has not been implemented yet");
}

const char* get_algorithm_set_name() const override;



+ 14
- 4
dnn/src/cuda/convolution/opr_impl.cpp View File

@@ -73,22 +73,32 @@ ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,

size_t ConvolutionForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto extra_data = conv_bias_extra_data(dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_workspace_in_bytes(src, filter, extra_data.bias_layout,
extra_data.z_layout, dst);
->get_workspace_in_bytes(
src, filter, extra_data.bias_layout, extra_data.z_layout,
dst,
reinterpret_cast<const ConvolutionBase<
param::ConvBias>::PreprocessedFilter*>(
preprocessed_filter));
}

void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto extra_data = conv_bias_extra_data(dst.layout);
TensorND bias(nullptr, extra_data.bias_layout);
TensorND z(nullptr, extra_data.z_layout);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->exec(src, filter, bias, z, dst, workspace);
->exec(src, filter, bias, z, dst,
reinterpret_cast<const ConvolutionBase<
param::ConvBias>::PreprocessedFilter*>(
preprocessed_filter),
workspace);
}

const char* ConvolutionForwardImpl::get_algorithm_set_name() const {


+ 26
- 7
dnn/src/cuda/convolution/opr_impl.h View File

@@ -11,6 +11,7 @@
#pragma once

#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"

namespace megdnn {
namespace cuda {
@@ -18,10 +19,11 @@ namespace cuda {
class ConvolutionForwardImpl: public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;

std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src,
const TensorLayout &filter,
const TensorLayout &dst) override;
@@ -30,11 +32,28 @@ class ConvolutionForwardImpl: public ConvolutionForward {
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) override;
const char* get_algorithm_set_name() const override;

SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return {};
}
size_t get_preprocess_workspace_in_bytes(
const TensorLayout& , const TensorLayout& ,
const TensorLayout& ) override{
return 0;
}
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("cuda exec_preprocess has not been implemented yet");
}

protected:
struct ConvBiasExtraData{
std::unique_ptr<ConvBiasForward> convbias_opr;


+ 1
- 1
dnn/src/cuda/mask_conv/opr_impl.cpp View File

@@ -27,7 +27,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_workspace workspace) {
megdnn_assert(dst.layout.dtype.enumv() == DTypeTrait<dtype::Float32>::enumv,
"Mask conv only support Float32 dtype.");
m_conv_opr->exec(src, filter, dst, workspace);
m_conv_opr->exec(src, filter, dst, nullptr, workspace);
auto stream = cuda_stream(handle());
#define cb(DType) \
if (mask.layout.dtype == DType()) { \


+ 1
- 1
dnn/src/cuda/mask_conv/opr_impl.h View File

@@ -30,7 +30,7 @@ public:
const TensorLayout& dst) override {
MEGDNN_MARK_USED_VAR(mask);
m_conv_opr->param() = param();
return m_conv_opr->get_workspace_in_bytes(src, filter, dst);
return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr);
}

private:


+ 12
- 9
dnn/src/fallback/conv_bias/opr_impl.cpp View File

@@ -95,7 +95,9 @@ bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) {
}
void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) {
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size);
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace);
@@ -104,20 +106,21 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
ncb_algo_get_workspace(algo, fparam) <= workspace.size) {
exec_with_ncb_kern(fparam, algo);
} else {
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, workspace);
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst,
preprocessed_filter, workspace);
}
}

size_t ConvBiasImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) {
size_t ConvBiasImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst);
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvBiasForwardImpl::get_workspace_in_bytes(src, filter,
bias, z, dst);
return naive::ConvBiasForwardImpl::get_workspace_in_bytes(
src, filter, bias, z, dst, preprocessed_filter);
} else {
return ncb_algo_get_workspace(algo, fparam);
}


+ 4
- 2
dnn/src/fallback/conv_bias/opr_impl.h View File

@@ -41,14 +41,16 @@ public:
//! implemented by exec_with_ncb_kern()
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst, const PreprocessedFilter*,
_megdnn_workspace workspace) override;

//! implemented by get_workspace_with_ncb()
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) override;
const TensorLayout& dst,
const PreprocessedFilter*) override;

//! implemented by get_all_algorithms_with_ncb()
std::vector<Algorithm*> get_all_algorithms(


+ 8
- 5
dnn/src/fallback/convolution/opr_impl.cpp View File

@@ -82,6 +82,7 @@ bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) {
}
void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto fparam = make_ncb_kern_param(src, filter, dst, workspace);
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
@@ -89,18 +90,20 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
ncb_algo_get_workspace(algo, fparam) <= workspace.size) {
exec_with_ncb_kern(fparam, algo);
} else {
naive::ConvolutionForwardImpl::exec(src, filter, dst, workspace);
naive::ConvolutionForwardImpl::exec(src, filter, dst,
preprocessed_filter, workspace);
}
}

size_t ConvolutionImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
size_t ConvolutionImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto fparam = make_ncb_kern_size_param(src, filter, dst);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvolutionForwardImpl::get_workspace_in_bytes(
src, filter, dst);
src, filter, dst, preprocessed_filter);
} else {
return ncb_algo_get_workspace(algo, fparam);
}


+ 4
- 2
dnn/src/fallback/convolution/opr_impl.h View File

@@ -36,12 +36,14 @@ public:

//! implemented by exec_with_ncb_kern()
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst, const PreprocessedFilter*,
_megdnn_workspace workspace) override;

//! implemented by get_workspace_with_ncb()
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) override;
const TensorLayout& dst,
const PreprocessedFilter*) override;

//! implemented by get_all_algorithms_with_ncb()
std::vector<Algorithm*> get_all_algorithms(


+ 3
- 1
dnn/src/naive/conv_bias/opr_impl.cpp View File

@@ -54,7 +54,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& flt,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter*) {
size_t float_workspace_size = 0;

if (z.ndim > 0 && z.dtype.category() != DTypeCategory::FLOAT) {
@@ -79,6 +80,7 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) {
dt_byte *workspace_ptr = workspace.raw_ptr;


+ 29
- 6
dnn/src/naive/conv_bias/opr_impl.h View File

@@ -22,7 +22,9 @@ public:
using ConvBiasForward::ConvBiasForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;

std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
@@ -37,11 +39,32 @@ public:
size_t workspace_limit_in_bytes,
bool reproducible) override;

size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) override;

size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
const TensorLayout&,
const TensorLayout&,
const TensorLayout&,
const TensorLayout&) override {
return 0;
}
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&) override {
return {};
}

void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, const TensorLayout&,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override{
megdnn_throw("conv_bias exec_preprocess is not implemented yet");
}

const char* get_algorithm_set_name() const override;
};



+ 27
- 24
dnn/src/naive/convolution/convolution.cpp View File

@@ -26,15 +26,14 @@ using namespace megdnn;
using namespace naive;

void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
_megdnn_workspace workspace)
{
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_conv_fwd) {

auto filter_meta = check_exec(
src.layout, filter.layout, dst.layout, workspace.size);
using ComputeMode = Param::ComputeMode;
auto filter_meta = check_exec(src.layout, filter.layout, dst.layout,
workspace.size);
using ComputeMode = Param::ComputeMode;
#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \
do { \
using namespace dtype; \
@@ -52,24 +51,28 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
#define cb(dt) \
DISPATCH(dt, dt, DTypeTrait<dt>::ctype, DTypeTrait<dt>::ctype, \
DTypeTrait<dt>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16);
DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32);
DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32);
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, dt_float16,
dt_float32, ComputeMode::FLOAT32));
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16,
dt_bfloat16, dt_float32,
ComputeMode::FLOAT32));
DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32, dt_qint32);
DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32);
DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16);
DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32);
DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32);
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16,
dt_float16, dt_float32,
ComputeMode::FLOAT32));
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16,
dt_bfloat16, dt_float32,
ComputeMode::FLOAT32));
DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32,
dt_qint32);
DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32);
#undef DISPATCH
megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d",
src.layout.dtype.name(), filter.layout.dtype.name(),
dst.layout.dtype.name(),
static_cast<int>(param().compute_mode)));
} MIDOUT_END();
megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d",
src.layout.dtype.name(),
filter.layout.dtype.name(),
dst.layout.dtype.name(),
static_cast<int>(param().compute_mode)));
}
MIDOUT_END();
}

size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter,


+ 25
- 5
dnn/src/naive/convolution/opr_impl.h View File

@@ -10,6 +10,7 @@
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/utils.h"

namespace megdnn {
namespace naive {
@@ -17,10 +18,10 @@ namespace naive {
class ConvolutionForwardImpl: public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src,
const TensorLayout &filter,
const TensorLayout &dst) override;
@@ -30,10 +31,29 @@ class ConvolutionForwardImpl: public ConvolutionForward {
size_t workspace_limit_in_bytes,
bool reproducible) override;
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
const TensorLayout&,
const PreprocessedFilter*) override {
return 0;
}

size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
const TensorLayout&,
const TensorLayout&) override {
return 0;
}

void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("convolution exec_preprocess is not implemented yet");
}

SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& , const TensorLayout& ,
const TensorLayout& )override{
return {};
}

const char* get_algorithm_set_name() const override;
};



+ 1
- 1
dnn/src/naive/convpooling/conv_pooling.cpp View File

@@ -97,7 +97,7 @@ void ConvPoolingForwardImpl::exec(const _megdnn_in TensorND src,
TensorND conv_dst((float*)(workspace.raw_ptr), conv_dst_layout);
//convFwd->check_layout(src.layout, filter.layout, workspace.layout, empty_wsp.layout);
check_layout(src.layout, filter.layout, bias.layout, dst.layout, workspace.size);
convFwd->exec(src, filter, conv_dst, empty_wsp);
convFwd->exec(src, filter, conv_dst, nullptr, empty_wsp);

// calculate bias
int conv_dst_batch = conv_dst.layout.shape[0];


+ 2
- 2
dnn/src/naive/mask_conv/opr_impl.cpp View File

@@ -80,7 +80,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_workspace workspace) {
MEGDNN_MARK_USED_VAR(mask);
m_conv_opr->param() = this->param();
m_conv_opr->exec(src, filter, dst, workspace);
m_conv_opr->exec(src, filter, dst, nullptr, workspace);
#define cb(DType) \
if (mask.layout.dtype == DType()) { \
using ctype = typename DTypeTrait<DType>::ctype; \
@@ -99,7 +99,7 @@ size_t MaskConvForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) {
MEGDNN_MARK_USED_VAR(mask);
m_conv_opr->param() = this->param();
return m_conv_opr->get_workspace_in_bytes(src, filter, dst);
return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr);
}

void MaskPropagateImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,


+ 1
- 1
dnn/src/naive/separable_conv/opr_impl.cpp View File

@@ -103,7 +103,7 @@ void SeparableConvForwardImpl::exec(_megdnn_tensor_in src,

ConvolutionForwardImpl* convOptr = new ConvolutionForwardImpl(this->handle());
Workspace empty_wsp;
convOptr->exec(src, filter2d, dst, empty_wsp);
convOptr->exec(src, filter2d, dst, nullptr, empty_wsp);
delete(convOptr);

free(filter2d_buf);


+ 3
- 2
dnn/test/arm_common/conv_bias_multi_thread.cpp View File

@@ -664,7 +664,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout);
tensors[4].layout, nullptr);

WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
@@ -676,7 +676,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], wb.get_workspace(1));
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));

free(wb.ptr());
};


+ 2
- 2
dnn/test/common/conv_bias.cpp View File

@@ -1008,7 +1008,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
conv_bias_opr->param().output_block_size = m;
size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout);
tensors[3].layout, tensors[4].layout, nullptr);

WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
@@ -1020,7 +1020,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], wb.get_workspace(1));
tensors[3], tensors[4], nullptr, wb.get_workspace(1));

free(wb.ptr());
};


+ 115
- 3
dnn/test/common/opr_proxy.h View File

@@ -200,15 +200,70 @@ struct OprProxyProfilingTernary : public OprProxyProfilingBase<Opr, 3> {
using OprProxyProfilingTernary<c>::OprProxyProfilingTernary; \
}

DEF_PROF3(ConvolutionForward);
DEF_PROF3(ConvolutionBackwardData);
DEF_PROF3(ConvolutionBackwardFilter);
DEF_PROF3(LocalShareForward);
DEF_PROF3(LocalShareBackwardData);
DEF_PROF3(LocalShareBackwardFilter);

#undef DEF_PROF3

//! TODO: it should adapt weight preprocess later
template <>
struct OprProxy<ConvolutionForward>
: public OprProxyProfilingTernary<ConvolutionForward> {
using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary;
void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
megdnn_assert(tensors.size() == 3);
if (!Base::W.valid()) {
Base::W = WorkspaceWrapper(opr->handle(), 0);
}
if (Base::m_profiling && !Base::target_algo) {
size_t min_time = std::numeric_limits<size_t>::max();
for (auto algo :
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout,
tensors[2].layout)) {
opr->execution_policy().algorithm = algo;
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
nullptr);
Base::W.update(workspace_size);

for (size_t times = 0; times < Base::warmup_times; ++times)
opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
Base::W.workspace());
megcoreSynchronize(opr->handle()->megcore_computing_handle());
Timer timer;
timer.start();
for (size_t times = 0; times < Base::exec_times; ++times) {
opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
Base::W.workspace());
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
timer.stop();
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
algo->name());
if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us();
Base::target_algo = algo;
}
}
opr->execution_policy().algorithm = Base::target_algo;
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout, nullptr);
Base::W.update(workspace_size);
}
if (!Base::target_algo) {
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
nullptr);
Base::W.update(workspace_size);
}
opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
Base::W.workspace());
}
};


template <class Opr>
struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
using Base = OprProxyProfilingBase<Opr, 5>;
@@ -274,10 +329,67 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {

DEF_PROF5(DeformableConvForward);
DEF_PROF5(DeformableConvBackwardFilter);
DEF_PROF5(ConvBiasForward);
//DEF_PROF5(ConvBiasForward);
DEF_PROF5(BatchConvBiasForward);
#undef DEF_PROF5

//! TODO: it should adapt weight preprocess later
template <>
struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> {
using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
megdnn_assert(tensors.size() == 5);
if (!Base::W.valid()) {
Base::W = WorkspaceWrapper(opr->handle(), 0);
}
if (Base::m_profiling && !Base::target_algo) {
size_t min_time = std::numeric_limits<size_t>::max();
for (auto algo :
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout)) {
opr->execution_policy().algorithm = algo;
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout, nullptr);
Base::W.update(workspace_size);

for (size_t times = 0; times < Base::warmup_times; ++times)
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
tensors[4], nullptr, Base::W.workspace());
megcoreSynchronize(opr->handle()->megcore_computing_handle());
Timer timer;
timer.start();
for (size_t times = 0; times < Base::exec_times; ++times) {
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
tensors[4], nullptr, Base::W.workspace());
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
timer.stop();
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
algo->name());
if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us();
Base::target_algo = algo;
}
}
opr->execution_policy().algorithm = Base::target_algo;
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout, nullptr);
Base::W.update(workspace_size);
}
if (!Base::target_algo) {
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout, nullptr);
Base::W.update(workspace_size);
}
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
nullptr, Base::W.workspace());
}
};

template <class Opr>
struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> {
using Base = OprProxyProfilingBase<Opr, 8>;


+ 2
- 2
dnn/test/cpu/mask_conv.cpp View File

@@ -75,8 +75,8 @@ TEST_F(CPU, MASK_PROPAGATE) {
auto dst = TensorND{dst_ptr, dst_layout};
WorkspaceWrapper workspace{
handle(), opr->get_workspace_in_bytes(src.layout, filter.layout,
dst.layout)};
opr->exec(src, filter, dst, workspace.workspace());
dst.layout, nullptr)};
opr->exec(src, filter, dst, nullptr, workspace.workspace());
for (size_t i = 0; i < dst.layout.total_nr_elems(); ++i) {
mask_dst.ptr<int>()[i] = dst_ptr[i] > 0;
}


+ 41
- 1
dnn/test/cuda/chanwise_convolution.cpp View File

@@ -176,6 +176,46 @@ public:
}
}

//! special for weight preprocess
void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
opr0->param().pad_h = pad_h;
opr0->param().pad_w = pad_w;
opr1->param() = opr0->param();
opr1->param().sparse = param::Convolution::Sparse::GROUP;

TensorND a0, b0, c0, a1, b1, c1;
std::tie(a0, b0, c0) = shuffle(std::make_tuple(
src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
std::tie(a1, b1, c1) = shuffle(std::make_tuple(
src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
WorkspaceWrapper wk(
handle,
std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
c0.layout, nullptr),
opr1->get_workspace_in_bytes(a1.layout, b1.layout,
c1.layout, nullptr)));
cudaProfilerStart();
cudaEventRecord(cuda_ev[0], cuda_stream);
opr0->exec(a0, b0, c0, nullptr, wk.workspace());
cudaEventRecord(cuda_ev[1], cuda_stream);
opr1->exec(a1, b1, c1, nullptr, wk.workspace());
cudaEventRecord(cuda_ev[2], cuda_stream);
cudaProfilerStop();

if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
cudaStreamSynchronize(cuda_stream);
float t0 = -1, t1 = -1;
cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
lsrc.TensorShape::to_string().c_str(),
lflt1.TensorShape::to_string().c_str(),
ldst.TensorShape::to_string().c_str(),
t0, t1, t0 / t1);
}
}

void cmp_dst() {
Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
megdnn_memcpy_D2H(handle,
@@ -399,7 +439,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
benv.fill_src();
benv.fill_flt();
benv.exec(conv0.get(), conv1.get());
benv.exec_convolution(conv0.get(), conv1.get());
benv.cmp_dst();
};



+ 2
- 2
dnn/test/dispatcher/null.cpp View File

@@ -30,10 +30,10 @@ TEST(DISPATCHER, NULL_DISPATCHER)

auto layout = TensorLayout({1, 1, 1, 1}, dtype::Float32());
TensorND src(nullptr, layout), filter(nullptr, layout), dst(nullptr, layout);
auto wsize = opr->get_workspace_in_bytes(layout, layout, layout);
auto wsize = opr->get_workspace_in_bytes(layout, layout, layout, nullptr);
Workspace workspace(nullptr, wsize);

opr->exec(src, filter, dst, workspace);
opr->exec(src, filter, dst, nullptr, workspace);
}
#endif



+ 2
- 2
dnn/test/naive/conv_bias.cpp View File

@@ -217,11 +217,11 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED8x8x32_NCHW32) {

size_t ws_size = conv_opr->get_workspace_in_bytes(
src_layout_4, filter_layout_4, bias_layout_4, z_layout_4,
dst_layout_4);
dst_layout_4, nullptr);
WorkspaceWrapper ws{handle(), ws_size};
conv_opr->exec(src_ts_4.tensornd(), filter_ts_4.tensornd(),
bias_ts_4.tensornd(), z_ts_4.tensornd(), dst_ts_4.tensornd(),
ws.workspace());
nullptr, ws.workspace());

TensorLayout src_layout_32{{N, IC / 32, IH, IW, 32},
dtype::QuantizedS8(0.1f)};


+ 3
- 2
dnn/test/naive/convolution.cpp View File

@@ -209,7 +209,8 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) {
}

auto workspace_size = conv->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout);
tensors[0].layout, tensors[1].layout, tensors[2].layout,
nullptr);
dt_byte* workspace_ptr = static_cast<dt_byte*>(malloc(workspace_size));
Workspace workspace{workspace_ptr, workspace_size};

@@ -217,7 +218,7 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) {
relayout->exec(nchw4_tensors[0], nchw_tensors[0]);
relayout->exec(nchw4_tensors[1], nchw_tensors[1]);

conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2],
conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], nullptr,
workspace);

relayout->exec(nchw_tensors[2], nchw4_tensors[2]);


+ 4
- 3
dnn/test/x86/conv_bias.cpp View File

@@ -1334,8 +1334,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
size_t conv_bias_workspace_in_bytes =
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout);
tensors[2].layout, tensors[3].layout, tensors[4].layout,
nullptr);

WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
@@ -1347,7 +1347,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], wb.get_workspace(1));
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));

free(wb.ptr());
};


Loading…
Cancel
Save