Compare commits

...

26 Commits

Author SHA1 Message Date
  Xu Xinran a601fda89f chore(release): bump version 5 years ago
  Megvii Engine Team b90c1540db fix(dnn/naive): fix midout for pooling 5 years ago
  Megvii Engine Team df47637d03 fix(dnn/naive): fix midout for relayout_format 5 years ago
  Megvii Engine Team 0d8b913682 fix(mge/functional): reshape bias to (1, out_features) in linear 5 years ago
  Megvii Engine Team f60ab501ef fix(mgb/opt): nchw to nchw4 pass support ic less than 4 5 years ago
  Megvii Engine Team 8ec099221f fix(dnn): fix Image2DPack4TensorFormat check 5 years ago
  Megvii Engine Team 28d85838ef feat(dnn): add relayout_format for nchw to nchw4 and ic <=4 5 years ago
  Megvii Engine Team 3a53872f73 fix(dnn/native): also fix native logic 5 years ago
  Megvii Engine Team 43b42a651c fix(dnn/cuda): fix indexing logic in psroi_pooling 5 years ago
  Megvii Engine Team 3f2770fe03 feat(mge/module): add quantize dtype load support for module load_state_dict 5 years ago
  Megvii Engine Team fdd14e09e4 feat(mgb/opt): add nchw->nchw4 for tensorrt replace pass 5 years ago
  Megvii Engine Team 3eb29f5e2a feat(mgb/opt): add nchw->nchw4 in tensorcore pass 5 years ago
  Megvii Engine Team 52cb4b39e2 fix(mgb/gopt): fix convert format nchw->nchw4 pass 5 years ago
  Megvii Engine Team 90d5895ec3 fix(mgb/gopt): remove redundant reshape in nchw->nchw4 pass 5 years ago
  Megvii Engine Team 0d12ae8095 feat(opr/standalone): import NMSKeep from MegSkull contrib 5 years ago
  Megvii Engine Team 129fa70cba fix(mgb/serialization): fix multiple graph load error 5 years ago
  Megvii Engine Team 4755400eda refactor(mge/quantization): add `narrow_range` to control quant dtype's lower bound 5 years ago
  Megvii Engine Team c8a9094bb3 fix(mge/data/dataloader): add refcount in _PlasmaStoreManager 5 years ago
  Xu Xinran 05682707fc chore(version): bump version. 5 years ago
  Megvii Engine Team 5257991e68 fix(jit): fix jit doc and add NCHW44_DOT 5 years ago
  Megvii Engine Team cdf25c4afb feat(mge/utils): export _internal.plugin to mge.utils 5 years ago
  Megvii Engine Team f1dd86799b fix(load_and_run): fix dump_with_testcase_mge.py with import megbrain 5 years ago
  Megvii Engine Team be205727bc fix(mge): fix some warnings 5 years ago
  Megvii Engine Team 5b87e8a8ed fix(load_and_run): fix load_and_run with --input which ignore iters 5 years ago
  Megvii Engine Team 80af2f935f fix(serialization): fix model compatibility 5 years ago
  Megvii Engine Team fda9599a84 feat(mge/quant): add TQT quant method 5 years ago
52 changed files with 1565 additions and 480 deletions
  1. dnn/scripts/opr_param_defs.py (+5, -2)
  2. dnn/src/arm_common/conv_bias/int8/direct_dotprod.cpp (+40, -48)
  3. dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44.cpp (+3, -1)
  4. dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp (+3, -1)
  5. dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h (+1, -0)
  6. dnn/src/common/conv_bias.cpp (+0, -3)
  7. dnn/src/common/relayout_format.cpp (+48, -0)
  8. dnn/src/cuda/convolution/opr_impl.cpp (+33, -6)
  9. dnn/src/cuda/convolution/opr_impl.h (+3, -1)
  10. dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.cpp (+1, -1)
  11. dnn/src/cuda/relayout_format/opr_impl.cpp (+32, -3)
  12. dnn/src/naive/deformable_ps_roi_pooling/opr_impl.cpp (+2, -2)
  13. dnn/src/naive/handle.cpp (+1, -5)
  14. dnn/src/naive/pooling/opr_impl.cpp (+58, -58)
  15. dnn/src/naive/relayout_format/opr_impl.cpp (+75, -39)
  16. dnn/test/cuda/relayout_format.cpp (+22, -0)
  17. python_module/megengine/_internal/__init__.py (+1, -1)
  18. python_module/megengine/_internal/opr_extra.py (+3, -0)
  19. python_module/megengine/core/function.py (+1, -0)
  20. python_module/megengine/core/tensor.py (+8, -0)
  21. python_module/megengine/data/_queue.py (+6, -1)
  22. python_module/megengine/functional/nn.py (+1, -1)
  23. python_module/megengine/jit/__init__.py (+32, -10)
  24. python_module/megengine/module/module.py (+5, -0)
  25. python_module/megengine/module/qat/module.py (+15, -10)
  26. python_module/megengine/quantization/__init__.py (+1, -0)
  27. python_module/megengine/quantization/fake_quant.py (+116, -27)
  28. python_module/megengine/quantization/observer.py (+27, -8)
  29. python_module/megengine/quantization/qconfig.py (+49, -19)
  30. python_module/megengine/utils/__init__.py (+4, -0)
  31. python_module/megengine/version.py (+1, -2)
  32. python_module/test/unit/core/test_function.py (+0, -1)
  33. python_module/test/unit/core/test_tensor.py (+46, -0)
  34. python_module/test/unit/data/test_dataloader.py (+49, -0)
  35. python_module/test/unit/module/test_module.py (+37, -0)
  36. python_module/test/unit/quantization/test_TQT.py (+77, -0)
  37. sdk/load-and-run/dump_with_testcase_mge.py (+1, -1)
  38. sdk/load-and-run/src/mgblar.cpp (+39, -39)
  39. src/CMakeLists.txt (+5, -0)
  40. src/core/impl/comp_node/cpu/comp_node.cpp (+1, -1)
  41. src/core/impl/graph/cg_impl.cpp (+1, -1)
  42. src/core/include/megbrain/version.h (+2, -2)
  43. src/gopt/impl/framework.cpp (+2, -0)
  44. src/gopt/impl/tensor_reformat.cpp (+209, -62)
  45. src/gopt/test/inference.cpp (+319, -50)
  46. src/opr/impl/mgb_cpp_opr.fbs (+5, -0)
  47. src/serialization/impl/schema.fbs (+69, -68)
  48. src/serialization/impl/serializer_oss.cpp (+2, -2)
  49. src/serialization/test/serializer_oss.cpp (+29, -1)
  50. src/tensorrt/impl/opr_replace.cpp (+10, -1)
  51. src/tensorrt/include/megbrain/tensorrt/opr_replace.h (+2, -1)
  52. src/tensorrt/test/opr_replace.cpp (+63, -1)

+ 5
- 2
dnn/scripts/opr_param_defs.py

@@ -434,7 +434,7 @@ pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0)
'layout is (K/4, M/4, 4(k), 4(m)) x (K/4, N, 4(k))'),
Doc('MK8', 'Split 8 from M and K, better for neon compute:'
'(M/8, K/8, 8(k), 8(m)) x (K/8, N, 8(k)). if transposeA the '
'layout is (K/8, M/8, 8(k), 8(m)) x (K/8, N, 8(k))'),
'layout is (K/8, M/8, 8(k), 8(m)) x (K/8, N, 8(k))'),
Doc('MK4_DOT', 'Split 4 from M and K, better for neon dotprod:'
'M/4, K/4, 4(m), 4(k)) x (K/4, N, 4(k)). if transposeA the '
'layout is (K/4, M/4, 4(m), 4(k)) x (K/4, N, 4(k))'))
@@ -858,7 +858,10 @@ when the ``I`` suffix is present.
'NCHW_NCHW88_CONV_CHAN_WEIGHT',
'NCHW_NCHW88_CONV_GROUP_WEIGHT',
'NCHW_NCHW88',
'NCHW88_NCHW')
'NCHW88_NCHW',
'NCHW_NCHW4_IC_SMALL',
'NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT',
)
)




+ 40
- 48
dnn/src/arm_common/conv_bias/int8/direct_dotprod.cpp

@@ -90,12 +90,11 @@ inline int8x16_t vqtbl1q_s8_v7(int8x16_t a, uint8x16_t index) {
_sum1##_c_idx = vdotq_s32(_sum1##_c_idx, _k##_k2_idx, _elem);

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride1_2x2_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride1_2x2_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - OW;
const uint8x16_t _idx0 = {0, 1, 16, 16, 1, 2, 16, 16,
2, 3, 16, 16, 3, 4, 16, 16};
@@ -326,12 +325,11 @@ void conv_bias::conv_direct_stride1_2x2_int8_dot(const int8_t* src,
}

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride1_3x3_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride1_3x3_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - OW;

const uint8x16_t _idx0 = {0, 1, 2, 16, 1, 2, 3, 16,
@@ -562,12 +560,11 @@ void conv_bias::conv_direct_stride1_3x3_int8_dot(const int8_t* src,
}

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride2_2x2_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride2_2x2_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - 2 * OW + IW;

const uint8x16_t _idx0 = {0, 1, 16, 16, 2, 3, 16, 16,
@@ -658,12 +655,11 @@ void conv_bias::conv_direct_stride2_2x2_int8_dot(const int8_t* src,
}

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride2_3x3_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride2_3x3_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - 2 * OW + IW;

const uint8x16_t _idx0 = {0, 1, 2, 16, 2, 3, 4, 16,
@@ -814,12 +810,11 @@ void conv_bias::conv_direct_stride2_3x3_int8_dot(const int8_t* src,
_sum1##_c_idx = vdotq_s32(_sum1##_c_idx, _k##_k11_idx, _elem);

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride2_5x5_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride2_5x5_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - 2 * OW + IW;

const uint8x16_t _idx00 = {0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9};
@@ -1113,12 +1108,11 @@ void conv_bias::conv_direct_stride2_5x5_int8_dot(const int8_t* src,
}

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride2_7x7_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride2_7x7_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - 2 * OW + IW;

const uint8x16_t _idx00 = {0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9};
@@ -1476,12 +1470,11 @@ void conv_bias::conv_direct_stride2_7x7_int8_dot(const int8_t* src,
}

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride1_5x5_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride1_5x5_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - OW;

const uint8x16_t _idx00 = {0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6};
@@ -1777,12 +1770,11 @@ void conv_bias::conv_direct_stride1_5x5_int8_dot(const int8_t* src,
}

template <bool first_ic, bool last_ic, BiasMode bias_mode, typename Op>
void conv_bias::conv_direct_stride1_7x7_int8_dot(const int8_t* src,
const int8_t* filter,
const int32_t* bias, int32_t* temp,
int8_t* dst, const size_t IH,
const size_t IW, const size_t OH,
const size_t OW, const Op& op) {
void conv_bias::conv_direct_stride1_7x7_int8_dot(
const int8_t* src, const int8_t* filter, const int32_t* bias,
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
const size_t OH, const size_t OW, const Op& op) {
MEGDNN_MARK_USED_VAR(IH);
const size_t tail_step = IW - OW;

const uint8x16_t _idx00 = {0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6};


+ 3
- 1
dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44.cpp

@@ -29,6 +29,7 @@ void copy_packed_src_int8_nchw44<1>(int8_t* dst, const int dst_step,
const int ih, const int pad_left,
const int pad_right, const int pad_top,
const int pad_bottom) {
MEGDNN_MARK_USED_VAR(pad_right);
constexpr int IC_PACK_SIZE = 4;
rep_step(ic_idx, ic, IC_PACK_SIZE) {
const int8_t* i_src = src + ic_idx * ic_step;
@@ -66,6 +67,7 @@ void copy_packed_src_int8_nchw44<2>(int8_t* dst, const int dst_step,
const int ih, const int pad_left,
const int pad_right, const int pad_top,
const int pad_bottom) {
MEGDNN_MARK_USED_VAR(pad_right);
constexpr int IC_PACK_SIZE = 4;
int odd_start = megdnn::div_ceil(dst_step, 2);
bool nochange = pad_left % 2 == 0;
@@ -367,4 +369,4 @@ FOR_FILTER(2)
} // namespace megdnn
#endif

//vim: syntax=cpp.doxygen
//vim: syntax=cpp.doxygen

+ 3
- 1
dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp

@@ -163,6 +163,7 @@ static void conv_kern(WorkspaceBundle bundle,
bool ConvBiasImpl::AlgoDotS8Direct_NCHW44::usable(
FallbackConvBiasImpl*, const NCBKernSizeParam& param,
AlgoSelectionStrategy algo_selection_strategy) const {
MEGDNN_MARK_USED_VAR(algo_selection_strategy);
auto&& fm = param.filter_meta;
auto FH = fm.spatial[0];
auto FW = fm.spatial[1];
@@ -199,6 +200,7 @@ bool ConvBiasImpl::AlgoDotS8Direct_NCHW44::usable(

bool ConvBiasImpl::AlgoDotS8Direct_NCHW44::is_preferred(
megdnn::fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
MEGDNN_MARK_USED_VAR(param);
return true;
}

@@ -338,4 +340,4 @@ ConvBiasImpl::AlgoDotS8Direct_NCHW44::dispatch_kerns(

#endif

//vim: syntax=cpp.doxygen
//vim: syntax=cpp.doxygen

+ 1
- 0
dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_kern.h

@@ -98,6 +98,7 @@ template <int ow_remain, typename Op, typename T>
struct StoreOCxOWx<1, ow_remain, Op, T> {
static void impl(int32x4_t res[][8], const Op& op, T* dst_ptr,
const int ld_dst_oc) {
MEGDNN_MARK_USED_VAR(ld_dst_oc);
switch (ow_remain) {
case 8:
UNROLL_CALL_RAW(4, cb12);


+ 0
- 3
dnn/src/common/conv_bias.cpp

@@ -337,14 +337,11 @@ ConvBias::WinogradParam ConvBias::parse_winograd_name(
&(ret.channel_block_size), &(ret.output_block_size),
&(ret.tile_size));
if (strcmp(name, pre.c_str())) {
megdnn_log_warn("algo %s is not %s algo", name, pre.c_str());
ret = INVALID_WINOGRAD_PARAM;
return false;
}
if (ret.tile_size == 0 || ret.output_block_size == 0 ||
ret.channel_block_size == 0) {
megdnn_log_warn("the algo name %s is not suitable for %s",
algo_name.c_str(), pre.c_str());
ret = INVALID_WINOGRAD_PARAM;
return false;
}


+ 48
- 0
dnn/src/common/relayout_format.cpp

@@ -28,6 +28,26 @@ void RelayoutFormat::deduce_layout_fwd(const TensorLayout& src,
dst[3] = src[3];
dst[4] = 4;
break;
case Param::Mode::NCHW_NCHW4_IC_SMALL:
dst.ndim = 5;
megdnn_assert(src[1] <= 4_z, "ic should be less equal 4");
dst[0] = src[0];
dst[1] = div_ceil(src[1], 4_z);
dst[2] = src[2];
dst[3] = src[3];
dst[4] = 4;
break;
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT:
megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 4");
megdnn_assert(src[1] <= 4_z, "ic should be less equal 4");
dst.ndim = 5;
dst[0] = src[0];
dst[1] = div_ceil(src[1], 4_z);
dst[2] = src[2];
dst[3] = src[3];
dst[4] = 4;
break;

case Param::Mode::NCHW_NCHW88:
dst.ndim = 5;
dst[0] = src[0];
@@ -276,6 +296,8 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT:
case Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT:
case Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT:
case Param::Mode::NCHW_NCHW4_IC_SMALL:
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT:
CHECK_SRC(DefaultTensorFormat::make());
dst = src;
break;
@@ -284,6 +306,15 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
megdnn_throw("Invalid relayout format mode");
break;
}

if (!dst.is_default() &&
(handle()->type() != Handle::HandleType::NAIVE)) {
megdnn_throw(
"Only naive and opencl handle support "
"Image2DPack4TensorFormat, try to export MGB_USE_MEGDNN_DBG=2 "
"to enable naive handle");
}
#undef CHECK_SRC
}

@@ -374,6 +405,23 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src,
exec_dst = dst;
}
break;

case Param::Mode::NCHW_NCHW4_IC_SMALL:
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT:
// nchw to nchw4c or oihw to oihw4i
{
TensorLayout work_space_layout(
{src[0], round_up(src[1], 4_z), src[2], src[3]},
src.dtype, src.format);
exec_src = work_space_layout
.reshape({src[0], div_ceil(src[1], 4_z), 4,
src[2], src[3]})
.dimshuffle({0, 1, 3, 4, 2});
exec_dst = dst;
}
break;


case Param::Mode::NCHW_NHWCD4:
case Param::Mode::NCHW_NHWCD4I:
// src is {N, C, H, W}
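
The two new IC_SMALL modes pad the channel dimension up to 4 and repack NCHW into NCHW4. As a minimal NumPy sketch of the shape bookkeeping only (this is not the megdnn API, and it assumes C <= 4 as the mode name implies):

import numpy as np

def nchw_to_nchw4_ic_small(src):
    # mirrors deduce_layout_fwd: dst = (N, ceil(C/4), H, W, 4), with zero-padded channels
    n, c, h, w = src.shape
    assert c <= 4, "ic should be less equal 4"
    padded = np.zeros((n, 4, h, w), dtype=src.dtype)   # round channels up to 4 with zeros
    padded[:, :c] = src
    # reshape to (N, C/4, 4, H, W), then move the pack-of-4 axis last: dimshuffle({0, 1, 3, 4, 2})
    return padded.reshape(n, 1, 4, h, w).transpose(0, 1, 3, 4, 2)

x = np.arange(2 * 3 * 2 * 2, dtype=np.float32).reshape(2, 3, 2, 2)
print(nchw_to_nchw4_ic_small(x).shape)  # (2, 1, 2, 2, 4)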


+ 33
- 6
dnn/src/cuda/convolution/opr_impl.cpp

@@ -10,6 +10,7 @@
*/

#include "src/cuda/convolution/opr_impl.h"
#include "megdnn/dtype.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/convolution/backward_filter/algo.h"
@@ -28,10 +29,35 @@ using namespace convolution;

/* ============== ConvolutionForwardImpl ============== */
ConvolutionForwardImpl::ConvBiasExtraData
ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& dst) {
ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
auto conv_param = param();
DType bias_type;
if (src.dtype.enumv() == DTypeEnum::QuantizedS8) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::QuantizedS8>().scale *
filter.dtype.param<dtype::QuantizedS8>().scale);
} else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::Quantized8Asymm>().scale *
filter.dtype.param<dtype::Quantized8Asymm>().scale);
} else if (src.dtype.enumv() == DTypeEnum::Uint8 ||
src.dtype.enumv() == DTypeEnum::Int8) {
bias_type = dtype::Int32{};
} else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
bias_type = dtype::QuantizedS32(
src.dtype.param<dtype::Quantized4Asymm>().scale *
filter.dtype.param<dtype::Quantized4Asymm>().scale);
} else {
megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
bias_type = src.dtype;
}
ConvBiasExtraData ret = {this->handle()->create_operator<ConvBiasForward>(),
TensorLayout(dst.dtype), TensorLayout(dst.dtype)};
TensorLayout(bias_type), TensorLayout(dst.dtype)};
ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
conv_param.mode,
conv_param.sparse,
@@ -54,7 +80,7 @@ ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) {
auto extra_data = conv_bias_extra_data(dst);
auto extra_data = conv_bias_extra_data(src, filter, dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_algorithm_heuristic(src, filter, extra_data.bias_layout,
extra_data.z_layout, dst,
@@ -65,7 +91,7 @@ std::vector<ConvolutionForwardImpl::Algorithm*>
ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
auto extra_data = conv_bias_extra_data(dst);
auto extra_data = conv_bias_extra_data(src, filter, dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_all_algorithms(src, filter, extra_data.bias_layout,
extra_data.z_layout, dst);
@@ -75,7 +101,7 @@ size_t ConvolutionForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto extra_data = conv_bias_extra_data(dst);
auto extra_data = conv_bias_extra_data(src, filter, dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_workspace_in_bytes(
src, filter, extra_data.bias_layout, extra_data.z_layout,
@@ -90,7 +116,8 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto extra_data = conv_bias_extra_data(dst.layout);
auto extra_data =
conv_bias_extra_data(src.layout, filter.layout, dst.layout);
TensorND bias(nullptr, extra_data.bias_layout);
TensorND z(nullptr, extra_data.z_layout);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())


+ 3
- 1
dnn/src/cuda/convolution/opr_impl.h

@@ -61,7 +61,9 @@ class ConvolutionForwardImpl: public ConvolutionForward {
TensorLayout z_layout;
};
private:
ConvBiasExtraData conv_bias_extra_data(const TensorLayout&);
ConvBiasExtraData conv_bias_extra_data(const TensorLayout&,
const TensorLayout&,
const TensorLayout&);
};

class ConvolutionBackwardDataImpl: public ConvolutionBackwardData {


+ 1
- 1
dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.cpp

@@ -32,7 +32,7 @@ void create_param(const DeformablePSROIPoolingBase* opr,
p.sample_per_part = param.sample_per_part;
p.trans_std = param.trans_std;
p.scale = param.spatial_scale;
p.nr_cls = p.no_trans ? 1 : trans[0];
p.nr_cls = p.no_trans ? 1 : trans[1] / 2;
p.nr_bbox = rois[0];
p.IC = data[1];
p.IH = data[2];


+ 32
- 3
dnn/src/cuda/relayout_format/opr_impl.cpp

@@ -11,6 +11,7 @@

#include "src/cuda/relayout_format/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
@@ -20,15 +21,22 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
auto src_dtype = src.layout.dtype;
megdnn_assert(
param().mode == param::RelayoutFormat::Mode::NCHW4_CHWN4 ||
param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4,
param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4 ||
param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL ||
param().mode ==
Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT,
"relayout format of cuda only support NCHW4->CHWN4 or "
"CHWN4->NCHW4");
if (src_dtype.enumv() == DTypeEnum::QuantizedS8) {
"CHWN4->NCHW4 or NCHW->NCHW4");
if ((param().mode == param::RelayoutFormat::Mode::NCHW4_CHWN4 ||
param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4) &&
src_dtype.enumv() == DTypeEnum::QuantizedS8) {
size_t row = 0, col = 0;
if (param().mode == Param::RelayoutFormat::Mode::NCHW4_CHWN4) {
row = src.layout[0],
col = src.layout[1] * src.layout[2] * src.layout[3];
} else {
megdnn_assert(param().mode ==
param::RelayoutFormat::Mode::CHWN4_NCHW4);
row = src.layout[0] * src.layout[1] * src.layout[2],
col = src.layout[3];
}
@@ -43,6 +51,27 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
return handle()->create_operator<RelayoutForward>()->exec(trans_in,
trans_out);
}
if ((param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL ||
param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) &&
src.layout[1] % 4 != 0) {
megdnn_assert(src.raw_ptr != dst.raw_ptr && src.layout.ndim == 4,
"The mode of NCHW_NCHW4 and NCHW_NCHW4_CONV_DENSE_WEIGHT "
"of RelayoutFormat opr(cuda backend) does not support "
"src.ptr == dst.ptr");
megdnn_assert(src.layout[1] <= 4);
cuda_check(cudaMemsetAsync(dst.raw_ptr, 0,
dst.layout.span().dist_byte(),
cuda_stream(this->handle())));
TensorLayout exec_dst_layout = dst.layout;
exec_dst_layout[4] = src.layout[1];
TensorLayout exec_src_layout =
src.layout
.reshape({src.layout[0], src.layout[1], 1,
src.layout[2], src.layout[3]})
.dimshuffle({0, 2, 3, 4, 1});
return handle()->create_operator<RelayoutForward>()->exec(
{src.raw_ptr, exec_src_layout}, {dst.raw_ptr, exec_dst_layout});
}
TensorLayout exec_src, exec_dst;
deduce_exec_layout(src.layout, dst.layout, exec_src, exec_dst);
TensorND exec_src_nd{src.raw_ptr, exec_src};


+ 2
- 2
dnn/src/naive/deformable_ps_roi_pooling/opr_impl.cpp

@@ -293,7 +293,7 @@ void Fwd::exec(_megdnn_tensor_in data, _megdnn_tensor_in rois,
float trans_std = param.trans_std, scale = param.spatial_scale;

size_t nr_bbox = rois.layout[0];
size_t nr_cls = no_trans ? 1 : trans.layout[0];
size_t nr_cls = no_trans ? 1 : trans.layout[1] / 2;
size_t IC = data.layout[1], IH = data.layout[2], IW = data.layout[3];

const float* data_ptr = data.ptr<float>();
@@ -339,7 +339,7 @@ void Bwd::exec(_megdnn_tensor_in data, _megdnn_tensor_in rois,
float trans_std = param.trans_std, scale = param.spatial_scale;

size_t nr_bbox = rois.layout[0];
size_t nr_cls = no_trans ? 1 : trans.layout[0];
size_t nr_cls = no_trans ? 1 : trans.layout[1] / 2;
size_t IC = data.layout[1], IH = data.layout[2], IW = data.layout[3];

const float* data_ptr = data.ptr<float>();


+ 1
- 5
dnn/src/naive/handle.cpp

@@ -107,11 +107,7 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle,
m_dispatcher{megcoreGetCPUDispatcher(computing_handle)} {}

size_t HandleImpl::image2d_pitch_alignment() const {
if (type() == Handle::HandleType::NAIVE) {
// only naive CPU handle supports this format
return g_image2d_pitch_alignment;
}
megdnn_throw("Image2DTensorFormat is not supported on this handle");
return g_image2d_pitch_alignment;
}

size_t HandleImpl::exchange_image2d_pitch_alignment(size_t alignment) {


+ 58
- 58
dnn/src/naive/pooling/opr_impl.cpp

@@ -370,65 +370,67 @@ void pooling_backward_max_impl(const ctype* __restrict src,
}
}

} // anonymous namespace
} // namespace

namespace megdnn {
namespace naive {

void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_pooling) {
check_exec(src.layout, dst.layout, workspace.size);
size_t c_pos, spatial_pos, batch_pos = 0;
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW4 ||
param().format == Param::Format::NCHW88 ||
param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW32) {
c_pos = 1;
spatial_pos = 2;
} else if (param().format == Param::Format::NHWC) {
c_pos = 3;
spatial_pos = 1;
} else if (param().format == Param::Format::CHWN4) {
c_pos = 0;
spatial_pos = 1;
batch_pos = 3;
} else {
megdnn_assert(param().format == Param::Format::NHWCD4);
c_pos = 2;
spatial_pos = 1;
}
size_t N = src.layout.shape[batch_pos], C = src.layout.shape[c_pos],
IH = src.layout.shape[spatial_pos + 0],
IW = src.layout.shape[spatial_pos + 1];
size_t OH = dst.layout.shape[spatial_pos + 0],
OW = dst.layout.shape[spatial_pos + 1];
if (param().format == Param::Format::NHWCD4) {
C *= 4;
IW = src.layout.shape[spatial_pos + 2];
OW = dst.layout.shape[spatial_pos + 2];
}
if (param().format == Param::Format::NCHW4 ||
param().format == Param::Format::NCHW44 ||
param().format == Param::Format::CHWN4) {
C *= 4;
}
if (param().format == Param::Format::NCHW88) {
C *= 8;
}
if (param().format == Param::Format::NCHW32) {
C *= 32;
}
size_t PH = param().pad_h, PW = param().pad_w;
size_t FH = param().window_h, FW = param().window_w;
size_t SH = param().stride_h, SW = param().stride_w;
#define DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, IdxGetter) \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(handle()), \
pooling_forward_impl<Pooler MEGDNN_COMMA IdxGetter>( \
sptr, dptr, src.layout.dtype, N, C, IH, IW, OH, OW, PH, \
PW, SH, SW, FH, FW));
check_exec(src.layout, dst.layout, workspace.size);
size_t c_pos, spatial_pos, batch_pos = 0;
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW4 ||
param().format == Param::Format::NCHW88 ||
param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW32) {
c_pos = 1;
spatial_pos = 2;
} else if (param().format == Param::Format::NHWC) {
c_pos = 3;
spatial_pos = 1;
} else if (param().format == Param::Format::CHWN4) {
c_pos = 0;
spatial_pos = 1;
batch_pos = 3;
} else {
megdnn_assert(param().format == Param::Format::NHWCD4);
c_pos = 2;
spatial_pos = 1;
}
size_t N = src.layout.shape[batch_pos], C = src.layout.shape[c_pos],
IH = src.layout.shape[spatial_pos + 0],
IW = src.layout.shape[spatial_pos + 1];
size_t OH = dst.layout.shape[spatial_pos + 0],
OW = dst.layout.shape[spatial_pos + 1];
if (param().format == Param::Format::NHWCD4) {
C *= 4;
IW = src.layout.shape[spatial_pos + 2];
OW = dst.layout.shape[spatial_pos + 2];
}
if (param().format == Param::Format::NCHW4 ||
param().format == Param::Format::NCHW44 ||
param().format == Param::Format::CHWN4) {
C *= 4;
}
if (param().format == Param::Format::NCHW88) {
C *= 8;
}
if (param().format == Param::Format::NCHW32) {
C *= 32;
}
size_t PH = param().pad_h, PW = param().pad_w;
size_t FH = param().window_h, FW = param().window_w;
size_t SH = param().stride_h, SW = param().stride_w;
#define DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, IdxGetter) \
MIDOUT_BEGIN(megdnn_naive_pooling, midout_iv(#Pooler #IdxGetter##_hash)) { \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(handle()), \
pooling_forward_impl<Pooler MEGDNN_COMMA IdxGetter>( \
sptr, dptr, src.layout.dtype, N, C, IH, IW, OH, OW, \
PH, PW, SH, SW, FH, FW)); \
} \
MIDOUT_END();

#define DISPATCH_WITH_POOLER(Pooler) \
switch (param().format) { \
@@ -484,14 +486,12 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
} \
} \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
#undef DISPATCH_WITH_POOLER_AND_IDX_GETTER
#undef DISPATCH_WITH_POOLER
megdnn_assert_internal(0);
}
MIDOUT_END();
megdnn_assert_internal(0);
}

WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle(


+ 75
- 39
dnn/src/naive/relayout_format/opr_impl.cpp

@@ -14,6 +14,10 @@

#include "megdnn/tensor_iter.h"

#include "midout.h"

MIDOUT_DECL(megdnn_naive_relayout_format)

using namespace megdnn;
using namespace naive;

@@ -79,6 +83,7 @@ void padding_to_workspace(_megdnn_tensor_out dst, _megdnn_tensor_in src,
}

cb(Float32, dt_float32);
cb(QuantizedS8, dt_qint8);
default:
megdnn_assert(0);
#undef cb
@@ -138,7 +143,7 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
return n * c * h * w * src.dtype.size();
}
case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT: {
megdnn_assert(src.ndim == 4, "src must be oihw ,nmdim == 5");
megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 5");
megdnn_assert(src[0] % 8 == 0,
"NCHW_NCHW88_CONV_DENSE_WEIGHT oc must align to 8");
if (src[1] % 8 == 0)
@@ -150,7 +155,7 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
return oc * ic * h * w * src.dtype.size();
}
case Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT: {
megdnn_assert(src.ndim == 5, "src must be goihw ,nmdim == 5");
megdnn_assert(src.ndim == 5, "src must be goihw, ndim == 5");
megdnn_assert(src[1] % 8 == 0,
"NCHW_NCHW88_CONV_CHAN_WEIGHT oc per group must "
"align to 8");
@@ -164,7 +169,7 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
return group * ocpg * icpg * h * w * src.dtype.size();
}
case Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT: {
megdnn_assert(src.ndim == 5, "src must be goihw ,nmdim == 5");
megdnn_assert(src.ndim == 5, "src must be goihw, ndim == 5");
if (src[0] % 8 == 0)
return 0;
size_t group = round_up(src[0], 8_z);
@@ -174,6 +179,27 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
size_t w = src[4];
return group * ocpg * icpg * h * w * src.dtype.size();
}

case Param::Mode::NCHW_NCHW4_IC_SMALL: {
if (src[1] % 4 == 0)
return 0;
size_t n = src[0];
size_t c = round_up(src[1], 4_z);
size_t h = src[2];
size_t w = src[3];
return n * c * h * w * src.dtype.size();
}
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT: {
megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 5");
if (src[1] % 4 == 0)
return 0;
size_t oc = src[0];
size_t ic = round_up(src[1], 4_z);
size_t h = src[2];
size_t w = src[3];
return oc * ic * h * w * src.dtype.size();
}

default:
return 0;
}
@@ -200,14 +226,18 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
//! ic % 4 != 0
if ((IC & 0x3)) {
switch (src.layout.dtype.enumv()) {
#define cb(name, ctype) \
case (DTypeEnum::name): { \
ctype* sptr = src.compatible_ptr<ctype>(); \
ctype* dptr = workspace.ptr<ctype>(); \
MEGDNN_DISPATCH_CPU_KERN( \
m_handle, \
padding_src_to_workspace<ctype>(dptr, sptr, N, IC, IH, IW);); \
break; \
#define cb(name, ctype) \
case (DTypeEnum::name): { \
MIDOUT_BEGIN(megdnn_naive_relayout_format, ctype, \
midout_iv(Param::Mode::NCHW_NHWCD4I)) { \
ctype* sptr = src.compatible_ptr<ctype>(); \
ctype* dptr = workspace.ptr<ctype>(); \
MEGDNN_DISPATCH_CPU_KERN( \
m_handle, padding_src_to_workspace<ctype>(dptr, sptr, N, \
IC, IH, IW);); \
} \
MIDOUT_END(); \
break; \
}
cb(Float32, dt_float32);
MEGDNN_INC_FLOAT16(cb(Float16, dt_float16));
@@ -226,14 +256,18 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
size_t FW = src.layout[3];
if ((IC & 0x3)) {
switch (src.layout.dtype.enumv()) {
#define cb(name, ctype) \
case (DTypeEnum::name): { \
ctype* sptr = src.compatible_ptr<ctype>(); \
ctype* dptr = workspace.ptr<ctype>(); \
MEGDNN_DISPATCH_CPU_KERN( \
m_handle, padding_filter_to_workspace<ctype>(dptr, sptr, OC, \
IC, FH, FW);); \
break; \
#define cb(name, ctype) \
case (DTypeEnum::name): { \
MIDOUT_BEGIN(megdnn_naive_relayout_format, ctype, \
midout_iv(Param::Mode::INTER_WEIGHT_DENSEI_DOT)) { \
ctype* sptr = src.compatible_ptr<ctype>(); \
ctype* dptr = workspace.ptr<ctype>(); \
MEGDNN_DISPATCH_CPU_KERN(m_handle, \
padding_filter_to_workspace<ctype>( \
dptr, sptr, OC, IC, FH, FW);); \
} \
MIDOUT_END(); \
break; \
}
cb(Quantized8Asymm, dt_uint8);
cb(QuantizedS8, dt_int8);
@@ -244,33 +278,35 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
exec_src_nd.raw_ptr = workspace.raw_ptr;
}
} else if (param().mode == Param::Mode::NCHW_NCHW88) {
size_t ic = src.layout[1];
if (ic % 8 != 0) {
padding_to_workspace({workspace.raw_ptr, exec_src}, src, 1, 8);
exec_src_nd.raw_ptr = workspace.raw_ptr;
}
#define cb(_idx, _pack_size, _mode) \
MIDOUT_BEGIN(megdnn_naive_relayout_format, \
midout_iv(Param::Mode::_mode)) { \
size_t val = src.layout[_idx]; \
if (val % _pack_size != 0) { \
padding_to_workspace({workspace.raw_ptr, exec_src}, src, _idx, \
_pack_size); \
exec_src_nd.raw_ptr = workspace.raw_ptr; \
} \
} \
MIDOUT_END();
cb(1, 8, NCHW_NCHW88);

} else if (param().mode == Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT) {
megdnn_assert(src.layout[0] % 8 == 0);
size_t ic = src.layout[1];
if (ic % 8 != 0) {
padding_to_workspace({workspace.raw_ptr, exec_src}, src, 1, 8_z);
exec_src_nd.raw_ptr = workspace.raw_ptr;
}
cb(1, 8, NCHW_NCHW88_CONV_DENSE_WEIGHT);
} else if (param().mode == Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT) {
size_t group = src.layout[0];
if (group % 8 != 0) {
padding_to_workspace({workspace.raw_ptr, exec_src}, src, 0, 8_z);
exec_src_nd.raw_ptr = workspace.raw_ptr;
}
cb(0, 8, NCHW_NCHW88_CONV_CHAN_WEIGHT);
} else if (param().mode == Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT) {
megdnn_assert(src.layout[1] % 8 == 0);
size_t ic = src.layout[2];
if (ic % 8 != 0) {
padding_to_workspace({workspace.raw_ptr, exec_src}, src, 2, 8_z);
exec_src_nd.raw_ptr = workspace.raw_ptr;
}
cb(2, 8, NCHW_NCHW88_CONV_GROUP_WEIGHT);
} else if (param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL) {
cb(1, 4, NCHW_NCHW4_IC_SMALL);
} else if (param().mode ==
Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) {
cb(1, 4, NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT);
}
m_handle->relayout_opr()->exec(exec_src_nd, exec_dst_nd, handle());
#undef cb
}

// vim: syntax=cpp.doxygen

+ 22
- 0
dnn/test/cuda/relayout_format.cpp

@@ -8,6 +8,7 @@
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
@@ -30,4 +31,25 @@ TEST_F(CUDA, RELAYOUT_FORMAT) {
checker.execs({{22, 23, 24, 25, 4}, {}});
}

TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{-50, 50};
param::RelayoutFormat param;
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;

for (DType dtype :
std::vector<DType>({dtype::QuantizedS8{0.1f}, dtype::Float32{}})) {
checker.set_dtype(0, dtype).set_rng(0, &rng);

checker.set_param(param).execs({{2, 4, 35, 36}, {}});
checker.set_param(param).execs({{2, 3, 35, 36}, {}});
checker.set_param(param).execs({{2, 1, 35, 36}, {}});
param.mode = param::RelayoutFormat::Mode::
NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT;
checker.set_param(param).execs({{4, 3, 3, 3}, {}});
checker.set_param(param).execs({{4, 4, 3, 3}, {}});
checker.set_param(param).execs({{1, 4, 3, 3}, {}});
}
}

// vim: syntax=cpp.doxygen

+ 1
- 1
python_module/megengine/_internal/__init__.py

@@ -25,7 +25,7 @@ from . import config, craniotome, dtype
from . import global_init as _global_init
from . import helper as _helper
from . import mgb as _detail
from . import opr, opr_param_defs, plugin
from . import opr, opr_extra, opr_param_defs, plugin
from .exc import MegBrainError
from .logconf import get_logger
from .mgb import (


+ 3
- 0
python_module/megengine/_internal/opr_extra.py

@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015-2019 Megvii Inc. All rights reserved.


+ 1
- 0
python_module/megengine/core/function.py

@@ -154,6 +154,7 @@ class Function(metaclass=ABCMeta):
memo[id(self)] = result
for k, v in self.__dict__.items():
setattr(result, k, copy.deepcopy(v, memo))
setattr(result, "saved_tensors", tmp)
self.saved_tensors = tmp
return result



+ 8
- 0
python_module/megengine/core/tensor.py

@@ -235,6 +235,14 @@ class Tensor:
return self.__val.dtype
return self._symvar.dtype

def set_dtype(self, dtype: str = None):
r"""Set the data type of the tensor.
"""
if self.__val is not None:
self.__val = mgb.make_shared(self.device, value=self.astype(dtype).numpy())
elif self.__sym is not None:
self.__sym = self.__sym.astype(dtype)

@property
def _comp_node(self):
if self.__val is not None:


+ 6
- 1
python_module/megengine/data/_queue.py

@@ -26,7 +26,7 @@ def _clear_plasma_store():
# `_PlasmaStoreManager.__del__` will not be called automaticly in subprocess,
# so this function should be called explicitly
global MGE_PLASMA_STORE_MANAGER
if MGE_PLASMA_STORE_MANAGER is not None:
if MGE_PLASMA_STORE_MANAGER is not None and MGE_PLASMA_STORE_MANAGER.refcount == 0:
del MGE_PLASMA_STORE_MANAGER
MGE_PLASMA_STORE_MANAGER = None

@@ -50,6 +50,7 @@ class _PlasmaStoreManager:
stderr=None if debug_flag else subprocess.DEVNULL,
)
self.__initialized = True
self.refcount = 1

def __del__(self):
if self.__initialized and self.plasma_store.returncode is None:
@@ -83,6 +84,8 @@ class PlasmaShmQueue:
"Exception happened in starting plasma_store: {}\n"
"Tips: {}".format(str(e), err_info)
)
else:
MGE_PLASMA_STORE_MANAGER.refcount += 1

self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name

@@ -133,6 +136,8 @@ class PlasmaShmQueue:
def close(self):
self.queue.close()
self.disconnect_client()
global MGE_PLASMA_STORE_MANAGER
MGE_PLASMA_STORE_MANAGER.refcount -= 1
_clear_plasma_store()

def cancel_join_thread(self):
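
The refcount bookkeeping added above amounts to a small acquire/release pattern on a module-level store. A simplified sketch of just that pattern (names shortened and simplified, not the real _queue module):

_STORE = None

class _Store:
    def __init__(self):
        self.refcount = 1   # the creating queue holds the first reference

def acquire():
    global _STORE
    if _STORE is None:
        _STORE = _Store()          # first PlasmaShmQueue starts the store
    else:
        _STORE.refcount += 1       # later queues just bump the refcount
    return _STORE

def release():
    global _STORE
    _STORE.refcount -= 1
    if _STORE is not None and _STORE.refcount == 0:
        _STORE = None              # mirrors _clear_plasma_store(): last queue tears it down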


+ 1
- 1
python_module/megengine/functional/nn.py

@@ -44,7 +44,7 @@ def linear(inp: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor
ret = mgb.opr.matrix_mul(inp, weight, transposeB=True)
ret = ret.reshape(orig_shape[:-1], weight.shape[0])
if bias is not None:
ret += bias
ret += bias.reshape(1, bias.shape[0])
return ret
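
A NumPy illustration of the broadcast the fix spells out: the 1-D bias becomes a (1, out_features) row that broadcasts over the batch dimension of ``ret`` (the shapes here are made up for illustration; plain NumPy would also broadcast the 1-D bias, the reshape just makes the intent explicit for the symbolic tensors):

import numpy as np

ret = np.zeros((8, 16), dtype="float32")   # (batch, out_features), result of the matrix_mul
bias = np.ones((16,), dtype="float32")     # bias stored as a 1-D parameter
ret += bias.reshape(1, bias.shape[0])      # explicit (1, out_features) row, broadcast over batch
print(ret.shape)                           # (8, 16)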




+ 32
- 10
python_module/megengine/jit/__init__.py

@@ -442,17 +442,38 @@ class trace:
Serialize trace to file system.

:param fpath: positional only argument. Path of output file.
:param arg_names: names of the input tensors in the traced function
:param append: whether output is appended to ``fpath``
:param f16_io_f32_comp: whether to use float16 for I/O between oprs and use
:param arg_names: names of the input tensors in the traced function.
:param append: whether output is appended to ``fpath``.
:param optimize_for_inference: whether to enable optimize_for_inference
pass before dump.

:param enable_io16xc32: whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16
:param f16_io_comp: whether to use float16 for both I/O and computation
precision
:param use_nhwcd4: whether to use NHWCD4 data format. This is faster on some
OpenCL devices
:param fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr. This is supported only in NHWCD4 format.
changed to float16.
:param enable_ioc16: whether to use float16 for both I/O and computation
precision.

:param enable_hwcd4: whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
:param enable_nchw88: whether to use NCHW4 data layout. it currently
used in X86 AVX backend.
:param enable_nchw44: whether to use NCHW4 data layout. it currently
used in arm backend.
:param enable_nchw44_dot: whether to use NCHW4 data layout. it currently
used in armv8.2+dotprod backend.
:param enable_nchw4: whether to use NCHW4 data layout. it currently
used in nvidia backend(based on cudnn).
:param enable_nchw32 whether to use NCHW32 data layout. it currently
used in nvidia backend with tensorcore(based on cudnn).
:param enable_chwn4 whether to use CHWN4 data layout. it currently
used in nvidia backend with tensorcore.

:param enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr.
:param enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
"""
if self._status != self._FINISHED:
raise ValueError("not traced")
@@ -475,6 +496,7 @@ class trace:
"enable_nchw88": "use_nchw88",
"enable_nchw32": "use_nchw32",
"enable_nchw44": "use_nchw44",
"enable_nchw44_dot": "use_nchw44_dot",
"enable_chwn4": "use_chwn4",
"enable_fuse_conv_bias_nonlinearity": "fuse_conv_bias_nonlinearity",
"enable_fuse_conv_bias_with_z": "fuse_conv_bias_with_z",


+ 5
- 0
python_module/megengine/module/module.py

@@ -11,6 +11,7 @@ from typing import Any, Callable, Iterable, Optional, Set, Tuple, Union

import numpy as np

from .._internal.dtype import is_quantize
from ..core import Buffer, Parameter, Tensor
from ..logger import get_logger

@@ -460,6 +461,10 @@ class Module(metaclass=ABCMeta):
), "param `{}` shape mismatch, should be {}, get {}".format(
k, var.shape, to_be_load.shape
)
# For quantized dtype, the initialized dtype
# scale/zero_points maybe invalid, use pretrained dtype instead.
if is_quantize(to_be_load.dtype) and is_quantize(var.dtype):
var.set_dtype(to_be_load.dtype)
var.set_value(to_be_load)
loaded.append(k)



+ 15
- 10
python_module/megengine/module/qat/module.py

@@ -37,15 +37,14 @@ class QATModule(Module):
Set quantization related configs with ``qconfig``, including
observer and fake_quant for weight and activation.
"""
self.weight_observer = qconfig.weight_observer()
self.act_observer = qconfig.act_observer()

if qconfig.fake_quant is None:
self.weight_fake_quant = None
self.act_fake_quant = None
else:
self.weight_fake_quant = qconfig.fake_quant(self.weight_observer.dtype)
self.act_fake_quant = qconfig.fake_quant(self.act_observer.dtype)
def safe_call(func):
return func() if func is not None else None

self.weight_observer = safe_call(qconfig.weight_observer)
self.act_observer = safe_call(qconfig.act_observer)
self.weight_fake_quant = safe_call(qconfig.weight_fake_quant)
self.act_fake_quant = safe_call(qconfig.act_fake_quant)

def _apply_fakequant_with_observer(
self, target: Tensor, fake_quant: FakeQuantize, observer: Observer
@@ -77,13 +76,19 @@ class QATModule(Module):
r"""
Get weight's quantization dtype as the method from ``qconfig``.
"""
return self.weight_observer.get_dtype()
if hasattr(self.act_fake_quant, "get_dtype"):
return self.weight_fake_quant.get_dtype()
else:
return self.weight_observer.get_dtype()

def get_activation_dtype(self):
r"""
Get activation's quantization dtype as the method from ``qconfig``.
"""
return self.act_observer.get_dtype()
if hasattr(self.act_fake_quant, "get_dtype"):
return self.act_fake_quant.get_dtype()
else:
return self.act_observer.get_dtype()

@classmethod
@abstractmethod


+ 1
- 0
python_module/megengine/quantization/__init__.py

@@ -12,4 +12,5 @@ from .qconfig import (
calibration_qconfig,
ema_fakequant_qconfig,
min_max_fakequant_qconfig,
tqt_quant_qconfig,
)

+ 116
- 27
python_module/megengine/quantization/fake_quant.py

@@ -5,18 +5,21 @@
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import copy
import math

import numpy as np

from .. import functional as F
from .._internal.dtype import _metadata_dict
from .._internal.dtype import _metadata_dict, get_quantized_dtype
from ..core import Buffer, Function, Parameter
from ..jit import sideeffect
from ..module import Module
from .observer import ObserverMode, Round


class FakeQuantize(Module):
r"""
A module to do quant and dequant according to observer's scale and zero_point.
"""

def __init__(self, dtype: str, enable: bool = True):
class _FakeQuantize(Module):
def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True):
super().__init__()
if not dtype in _metadata_dict.keys():
raise ValueError(
@@ -25,7 +28,10 @@ class FakeQuantize(Module):
)
)
self.dtype = dtype
self.qmin = _metadata_dict[dtype].qmin
self.narrow_range = narrow_range
self.qmin = (
-_metadata_dict[dtype].qmax if narrow_range else _metadata_dict[dtype].qmin
)
self.qmax = _metadata_dict[dtype].qmax
self.enabled = enable

@@ -35,25 +41,108 @@ class FakeQuantize(Module):
def disable(self):
self.enabled = False

def fake_quant_forward(self, inp, q_dict):
return inp

def normal_foward(self, inp, q_dict):
return inp

def forward(self, inp, q_dict):
if self.enabled:
if q_dict["mode"] == ObserverMode.SYMMERTIC:
scale = q_dict["scale"]
# Quant
oup = Round()(inp / scale)
# clip
oup = F.minimum(F.maximum(oup, self.qmin), self.qmax)
# DeQuant
oup = (oup) * scale
return oup
else:
scale = q_dict["scale"]
zero_point = q_dict["zero_point"]
# Quant
oup = Round()(inp / scale) + zero_point
# clip
oup = F.minimum(F.maximum(oup, self.qmin), self.qmax)
# DeQuant
oup = (oup - zero_point) * scale
return oup
return self.fake_quant_forward(inp, q_dict)
else:
return self.normal_foward(inp, q_dict)


class TQT_Function(Function):
def __init__(self, lowerbound, upperbound):
super().__init__()
self.lowerbound = lowerbound
self.upperbound = upperbound

def forward(self, inp, scale):
t = 2 ** scale
# t = F.maximum(t, 1e-4)
inp_scaled = inp / t
inp_clipped = F.maximum(F.minimum(inp_scaled, self.upperbound), self.lowerbound)
inp_rounded = F.round(inp_clipped)
inp_flq = inp_rounded * t
self.save_for_backward(inp_scaled, inp_rounded, t)
return inp_flq

def backward(self, grad_inp_flq):
(inp_scaled, inp_rounded, t) = self.saved_tensors
mask_clip = (inp_scaled < -0.5 + self.lowerbound) + (
inp_scaled > self.upperbound + 0.5
) # mask for accumulating the gradients of |data_scaled|>L
mask_quant = F.abs(
mask_clip - 1
) # mask for accumulating the gradients with |data_scaled|<=L
grad_quant = (
grad_inp_flq * mask_quant * (inp_rounded - inp_scaled)
) # gradient within |data_scaled|<=L
grad_clip = (
grad_inp_flq * mask_clip * inp_rounded
) # gradient with | data_scaled|>L
grad_s = grad_clip.sum() + grad_quant.sum()
# dL/ds = dL/dt * t * ln(2)
grad_s = grad_s * t * math.log(2)
grad_inp = grad_inp_flq * mask_quant
return grad_inp, grad_s
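
# Added note (not part of the diff): the comment "dL/ds = dL/dt * t * ln(2)" follows from the
# parameterization t = 2 ** scale, since dt/dscale = 2 ** scale * ln(2) = t * ln(2); by the chain
# rule dL/dscale = dL/dt * t * ln(2), which is what `grad_s = grad_s * t * math.log(2)` applies.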


class TQT(_FakeQuantize):
"""
TQT: https://arxiv.org/abs/1903.08066 Trained Quantization Thresholds
for Accurate and Efficient Fixed-Point Inference of Deep Neural Networks
"""

def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True):
super().__init__(dtype, narrow_range, enable)
self.scale = Parameter(0.0, dtype=np.float32)

def fake_quant_forward(self, inp, q_dict):
# when enable, TQT will do fakequant forward, finetune the scale
return TQT_Function(self.qmin, self.qmax)(inp, self.scale)

def normal_foward(self, inp, q_dict):
# when disable, TQT will do normal forward, initialize scale weight
tmp_scale = F.maximum(F.abs(q_dict["min_val"]), F.abs(q_dict["max_val"]))
tmp_scale = F.log(tmp_scale / 127) / F.log(2)
F.add_update(self.scale, tmp_scale, alpha=0.0, beta=1.0, bias=0.0)
return inp

def get_dtype(self):
return get_quantized_dtype(self.dtype, 2 ** self.scale.numpy()[0], None)


class FakeQuantize(_FakeQuantize):
r"""
A module to do quant and dequant according to observer's scale and zero_point.

:param dtype: A string indicating the target quantization type of input.
:param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``,
instead of 1 greater. Usually True for weight and False for activation.
:param enable: Whether do ``normal_forward`` or ``fake_quant_forward``.
"""

def fake_quant_forward(self, inp, q_dict):
if q_dict["mode"] == ObserverMode.SYMMERTIC:
scale = q_dict["scale"]
# Quant
oup = Round()(inp / scale)
# clip
oup = F.minimum(F.maximum(oup, self.qmin), self.qmax)
# DeQuant
oup = (oup) * scale
return oup
else:
scale = q_dict["scale"]
zero_point = q_dict["zero_point"]
# Quant
oup = Round()(inp / scale) + zero_point
# clip
oup = F.minimum(F.maximum(oup, self.qmin), self.qmax)
# DeQuant
oup = (oup - zero_point) * scale
return oup
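
A minimal NumPy sketch of the symmetric fake_quant_forward path above, i.e. round, clip to [qmin, qmax], dequantize, with qmin = -qmax when narrow_range is set (standalone math, not the megengine FakeQuantize API; np.round uses round-half-to-even, which may differ from megengine's Round op, and the -(qmax + 1) default only matches qint8-like dtypes). The narrow range matters for weights because a product-sum such as (-128) * (-128) + (-128) * (-128) = 32768 = 2^15 already reaches the overflow case the qconfig docstring below warns about.

import numpy as np

def fake_quant_symmetric(inp, scale, qmax=127, narrow_range=False):
    qmin = -qmax if narrow_range else -(qmax + 1)   # qint8: [-128, 127], or [-127, 127] when narrowed
    q = np.round(inp / scale)                       # Quant
    q = np.clip(q, qmin, qmax)                      # clip
    return q * scale                                # DeQuant

x = np.array([-20.0, 0.04, 20.0], dtype=np.float32)
print(fake_quant_symmetric(x, scale=0.1))                      # approx [-12.8  0.   12.7]
print(fake_quant_symmetric(x, scale=0.1, narrow_range=True))   # approx [-12.7  0.   12.7]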

+ 27
- 8
python_module/megengine/quantization/observer.py

@@ -31,9 +31,11 @@ class Observer(Module):
A base class for Observer Module.

:param dtype: a string indicating to collect scale and zero_point of which dtype
:param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``,
instead of 1 greater. Usually True for weight and False for activation.
"""

def __init__(self, dtype="qint8"):
def __init__(self, dtype: str, narrow_range: bool = False):
super().__init__()
if dtype not in _metadata_dict.keys():
raise ValueError(
@@ -42,7 +44,10 @@ class Observer(Module):
)
)
self.dtype = dtype
self.qmin = _metadata_dict[dtype].qmin
self.narrow_range = narrow_range
self.qmin = (
-_metadata_dict[dtype].qmax if narrow_range else _metadata_dict[dtype].qmin
)
self.qmax = _metadata_dict[dtype].qmax
self.enabled = True

@@ -96,8 +101,14 @@ def create_observer_dict(mode):


class MinMaxObserver(Observer):
def __init__(self, mode=ObserverMode.SYMMERTIC, eps=0.00001, dtype="qint8"):
super().__init__(dtype)
def __init__(
self,
mode=ObserverMode.SYMMERTIC,
eps=0.00001,
dtype="qint8",
narrow_range: bool = False,
):
super().__init__(dtype, narrow_range)
self.mode = mode
self.min_val = Buffer(np.finfo(np.float32).max, dtype=np.float32)
self.max_val = Buffer(np.finfo(np.float32).min, dtype=np.float32)
@@ -107,6 +118,8 @@ class MinMaxObserver(Observer):
min_val = F.minimum(0.0, inp_min_val)
max_val = F.maximum(0.0, inp_max_val)
q_dict = create_observer_dict(self.mode)
q_dict["min_val"] = inp_min_val
q_dict["max_val"] = inp_max_val
if self.mode == ObserverMode.SYMMERTIC:
symmetric_max_vals = F.maximum(-min_val, max_val)
# use maximun to avoid scale too small at the begin
@@ -151,9 +164,14 @@ class MinMaxObserver(Observer):

class ExponentialMovingAverageObserver(MinMaxObserver):
def __init__(
self, momentum=0.9, mode=ObserverMode.SYMMERTIC, eps=0.00001, dtype="qint8"
self,
momentum=0.9,
mode=ObserverMode.SYMMERTIC,
eps=0.00001,
dtype="qint8",
narrow_range: bool = False,
):
super().__init__(mode, eps, dtype)
super().__init__(mode, eps, dtype, narrow_range)
self.momentum = Buffer(momentum)
self.runtime_momentum = Buffer(0.0)

@@ -186,11 +204,12 @@ class HistogramObserver(MinMaxObserver):
self,
bins=2048,
upsample_rate=128,
dtype="qint8",
mode=ObserverMode.SYMMERTIC,
eps=0.00001,
dtype="qint8",
narrow_range: bool = False,
):
super().__init__(mode, eps, dtype)
super().__init__(mode, eps, dtype, narrow_range)
self.bins = bins
self.upsample_rate = upsample_rate
self.dst_nbins = _metadata_dict[dtype].qmax - _metadata_dict[dtype].qmin + 1


+ 49
- 19
python_module/megengine/quantization/qconfig.py

@@ -1,12 +1,14 @@
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
#'
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from functools import partial

from ..module import Module
from .fake_quant import FakeQuantize
from .fake_quant import TQT, FakeQuantize
from .observer import (
ExponentialMovingAverageObserver,
HistogramObserver,
@@ -22,9 +24,9 @@ class QConfig:
:param weight_observer: interface to instantiate an :class:`~.Observer` indicating
how to collect scales and zero_point of wegiht.
:param act_observer: similar to ``weight_observer`` but toward activation.
:param fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating
how to do fake_quant calculation. can be invoked multi times to get different
instance for each target tensor, for better control on enable and disable.
:param weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating
how to do fake_quant calculation.
:param act_observer: similar to ``weight_fake_quant`` but toward activation.

Examples:

@@ -32,14 +34,24 @@ class QConfig:

# Default EMA QConfig for QAT.
ema_fakequant_qconfig = QConfig(
weight_observer=MinMaxObserver,
act_observer=ExponentialMovingAverageObserver,
fake_quant=FakeQuantize,
weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True),
act_observer=partial(ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False),
weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True),
act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False),
)

Each parameter is a ``class`` rather than an instance. And we recommand using ``functools.partial``
to add initialization parameters of the ``class``, so that don't need to provide parameters in
:meth:`~.QATModule.set_qconfig`.

Usually we set ``narrow_range`` of weight related paramters to ``True`` and of activation related
parameters to ``False``. For the result of multiplication and addition as ``a * b + c * d``, if
four variables are all -128 of dtype ``qint8``, then the result will be ``2^15`` and cause overflow.
Weights are commonly calculated in this way, so needed to narrow the range.
"""

def __init__(
self, act_observer, weight_observer, fake_quant,
self, weight_observer, act_observer, weight_fake_quant, act_fake_quant
):
if isinstance(act_observer, Module) or isinstance(weight_observer, Module):
raise ValueError(
@@ -47,24 +59,42 @@ class QConfig:
" class generator using `partial(Observer, ...)` instead. Use"
" partial(MyObserver, x=1) to override arguments to constructor if needed"
)
self.act_observer = act_observer
self.weight_observer = weight_observer
self.fake_quant = fake_quant
self.act_observer = act_observer
self.weight_fake_quant = weight_fake_quant
self.act_fake_quant = act_fake_quant


# Default QAT QConfigs
tqt_quant_qconfig = QConfig(
weight_observer=partial(
ExponentialMovingAverageObserver, dtype="qint8", narrow_range=True
),
act_observer=partial(
ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False
),
weight_fake_quant=partial(TQT, dtype="qint8", narrow_range=True),
act_fake_quant=partial(TQT, dtype="qint8", narrow_range=False),
)

min_max_fakequant_qconfig = QConfig(
weight_observer=MinMaxObserver,
act_observer=MinMaxObserver,
fake_quant=FakeQuantize,
weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True),
act_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=False),
weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True),
act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False),
)

ema_fakequant_qconfig = QConfig(
weight_observer=MinMaxObserver,
act_observer=ExponentialMovingAverageObserver,
fake_quant=FakeQuantize,
weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True),
act_observer=partial(
ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False
),
weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True),
act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False),
)

calibration_qconfig = QConfig(
weight_observer=MinMaxObserver, act_observer=HistogramObserver, fake_quant=None,
weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True),
act_observer=partial(HistogramObserver, dtype="qint8", narrow_range=False),
weight_fake_quant=None,
act_fake_quant=None,
)
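To make the ``narrow_range`` reasoning above concrete, here is a minimal arithmetic sketch (plain numpy, independent of MegEngine) of why four ``qint8`` operands at -128 overflow a signed 16-bit accumulator, while narrowing the weight range to [-127, 127] keeps the worst case representable:

```python
import numpy as np

INT16_MAX = np.iinfo(np.int16).max            # 32767

# Worst case without narrow_range: all four qint8 operands are -128.
a = b = c = d = -128
acc = a * b + c * d                           # 16384 + 16384 = 32768 == 2**15
print(acc, acc > INT16_MAX)                   # 32768 True  -> overflows int16

# With narrow_range the weight range becomes [-127, 127], so the worst case is:
w = -127
acc_narrow = w * (-128) + w * (-128)          # 16256 + 16256 = 32512
print(acc_narrow, acc_narrow > INT16_MAX)     # 32512 False -> still fits in int16
```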

+ 4
- 0
python_module/megengine/utils/__init__.py View File

@@ -6,6 +6,10 @@
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

from megengine._internal.plugin import load_tensor_binary


def prod(iterable):
result = 1
for i in iterable:


+ 1
- 2
python_module/megengine/version.py View File

@@ -1,2 +1 @@
__version__ = "0.4.0"

__version__ = "0.5.1"

+ 0
- 1
python_module/test/unit/core/test_function.py View File

@@ -96,7 +96,6 @@ def test_deepcopy():
origin = Sigmoid(0)
new = copy.deepcopy(Sigmoid(0))
assert new.param == origin.param
assert new.saved_tensors == None


def test_save_context():


+ 46
- 0
python_module/test/unit/core/test_tensor.py View File

@@ -10,6 +10,7 @@ import numpy as np
import pytest

import megengine as mge
import megengine._internal as mgb


def test_wrong_dtype():
@@ -26,3 +27,48 @@ def test_tensor_routine():
mge.tensor([1])

mge.tensor(1.5)


def test_tensor_set_dtype():
def check_dtype_value(tensor, dtype_scale, value):
if mgb.dtype.is_quantize(tensor.dtype):
if np.abs(mgb.dtype.get_scale(tensor.dtype) - dtype_scale) > 1e-5:
raise AssertionError(
"compare scale failed expect {} got {}".format(
dtype_scale, mgb.dtype.get_scale(tensor.dtype)
)
)
if np.abs(tensor.numpy()[0][0] - value) > 1e-5:
raise AssertionError(
"compare value failed expect {} got {}".format(
tensor.numpy()[0][0], value
)
)

t = mge.Parameter(np.ones((3, 4), dtype="float32"))
t.set_dtype(mgb.dtype.qint8(0.1))
check_dtype_value(t, 0.1, 10)

t = mge.Parameter(np.ones((3, 4), dtype=mgb.dtype.qint8(1)))
t.set_dtype(mgb.dtype.qint8(0.3))
check_dtype_value(t, 0.3, 3)

t = mge.Buffer(np.ones((3, 4), dtype="float32"))
t.set_dtype(mgb.dtype.qint8(0.1))
check_dtype_value(t, 0.1, 10)

t = mge.Buffer(np.ones((3, 4), dtype=mgb.dtype.qint8(1)))
t.set_dtype(mgb.dtype.qint8(0.3))
check_dtype_value(t, 0.3, 3)

t = mge.Buffer(np.ones((3, 4), dtype="float32"))
s = t + 1
s.set_dtype(mgb.dtype.qint8(0.2))
check_dtype_value(s, 0.2, 10)

t.set_dtype(mgb.dtype.qint8(0.3))
s = t + 1
s.set_dtype(mgb.dtype.qint8(0.1))
check_dtype_value(s, 0.1, 18)
s.set_dtype("float32")
check_dtype_value(s, 0, 1.8)

+ 49
- 0
python_module/test/unit/data/test_dataloader.py View File

@@ -132,3 +132,52 @@ def test_dataloader_parallel_worker_exception():
with pytest.raises(RuntimeError, match=r"worker.*died"):
data_iter = iter(dataloader)
batch_data = next(data_iter)


def _multi_instances_parallel_dataloader_worker():
dataset = init_dataset()

for divide_flag in [True, False]:
train_dataloader = DataLoader(
dataset,
sampler=RandomSampler(dataset, batch_size=4, drop_last=False),
num_workers=2,
divide=divide_flag,
)
val_dataloader = DataLoader(
dataset,
sampler=RandomSampler(dataset, batch_size=10, drop_last=False),
num_workers=2,
divide=divide_flag,
)
for idx, (data, label) in enumerate(train_dataloader):
assert data.shape == (4, 1, 32, 32)
assert label.shape == (4,)
if idx % 5 == 0:
for val_data, val_label in val_dataloader:
assert val_data.shape == (10, 1, 32, 32)
assert val_label.shape == (10,)


def test_dataloader_parallel_multi_instances():
# set max shared memory to 100M
os.environ["MGE_PLASMA_MEMORY"] = "100000000"

_multi_instances_parallel_dataloader_worker()


def test_dataloader_parallel_multi_instances_multiprocessing():
# set max shared memory to 100M
os.environ["MGE_PLASMA_MEMORY"] = "100000000"

import multiprocessing as mp

# mp.set_start_method("spawn")
processes = []
for i in range(4):
p = mp.Process(target=_multi_instances_parallel_dataloader_worker)
p.start()
processes.append(p)

for p in processes:
p.join()

+ 37
- 0
python_module/test/unit/module/test_module.py View File

@@ -14,8 +14,10 @@ import pytest
from helpers import MLP

import megengine as mge
import megengine._internal as mgb
from megengine.core import Buffer, Parameter, Tensor, tensor
from megengine.module import BatchNorm1d, BatchNorm2d, Conv2d, Module, Sequential
from megengine.quantization.quantize import quantize, quantize_qat
from megengine.test import assertTensorClose


@@ -347,3 +349,38 @@ def test_dump_model():
pred = mlp(data)
with tempfile.NamedTemporaryFile() as f:
mge.dump(pred, f.name)


def test_load_quantized():
data_shape = (2, 28)
data = tensor(np.random.random(data_shape), dtype="float32")
data = data.astype(mgb.dtype.qint8(0.1))
mlp = MLP()
quantize_qat(mlp)
quantize(mlp)
mlp.dense0.weight = Parameter(
mlp.dense0.weight.astype(mgb.dtype.qint8(0.001)).numpy()
)
mlp.dense1.weight = Parameter(
mlp.dense1.weight.astype(mgb.dtype.qint8(0.0002)).numpy()
)
mlp.eval()
pred0 = mlp(data)

with BytesIO() as fout:
mge.save(mlp.state_dict(), fout)
fout.seek(0)
checkpoint = mge.load(fout)
# change mlp weight.
mlp.dense0.weight = Parameter(
mlp.dense0.weight.astype(mgb.dtype.qint8(0.00001)).numpy()
)
mlp.dense1.weight = Parameter(
mlp.dense1.weight.astype(mgb.dtype.qint8(0.2)).numpy()
)
mlp.load_state_dict(checkpoint)
pred1 = mlp(data)

assertTensorClose(
pred0.astype("float32").numpy(), pred1.astype("float32").numpy(), max_err=5e-6
)

+ 77
- 0
python_module/test/unit/quantization/test_TQT.py View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np
import pytest

import megengine as mge
import megengine._internal as mgb
from megengine.core import tensor
from megengine.quantization.fake_quant import TQT_Function
from megengine.test import assertTensorClose


class numpy_TQT_Function:
def __init__(self, lowerbound, upperbound):
super().__init__()
self.lowerbound = lowerbound
self.upperbound = upperbound

def forward(self, inp, scale):
t = 2 ** scale
# t = F.maximum(t, 1e-4)
inp_scaled = inp / t
inp_clipped = np.maximum(
np.minimum(inp_scaled, self.upperbound), self.lowerbound
)
inp_rounded = np.round(inp_clipped)
inp_flq = inp_rounded * t
self.saved_tensors = (inp_scaled, inp_rounded, t)
return inp_flq

def backward(self, grad_inp_flq):
(inp_scaled, inp_rounded, t) = self.saved_tensors
mask_clip = (inp_scaled < -0.5 + self.lowerbound) + (
inp_scaled > self.upperbound + 0.5
) # mask for accumulating the gradients of |data_scaled|>L
mask_quant = np.abs(
mask_clip - 1
) # mask for accumulating the gradients with |data_scaled|<=L
grad_quant = (
grad_inp_flq * mask_quant * (inp_rounded - inp_scaled)
) # gradient within |data_scaled|<=L
grad_clip = (
grad_inp_flq * mask_clip * inp_rounded
) # gradient with |data_scaled|>L
grad_s = grad_clip.sum() + grad_quant.sum()
# dL/ds = dL/dt * t * ln(2)
grad_s = grad_s * t * np.log(2)
grad_inp = grad_inp_flq * mask_quant
return grad_inp, grad_s


def test_TQT():
f = TQT_Function(-127, 127)
nf = numpy_TQT_Function(-127, 127)

def check_inp(a, b, c, a_np, b_np, c_np):
assertTensorClose(
f.forward(a, b).numpy(), nf.forward(a_np, b_np).astype("float32")
)
c1, c2 = f.backward(c)
c1_np, c2_np = nf.backward(c_np)
assertTensorClose(c1.numpy(), c1_np.astype("float32"))
assertTensorClose(c2.numpy(), c2_np.astype("float32"))

a = tensor()
b = tensor()
a_np = np.random.random((4, 3)).astype("float32")
b_np = np.random.random((1)).astype("float32")
a.set_value(a_np)
b.set_value(b_np)
check_inp(a, b, b, a_np, b_np, b_np)
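For reference, the scale-gradient line in the numpy implementation above (``grad_s = grad_s * t * np.log(2)``) follows directly from the parameterization ``t = 2 ** scale``; a short derivation:

$$
t = 2^{s} \;\Rightarrow\; \frac{\partial t}{\partial s} = 2^{s}\ln 2 = t\ln 2,
\qquad
\frac{\partial L}{\partial s} = \frac{\partial L}{\partial t}\cdot\frac{\partial t}{\partial s}
= \frac{\partial L}{\partial t}\, t\,\ln 2 .
$$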

+ 1
- 1
sdk/load-and-run/dump_with_testcase_mge.py View File

@@ -14,7 +14,7 @@ import struct
import cv2
import numpy as np

import megbrain as mgb
import megengine._internal as mgb
import megengine as mge

logger = mge.get_logger(__name__)


+ 39
- 39
sdk/load-and-run/src/mgblar.cpp View File

@@ -709,6 +709,41 @@ void run_test_st(Args &env) {
}
};

auto run_iters = [&](uint32_t case_idx) -> float {
double time_sqrsum = 0, time_sum = 0,
min_time = std::numeric_limits<double>::max(), max_time = 0;
for (int run = 0; run < env.nr_run; ++run) {
mgb_log_debug("load_and_run: before running iter %d", run);
timer.reset();
func->execute();
mgb_log_debug("load_and_run: before waiting iter %d", run);
auto exec_time = timer.get_msecs();
func->wait();
output_dumper.write_to_file();
auto cur = timer.get_msecs();
printf("iter %d/%d: %.3fms (exec=%.3f,device=%.3f)\n", run,
env.nr_run, cur, exec_time,
func->get_prev_exec_time() * 1e3);
time_sum += cur;
time_sqrsum += cur * cur;
fflush(stdout);
if (cur < min_time) {
min_time = cur;
}
if (cur > max_time) {
max_time = cur;
}
}
printf("=== finished test #%u: time=%.3fms avg_time=%.3fms "
"sd=%.3fms minmax=%.3f,%.3f\n\n",
case_idx, time_sum, time_sum / env.nr_run,
std::sqrt((time_sqrsum * env.nr_run - time_sum * time_sum) /
(env.nr_run * (env.nr_run - 1))),
min_time, max_time);
return time_sum;

};

if (nr_test) {
// run testcase, generated by dump_with_testcase.py

@@ -742,37 +777,7 @@ void run_test_st(Args &env) {
if (!env.nr_run) {
continue;
}
double time_sqrsum = 0, time_sum = 0,
min_time = std::numeric_limits<double>::max(), max_time = 0;
for (int run = 0; run < env.nr_run; ++ run) {
mgb_log_debug("load_and_run: before running iter %d", run);
timer.reset();
func->execute();
mgb_log_debug("load_and_run: before waiting iter %d", run);
auto exec_time = timer.get_msecs();
func->wait();
output_dumper.write_to_file();
auto cur = timer.get_msecs();
printf("iter %d/%d: %.3fms (exec=%.3f,device=%.3f)\n", run,
env.nr_run, cur, exec_time,
func->get_prev_exec_time() * 1e3);
time_sum += cur;
time_sqrsum += cur * cur;
fflush(stdout);
if (cur < min_time) {
min_time = cur;
}
if (cur > max_time) {
max_time = cur;
}
}
tot_time += time_sum;
printf("=== finished test #%u: time=%.3fms avg_time=%.3fms "
"sd=%.3fms minmax=%.3f,%.3f\n\n",
i, time_sum, time_sum / env.nr_run,
std::sqrt((time_sqrsum * env.nr_run - time_sum * time_sum) /
(env.nr_run * (env.nr_run - 1))),
min_time, max_time);
tot_time += run_iters(i);
}

printf("=== total time: %.3fms\n", tot_time);
@@ -793,15 +798,10 @@ void run_test_st(Args &env) {
in->copy_from(i.second);
}

warmup();
timer.reset();
func->execute();
auto exec_time = timer.get_msecs();
func->wait();
output_dumper.write_to_file();
auto cur = timer.get_msecs();
printf("%.3fms %.3fms (device=%.3f)\n", cur, exec_time,
func->get_prev_exec_time() * 1e3);

printf("=== going to run input for %d times\n", env.nr_run);
run_iters(0);
} else {
// run speed test for a raw mgb graph
mgb_assert(env.load_ret.tensor_map.empty(),


+ 5
- 0
src/CMakeLists.txt View File

@@ -34,6 +34,11 @@ if(MGE_WITH_CUDA AND MGE_WITH_TRT)
endif()


if(MGE_WITH_CUDA)
file(GLOB_RECURSE SOURCES_ opr/impl/standalone/*.cu)
list(APPEND SOURCES ${SOURCES_})
endif()

add_library(megbrain OBJECT EXCLUDE_FROM_ALL ${SOURCES})
target_link_libraries(megbrain PUBLIC mgb_opr_param_defs)
target_include_directories(megbrain


+ 1
- 1
src/core/impl/comp_node/cpu/comp_node.cpp View File

@@ -795,7 +795,7 @@ bool CpuCompNode::CompNodeImpl::check_global_finalized(const char* reason) {
/* ======================== CompNode methods ======================== */

CompNode CompNode::default_cpu() {
static Locator locator{DeviceType::CPU, Locator::DEVICE_CPU_DEFAULT, -1};
static Locator locator{DeviceType::CPU, Locator::DEVICE_CPU_DEFAULT, {-1}};
static auto empty_queue =
std::make_shared<CpuCompNode::WorkerQueue>(locator);
static CpuCompNodeImpl impl{locator, locator, empty_queue};


+ 1
- 1
src/core/impl/graph/cg_impl.cpp View File

@@ -464,7 +464,7 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
#if MGB_ENABLE_TENSOR_RT
if (options().graph_opt.tensorrt) {
options().graph_opt.tensorrt = false;
tensorrt::transform_dest_vars_inplace(dest_vars);
tensorrt::transform_dest_vars_inplace(dest_vars, options().graph_opt);
}
#endif



+ 2
- 2
src/core/include/megbrain/version.h View File

@@ -12,8 +12,8 @@
#pragma once

#define MGB_MAJOR 8
#define MGB_MINOR 4
#define MGB_PATCH 1
#define MGB_MINOR 5
#define MGB_PATCH 0
//! whether it is development version
#ifndef MGB_IS_DEV
#define MGB_IS_DEV 0


+ 2
- 0
src/gopt/impl/framework.cpp View File

@@ -756,6 +756,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
cb(nchw32, {
add_pass<FuseConvBiasNonlinPass>();
add_pass<FuseConvBiasZPass>();
add_pass(EnableNCHW4Pass::make_nchw4_converter());
add_pass(EnableTensorCorePass::make_tensorcore_converter());
add_pass<ShuffleShuffleRemovePass>();
add_pass<RemoveRedundantTypeCvtPass>();
@@ -763,6 +764,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
cb(chwn4, {
add_pass<FuseConvBiasNonlinPass>();
add_pass<FuseConvBiasZPass>();
add_pass(EnableNCHW4Pass::make_nchw4_converter());
add_pass(EnableCHWN4Pass::make_chwn4_converter());
add_pass<ShuffleShuffleRemovePass>();
add_pass<RemoveRedundantTypeCvtPass>();


+ 209
- 62
src/gopt/impl/tensor_reformat.cpp View File

@@ -60,19 +60,24 @@ MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder,
public:
//! relayout type of this opr
enum class LayoutType {
NCHW4_TO_NCHW32, //!< from nchw4 layout to nchw32 layout
NCHW32_TO_NCHW4, //!< from nchw32 layout to nchw4 layout
NCHW4_TO_CHWN4, //!< from nchw4 layout to chwn4 layout
CHWN4_TO_NCHW4, //!< from chwn4 layout to nchw4 layout
NCHW_TO_NCHW4, //!< from nchw layout to nchw4 layout
NCHW4_TO_NCHW, //!< from nchw4 layout to nchw layout
NCHW_TO_NCHW88, //!< from nchw layout to nchw88 layout
NCHW88_TO_NCHW, //!< from nchw88 layout to nchw layout
NCHW4_TO_NCHW32, //!< from nchw4 layout to nchw32 layout
NCHW32_TO_NCHW4, //!< from nchw32 layout to nchw4 layout
NCHW4_TO_CHWN4, //!< from nchw4 layout to chwn4 layout
CHWN4_TO_NCHW4, //!< from chwn4 layout to nchw4 layout
NCHW_TO_NCHW4, //!< from nchw layout to nchw4 layout
NCHW_TO_NCHW4_IC_SMALL_CONV, ///< from nchw layout to nchw4 whose
///< channel size is less than 4
NCHW4_TO_NCHW, //!< from nchw4 layout to nchw layout
NCHW_TO_NCHW88, //!< from nchw layout to nchw88 layout
NCHW88_TO_NCHW, //!< from nchw88 layout to nchw layout

WEIGHT_NCHW_TO_NCHW4_DENSE, //!< weight from nchw layout to nchw4
//!< layout
WEIGHT_NCHW_TO_NCHW4_GROUP, //!< group weight from nchw layout to
//!< nchw4 layout
WEIGHT_NCHW_TO_NCHW4_DENSE_IC_SMALL_CONV, //!< weight from nchw layout
//!< to nchw4 layout whose
//!< channel size is less than 4

WEIGHT_NCHW_TO_NCHW88_DENSE, //!< weight from nchw layout to nchw88
//!< layout
@@ -177,11 +182,21 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
dst[3] = inp_shape[2];
dst[4] = inp_shape[4];
} else if (layout_type() ==
RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW4){
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW4 ||
layout_type() == RelayoutPlaceholder::LayoutType::
NCHW_TO_NCHW4_IC_SMALL_CONV) {
if (layout_type() ==
RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW4) {
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
} else {
mgb_assert(layout_type() ==
RelayoutPlaceholder::LayoutType::
NCHW_TO_NCHW4_IC_SMALL_CONV);
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] < 4);
}
dst.ndim = 5;
dst[0] = inp_shape[0];
dst[1] = inp_shape[1] / 4;
dst[1] = (inp_shape[1] + 4 - 1) / 4;
dst[2] = inp_shape[2];
dst[3] = inp_shape[3];
dst[4] = 4;
@@ -194,11 +209,23 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
dst[2] = inp_shape[2];
dst[3] = inp_shape[3];
} else if (layout_type() == RelayoutPlaceholder::LayoutType::
WEIGHT_NCHW_TO_NCHW4_DENSE) {
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
WEIGHT_NCHW_TO_NCHW4_DENSE ||
layout_type() ==
RelayoutPlaceholder::LayoutType::
WEIGHT_NCHW_TO_NCHW4_DENSE_IC_SMALL_CONV) {
if (layout_type() ==
RelayoutPlaceholder::LayoutType::WEIGHT_NCHW_TO_NCHW4_DENSE) {
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
} else {
mgb_assert(layout_type() ==
RelayoutPlaceholder::LayoutType::
WEIGHT_NCHW_TO_NCHW4_DENSE_IC_SMALL_CONV);
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] < 4);
}

dst.ndim = 5;
dst[0] = inp_shape[0];
dst[1] = inp_shape[1] / 4;
dst[1] = (inp_shape[1] + 4 - 1) / 4;
dst[2] = inp_shape[2];
dst[3] = inp_shape[3];
dst[4] = 4;
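The change from ``inp_shape[1] / 4`` to ``(inp_shape[1] + 4 - 1) / 4`` in both hunks above is integer ceiling division, so an input with fewer than 4 channels still maps to one 4-element channel group (with the remainder padded). A one-line sanity check:

```python
for ic in [1, 2, 3, 4, 8]:
    print(ic, (ic + 4 - 1) // 4)   # 1->1, 2->1, 3->1, 4->1, 8->2
```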
@@ -427,6 +454,23 @@ void TensorReformatPass::translate_pass(OptState& opt) const {
auto y2 = opr::Reshape::make(y1, tshp1);
return y2.node();
};

reformat[LayoutType::NCHW_TO_NCHW4_IC_SMALL_CONV] =
[](VarNode* inp) -> VarNode* {
auto x = SymbolVar(inp);
auto y = opr::RelayoutFormat::make(
x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL);
return y.node();
};
reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_DENSE_IC_SMALL_CONV] =
[](VarNode* inp) -> VarNode* {
auto x = SymbolVar(inp);
auto y = opr::RelayoutFormat::make(
x, megdnn::param::RelayoutFormat::Mode::
NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT);
return y.node();
};

reformat[LayoutType::NCHW_TO_NCHW4] = [](VarNode* inp) -> VarNode* {
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
@@ -435,13 +479,10 @@ void TensorReformatPass::translate_pass(OptState& opt) const {
return opr::IndexAt::make(xshp, {{0, cv(idx)}});
};
auto tshp0 = opr::Concat::make(
{sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0),
tshp1 = opr::Concat::make(
{sub(0), sub(1) / 4, sub(2), sub(3), cv(4)}, 0);
{sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
auto y0 = opr::Reshape::make(x, tshp0);
auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
auto y2 = opr::Reshape::make(y1, tshp1);
return y2.node();
return y1.node();
};
reformat[LayoutType::NCHW4_TO_NCHW] = [](VarNode* inp) -> VarNode* {
auto x = SymbolVar(inp);
@@ -455,7 +496,8 @@ void TensorReformatPass::translate_pass(OptState& opt) const {
auto y1 = opr::Reshape::make(y0, tshp0);
return y1.node();
};
reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_DENSE] = [](VarNode* inp) -> VarNode* {
reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_DENSE] =
[](VarNode* inp) -> VarNode* {
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
auto cv = [&x](int v) { return x.make_scalar(v); };
@@ -471,7 +513,8 @@ void TensorReformatPass::translate_pass(OptState& opt) const {
auto y2 = opr::Reshape::make(y1, tshp1);
return y2.node();
};
reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_GROUP] = [](VarNode* inp) -> VarNode* {
reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_GROUP] =
[](VarNode* inp) -> VarNode* {
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
auto cv = [&x](int v) { return x.make_scalar(v); };
@@ -1357,56 +1400,71 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
using RelayoutMode = RelayoutPlaceholder::LayoutType;
megdnn::param::Convolution::Format conv_format =
megdnn::param::Convolution::Format::NCHW4;
megdnn::param::ConvBias::Format conv_bias_format =
megdnn::param::ConvBias::Format conv_bias_format =
megdnn::param::ConvBias::Format::NCHW4;
megdnn::param::BatchConvBias::Format batch_conv_bias_format =
megdnn::param::BatchConvBias::Format::NCHW4;
RelayoutMode src_to_nchw4_mode = RelayoutMode::NCHW_TO_NCHW4;
RelayoutMode src_to_nchw_mode = RelayoutMode::NCHW4_TO_NCHW;
RelayoutMode weight_to_nchw4_mode_dense =
RelayoutMode weight_to_nchw4_mode_dense =
RelayoutMode::WEIGHT_NCHW_TO_NCHW4_DENSE;
RelayoutMode weight_to_nchw4_mode_group =
RelayoutMode weight_to_nchw4_mode_group =
RelayoutMode::WEIGHT_NCHW_TO_NCHW4_GROUP;
auto trans_nchw4 = [weight_to_nchw4_mode_dense,
weight_to_nchw4_mode_group](

struct ConvMode {
RelayoutMode weight;
RelayoutMode src;
};

auto trans_nchw4 =
[weight_to_nchw4_mode_dense, weight_to_nchw4_mode_group,
src_to_nchw4_mode](
const megdnn::param::Convolution::Sparse conv_mode,
const VarNode* filter) -> RelayoutMode {
const VarNode* filter) -> ConvMode {
if (conv_mode == megdnn::param::Convolution::Sparse::DENSE) {
mgb_assert(filter->shape().ndim == 4,
"The origin filter is not NCHW mode");
size_t IC = filter->shape()[1];
mgb_assert(IC % 4 == 0,
"The input channel should be divisible by 4");
return weight_to_nchw4_mode_dense;
if (IC < 4) {
return {RelayoutMode::WEIGHT_NCHW_TO_NCHW4_DENSE_IC_SMALL_CONV,
RelayoutMode::NCHW_TO_NCHW4_IC_SMALL_CONV};
} else {
return {weight_to_nchw4_mode_dense, src_to_nchw4_mode};
}
} else {
mgb_assert(conv_mode == megdnn::param::Convolution::Sparse::GROUP);
mgb_assert(filter->shape().ndim == 5,
"The origin filter if not NCHW mode");
size_t IC = filter->shape()[2];
mgb_assert(IC % 4 == 0,
"The input channel should be divisible by 4");
return weight_to_nchw4_mode_group;
"The input channel should be divisible by 4 for group "
"conv");
return {weight_to_nchw4_mode_group, src_to_nchw4_mode};
}
};
auto replace_conv_opr = [trans_nchw4, conv_format, src_to_nchw4_mode](
OperatorNodeBase* opr, const VarNodeArray& new_inp) {
auto replace_conv_opr = [trans_nchw4, conv_format](
OperatorNodeBase* opr,
const VarNodeArray& new_inp) {
mgb_assert(opr->input().size() == new_inp.size());
auto& conv_opr = opr->cast_final_safe<opr::ConvolutionForward>();
mgb_assert(conv_opr.param().format ==
megdnn::param::Convolution::Format::NCHW,
"ConvertFormat Pass only support converting NCHW to NCHW4");
if (conv_opr.param().format !=
megdnn::param::Convolution::Format::NCHW) {
return serialization::copy_opr_shallow(*opr, new_inp,
opr->config());
}
auto conv_mode =
trans_nchw4(conv_opr.param().sparse, new_inp[1]);
VarNode *conv_src = new_inp[0], *conv_filter = new_inp[1];
// src: NCHW --> NCHW4
if (new_inp[0]->shape().ndim != 5) {
mgb_assert(new_inp[0]->shape().ndim == 4);
auto new_src = RelayoutPlaceholder::make(new_inp[0],
src_to_nchw4_mode);
auto new_src =
RelayoutPlaceholder::make(new_inp[0], conv_mode.src);
conv_src = new_src.node();
}
// weight: NCHW --> NCHW4
auto weight_mode =
trans_nchw4(conv_opr.param().sparse, new_inp[1]);
auto new_filter = RelayoutPlaceholder::make(new_inp[1], weight_mode);
auto new_filter =
RelayoutPlaceholder::make(new_inp[1], conv_mode.weight);
conv_filter = new_filter.node();
// format: NCHW --> NCHW4
auto new_param = conv_opr.param();
@@ -1428,7 +1486,13 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
mgb_assert(opr->input().size() == new_inp.size());
auto& batch_conv_bias_opr =
opr->cast_final_safe<opr::BatchConvBiasForward>();
mgb_assert(batch_conv_bias_opr.param().format ==
if (batch_conv_bias_opr.param().format !=
megdnn::param::BatchConvBias::Format::NCHW) {
return serialization::copy_opr_shallow(*opr, new_inp,
opr->config());
}

mgb_assert(batch_conv_bias_opr.param().format ==
megdnn::param::BatchConvBias::Format::NCHW,
"ConvertFormat Pass only support converting NCHW to NCHW4");
// what should be converted: src, weight
@@ -1491,26 +1555,30 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
};
auto replace_conv_bias_opr = [trans_nchw4, conv_bias_format,
src_to_nchw4_mode](
OperatorNodeBase* opr,
const VarNodeArray& new_inp) {
OperatorNodeBase* opr,
const VarNodeArray& new_inp) {
mgb_assert(opr->input().size() == new_inp.size());
auto& conv_bias_opr = opr->cast_final_safe<opr::ConvBiasForward>();
mgb_assert(conv_bias_opr.param().format ==
megdnn::param::ConvBias::Format::NCHW,
"ConvertFormat Pass only support converting NCHW to NCHW4");
if (conv_bias_opr.param().format !=
megdnn::param::Convolution::Format::NCHW) {
return serialization::copy_opr_shallow(*opr, new_inp,
opr->config());
}

// what should be converted: src, weight
VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1];
auto conv_mode =
trans_nchw4(conv_bias_opr.param().sparse, new_inp[1]);
// src: NCHW --> NCHW4
if (new_inp[0]->shape().ndim !=5) {
if (new_inp[0]->shape().ndim != 5) {
mgb_assert(new_inp[0]->shape().ndim == 4);
auto new_src = RelayoutPlaceholder::make(new_inp[0],
src_to_nchw4_mode);
auto new_src =
RelayoutPlaceholder::make(new_inp[0], conv_mode.src);
conv_bias_src = new_src.node();
}
// weight: NCHW --> NCHW4 or GNCHW --> GNCHW4
auto weight_mode =
trans_nchw4(conv_bias_opr.param().sparse, new_inp[1]);
auto new_filter = RelayoutPlaceholder::make(new_inp[1], weight_mode);
auto new_filter =
RelayoutPlaceholder::make(new_inp[1], conv_mode.weight);
conv_bias_filter = new_filter.node();
// format: NCHW --> NCHW4
auto new_param = conv_bias_opr.param();
@@ -1527,8 +1595,8 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
// bias: NCHW --> NCHW4
VarNode* conv_bias_bias = new_inp[2];
if (new_inp[2]->shape().ndim == 4) {
auto new_bias = RelayoutPlaceholder::make(new_inp[2],
src_to_nchw4_mode);
auto new_bias =
RelayoutPlaceholder::make(new_inp[2], src_to_nchw4_mode);
conv_bias_bias = new_bias.node();
}
if (new_inp.size() == 3) {
@@ -1543,8 +1611,8 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
// z_inp: NCHW --> NCHW4
VarNode* z_inp = new_inp[3];
if (new_inp[3]->shape().ndim == 4) {
auto new_z = RelayoutPlaceholder::make(new_inp[3],
src_to_nchw4_mode);
auto new_z =
RelayoutPlaceholder::make(new_inp[3], src_to_nchw4_mode);
z_inp = new_z.node();
}
auto new_conv_bias_opr = opr::ConvBias::make(conv_bias_src,
@@ -1599,18 +1667,100 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
}
return serialization::copy_opr_shallow(*opr, temp_inp, opr->config());
};
auto replace_pooling_opr = [](OperatorNodeBase* opr,
const VarNodeArray& new_inp) {
using Param = opr::PoolingForward::Param;
using Format = Param::Format;
mgb_assert(opr->input().size() == new_inp.size());
auto& pooling = opr->cast_final_safe<opr::PoolingForward>();
if (pooling.param().format != Format::NCHW) {
return opr;
}
if (new_inp[0]->shape().ndim == 5) {
mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8);
auto new_param = pooling.param();
new_param.format = Format::NCHW4;
auto new_pooling =
opr::PoolingForward::make(new_inp[0], new_param, opr->config());
mgb_assert(new_pooling.shape().ndim == 5,
"out var of Pooling opr after transform must be 5 (got: "
"%zu).",
new_pooling.shape().ndim);
return new_pooling.node()->owner_opr();
}
auto new_opr =
serialization::copy_opr_shallow(*opr, new_inp, opr->config());
return new_opr;
};
auto replace_resize_opr = [](OperatorNodeBase* opr,
const VarNodeArray& new_inp) {
using Param = opr::ResizeForward::Param;
using Format = Param::Format;
mgb_assert(opr->input().size() == new_inp.size());
auto& resize = opr->cast_final_safe<opr::ResizeForward>();
if (new_inp[0]->shape().ndim == 5) {
mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8);
auto new_param = resize.param();
new_param.format = Format::NCHW4;
auto new_resize = opr::ResizeForward::make(
new_inp[0], new_inp[1], new_param, opr->config());
mgb_assert(new_resize.shape().ndim == 5,
"out var of Resize opr after transform must be 5 (got: "
"%zu).",
new_resize.shape().ndim);
return new_resize.node()->owner_opr();
}
auto new_opr =
serialization::copy_opr_shallow(*opr, new_inp, opr->config());
return new_opr;
};
auto replace_warp_perspective_opr = [](OperatorNodeBase* opr,
const VarNodeArray& new_inp) {
using Param = opr::WarpPerspective::Param;
using Format = Param::Format;
mgb_assert(opr->input().size() == new_inp.size());
auto& warp = opr->cast_final_safe<opr::WarpPerspectiveForward>();
if (new_inp[0]->shape().ndim == 5) {
mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8);
auto new_param = warp.param();
new_param.format = Format::NCHW4;
SymbolVar new_warp;
if (new_inp.size() == 3) {
new_warp = opr::WarpPerspectiveForward::make(
new_inp[0], new_inp[1], nullptr, new_inp[2], new_param,
opr->config());
} else {
mgb_assert(new_inp.size() == 4);
new_warp = opr::WarpPerspectiveForward::make(
new_inp[0], new_inp[1], new_inp[2], new_inp[3],
new_param, opr->config());
}
mgb_assert(new_warp.shape().ndim == 5,
"out var of WarpPerspective opr after transform must be "
"5 (got: "
"%zu).",
new_warp.shape().ndim);
return new_warp.node()->owner_opr();
}
auto new_opr =
serialization::copy_opr_shallow(*opr, new_inp, opr->config());
return new_opr;
};
auto&& replace_func = ret->m_opr_replace_func;
//! supported nchw4
replace_func[opr::Convolution::typeinfo()] = replace_conv_opr;
replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr;
replace_func[opr::BatchConvBias::typeinfo()] =
replace_batch_conv_bias_opr;
replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
replace_func[opr::ResizeForward::typeinfo()] = replace_resize_opr;
replace_func[opr::WarpPerspectiveForward::typeinfo()] =
replace_warp_perspective_opr;
replace_func[opr::Elemwise::typeinfo()] = replace_elemwise_opr;
replace_func[opr::TypeCvt::typeinfo()] = replace_elemwise_opr;
replace_func[opr::ElemwiseMultiType::typeinfo()] = replace_elemwise_opr;
replace_func[opr::PowC::typeinfo()] = replace_elemwise_opr;
//! not supported nchw4
replace_func[opr::PoolingForward::typeinfo()] = relayout_inp_to_nchw;
replace_func[opr::Concat::typeinfo()] = relayout_inp_to_nchw;
replace_func[opr::ConvolutionBackwardData::typeinfo()] =
relayout_inp_to_nchw;
@@ -1620,9 +1770,6 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
replace_func[opr::Reduce::typeinfo()] = relayout_inp_to_nchw;
replace_func[opr::AssertEqual::typeinfo()] = relayout_inp_to_nchw;
replace_func[opr::IncrSubtensor::typeinfo()] = relayout_inp_to_nchw;
replace_func[opr::ResizeForward::typeinfo()] = relayout_inp_to_nchw;
replace_func[opr::WarpPerspectiveForward::typeinfo()] =
relayout_inp_to_nchw;
replace_func[opr::WarpAffineForward::typeinfo()] = relayout_inp_to_nchw;
return ret;
}


+ 319
- 50
src/gopt/test/inference.cpp View File

@@ -1512,6 +1512,7 @@ TEST_PASS(FuseConvBiasNonlinPass, Basic) {


#if MGB_CUDA

TEST(TestEnableTensorCore, SmallInputShape) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
@@ -1579,6 +1580,104 @@ TEST(TestEnableTensorCore, SmallInputShape) {
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}


TEST(TestEnableTensorCore, Nchw4Nchw) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};

auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
size_t H, size_t W) -> TensorShape {
mgb_assert(C % 4 == 0);
if (format == opr::ConvBias::Param::Format::NCHW4) {
return {N, C / 4, H, W, 4};
} else {
mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
return {N, C, H, W};
}
};

for (auto format : {opr::ConvBias::Param::Format::NCHW,
opr::ConvBias::Param::Format::NCHW4}) {
auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
dtype::QuantizedS8(2.5f)),
w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
dtype::QuantizedS8(2.5f)),
b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
dtype::QuantizedS32(6.25f)),
z = mkcvar("b1", mkshape(format, 32, 64, 8, 8),
dtype::QuantizedS8(2.5f));
opr::ConvBias::Param param;
param.format = format;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 1;

auto y = opr::ConvBias::make(
x, w, b, z, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y = opr::ConvBias::make(y, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y = opr::TypeCvt::make(y, dtype::Float32());

SymbolVar y_opt;
SymbolVar y_no_tc;
{
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
}
{
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_fuse_conv_bias_nonlinearity();
unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
}
auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
std::string json_name;
ASSERT_EQ(2u, nr_dimshuffle);
if (format == opr::ConvBias::Param::Format::NCHW4) {
json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json";
} else {
mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
json_name = "TestGoptInference.Nchw4Nchw.NCHW.json";
}

graph->compile({{y_opt, {}}})
->to_json()
->writeto_fpath(output_file(json_name.c_str()));
HostTensorND host_y, host_y_opt;
auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
}

TEST(TestEnableTensorCore, ConvBiasWithZ) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
@@ -2043,53 +2142,74 @@ TEST(TestGoptInference, EnableCHWN4) {
.rename(name),
dtype);
};
auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
size_t H, size_t W) -> TensorShape {
mgb_assert(C % 4 == 0);
if (format == opr::ConvBias::Param::Format::NCHW4) {
return {N, C / 4, H, W, 4};
} else {
mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
return {N, C, H, W};
}
};

auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW4;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
for (auto format : {opr::ConvBias::Param::Format::NCHW,
opr::ConvBias::Param::Format::NCHW4}) {
auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
dtype::QuantizedS8(2.5f)),
w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
dtype::QuantizedS8(2.5f)),
b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
dtype::QuantizedS32(6.25f)),
b1 = mkvar("b1", mkshape(format, 32, 64, 16, 16),
dtype::QuantizedS8(2.5f));
opr::ConvBias::Param param;
param.format = format;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;

auto y = opr::ConvBiasForward::make(
x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y1 = opr::ElemwiseMultiType::make(
{y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y2 = opr::ConvBiasForward::make(
y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y3 = opr::ElemwiseMultiType::make(
{y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y4 = opr::ElemwiseMultiType::make(
{y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
y4 = opr::ElemwiseMultiType::make(
{y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
y4 = opr::TypeCvt::make(y4, dtype::Float32());
SymbolVar y_opt;
SymbolVar y_cudnn;
{
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_chwn4();
unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
auto y = opr::ConvBiasForward::make(
x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y1 = opr::ElemwiseMultiType::make(
{y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y2 = opr::ConvBiasForward::make(
y, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y3 = opr::ElemwiseMultiType::make(
{y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y4 = opr::ElemwiseMultiType::make(
{y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
y4 = opr::ElemwiseMultiType::make(
{y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
y4 = opr::TypeCvt::make(y4, dtype::Float32());
SymbolVar y_opt;
SymbolVar y_cudnn;
{
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_chwn4();
unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
}
unpack_vector(gopt::GraphOptimizer{}
.add_pass<gopt::FuseConvBiasNonlinPass>()
.add_pass<gopt::FuseConvBiasZPass>()
.apply({{y4}})
.endpoint_vars(),
y_cudnn);

ASSERT_EQ(opr::ConvBias::Param::Format::CHWN4,
find_opr<opr::ConvBias>(y_opt).param().format);
HostTensorND host_y, host_y_opt;
auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
unpack_vector(gopt::GraphOptimizer{}
.add_pass<gopt::FuseConvBiasNonlinPass>()
.add_pass<gopt::FuseConvBiasZPass>()
.apply({{y4}})
.endpoint_vars(),
y_cudnn);

HostTensorND host_y, host_y_opt;
auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}

TEST(TestGoptInference, EnableCHWN4WarpPespective) {
@@ -2430,14 +2550,16 @@ TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
auto conv1 = opr::ConvBiasForward::make(
x, w1, b1, param_conv_bias, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
x, w1, b1, param_conv_bias, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
// group
// icpg != 1 && ocpg != 1
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
auto conv2 = opr::ConvBiasForward::make(conv1, w2, b2,
param_conv_bias, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto conv2 = opr::ConvBiasForward::make(
conv1, w2, b2, param_conv_bias, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});

auto y = opr::TypeCvt::make(conv2, dtype::Float32());

@@ -2450,11 +2572,13 @@ TEST(TestGoptInference, ConvertFormatNCHW4GPU) {

ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
find_opr<opr::ConvBias>(y_opt).param().format);
auto nr_reshape = find_opr_num<mgb::opr::Reshape>(y_opt);
ASSERT_EQ(2u, nr_reshape);

graph->compile({{y_opt, {}}})
->to_json()
->writeto_fpath(
output_file("TestGoptInference.ConvertFormatNCHW4GPU.json"));
->writeto_fpath(output_file(
"TestGoptInference.ConvertFormatNCHW4GPU.json"));

HostTensorND host_y, host_y_opt;
auto func = graph->compile({make_callback_copy(y, host_y),
@@ -2465,6 +2589,90 @@ TEST(TestGoptInference, ConvertFormatNCHW4GPU) {

#endif

TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
auto cn = CompNode::load("xpu0");
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto mkcvarf32 = [&](const char* name, const TensorShape& shp) {
return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name);
};

auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
opr::ConvBias::Param param_conv_bias;
param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
// dense
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
auto conv1 = opr::ConvBiasForward::make(
x, w1, b1, param_conv_bias, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
// test Resize
auto shape_of = opr::GetVarShape::make(x);
auto subtensor = opr::Subtensor::make(
shape_of, {opr::Subtensor::AxisIndexer::make_interval(
0, x.make_scalar(2), None, x.make_scalar(1))});
opr::Resize::Param param_resize;
param_resize.format = opr::Resize::Param::Format::NCHW;
auto resize = opr::ResizeForward::make(conv1, subtensor * 2, param_resize);
// test WarpPerspective
auto mat = mkcvarf32("mat", {2, 3, 3}),
warp = opr::WarpPerspectiveForward::make(
resize, mat, nullptr, cg::var_from_tensor_shape(x, {32, 32}));
opr::Pooling::Param pool_param;
pool_param.format = opr::Pooling::Param::Format::NCHW;
// test Pooling
auto pool = opr::Pooling::make(warp, pool_param);
// group
// icpg != 1 && ocpg != 1
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
auto conv2 = opr::ConvBiasForward::make(
pool, w2, b2, param_conv_bias, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});

auto add = opr::ElemwiseMultiType::make(
{conv1, conv2}, {opr::ElemwiseMultiType::Param::Mode::QADD},
OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
auto y = opr::TypeCvt::make(add, dtype::Float32());

SymbolVar y_opt;
{
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_nchw4();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
}
auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
ASSERT_EQ(2u, nr_dimshuffle);
ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
find_opr<opr::ConvBias>(y_opt).param().format);
ASSERT_EQ(opr::ResizeForward::Param::Format::NCHW4,
find_opr<opr::ResizeForward>(y_opt).param().format);
ASSERT_EQ(opr::WarpPerspectiveForward::Param::Format::NCHW4,
find_opr<opr::WarpPerspectiveForward>(y_opt).param().format);
ASSERT_EQ(opr::PoolingForward::Param::Format::NCHW4,
find_opr<opr::PoolingForward>(y_opt).param().format);
}

TEST(TestGoptInference, ConvertFormatNCHW4) {
HostTensorGenerator<> gen;
auto cn = CompNode::load("cpu0");
@@ -2479,7 +2687,7 @@ TEST(TestGoptInference, ConvertFormatNCHW4) {
};

auto x = mkvar("x", {2, 4, 16, 16});
// ConvBias
// ConvBias test dense
opr::ConvBias::Param param_conv_bias;
param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
@@ -2517,6 +2725,67 @@ TEST(TestGoptInference, ConvertFormatNCHW4) {
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, ConvertFormatNCHW4Ic3) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{
1.2f, 127 * 127};
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp))
.rename(name),
dtype);
};

auto x = mkvar("x", {2, 3, 16, 16}, dtype::QuantizedS8(2.5f));
// ConvBias test dense
opr::ConvBias::Param param_conv_bias;
param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
auto w1 = mkcvar("w1", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
auto conv1 =
opr::ConvBias::make(x, w1, b1, param_conv_bias, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
auto conv2 =
opr::ConvBias::make(conv1, w2, b2, param_conv_bias, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto y = opr::TypeCvt::make(conv2, dtype::Float32());

SymbolVar y_opt;
{
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_nchw4();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
}

ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
find_opr<opr::ConvBias>(y_opt).param().format);

graph->compile({{y_opt, {}}})
->to_json()
->writeto_fpath(output_file(
"TestGoptInference.ConvertFormatNCHW4Ic3.json"));

HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}


TEST(TestGoptInference, ConvertFormatNCHW88) {
HostTensorGenerator<> gen;
auto cn = CompNode::load("cpu0");


+ 5
- 0
src/opr/impl/mgb_cpp_opr.fbs View File

@@ -55,3 +55,8 @@ struct IndexDescMaskItem {
table IndexDescMaskDump {
items:[IndexDescMaskItem];
}

table NMSKeep {
iou_thresh:float;
max_output:uint;
}

+ 69
- 68
src/serialization/impl/schema.fbs View File

@@ -30,74 +30,75 @@ table Blob {
table Reserved0 {}

union OperatorParam {
param.Empty,
param.Axis,
param.Convolution,
param.MaskPropagate,
param.ConvPooling,
param.ConvBias,
param.SeparableConv,
param.Images2Neibs,
param.Pooling,
param.LRN,
param.BN,
param.ROIPooling,
param.WarpPerspective,
param.SpatialTfGridGenerator,
param.SpatialTfSampler,
param.MGBAddUpdate,
param.Elemwise,
param.ElemwiseMultiType,
param.PowC,
param.MatrixMul,
param.Winograd,
param.SVD,
param.Reduce,
param.Cumsum,
param.CondTake,
param.Argsort,
param.IndexingRemap,
param.MGBSleep,
param.Linspace,
param.LinspaceFull,
param.Eye,
param.UniformRNG,
param.GaussianRNG,
param.Flip,
param.Rotate,
param.ROICopy,
param.CvtColor,
param.WarpAffine,
param.GaussianBlur,
param.Resize,
param.Remap,
param.Convolution3D,
param.Conv3DBias,
param.SeparableConv3D,
param.TopK,
param.RelayoutFormat,
param.SeparableFilter,
param.LocalShare,
param.ROIAlign,
param.DeformablePSROIPooling,
param.BatchConvBias,
param.DType,
param.PersistentOutputStorage,
param.OptionalAxis,
param.OptionalAxisV1,
param.ExecutionPolicy,
param.AssertEqual,
Reserved0,
param.CollectiveComm,
param.CondExecPred,
param.CondExecPredLogical,
param.CondExecMark,
param.CondExecMerge,
param.Host2DeviceCopy,
param.Dimshuffle,
param.AxisAddRemove,
param.IndexDescMaskDump,
DType,
param.Empty = 1,
param.Axis = 2,
param.Convolution = 3,
param.MaskPropagate = 4,
param.ConvPooling = 5,
param.ConvBias = 6,
param.SeparableConv = 7,
param.Images2Neibs = 8,
param.Pooling = 9,
param.LRN = 10,
param.BN = 11,
param.ROIPooling = 12,
param.WarpPerspective = 13,
param.SpatialTfGridGenerator = 14,
param.SpatialTfSampler = 15,
param.MGBAddUpdate = 16,
param.Elemwise = 17,
param.ElemwiseMultiType = 18,
param.PowC = 19,
param.MatrixMul = 20,
param.Winograd = 21,
param.SVD = 22,
param.Reduce = 23,
param.Cumsum = 24,
param.CondTake = 25,
param.Argsort = 26,
param.IndexingRemap = 27,
param.MGBSleep = 28,
param.Linspace = 29,
param.LinspaceFull = 30,
param.Eye = 31,
param.UniformRNG = 32,
param.GaussianRNG = 33,
param.Flip = 34,
param.Rotate = 35,
param.ROICopy = 36,
param.CvtColor = 37,
param.WarpAffine = 38,
param.GaussianBlur = 39,
param.Resize = 40,
param.Convolution3D = 41,
param.Conv3DBias = 42,
param.SeparableConv3D = 43,
param.TopK = 44,
param.RelayoutFormat = 45,
param.SeparableFilter = 46,
param.LocalShare = 47,
param.ROIAlign = 48,
param.DeformablePSROIPooling = 49,
param.BatchConvBias = 50,
param.DType = 51,
param.PersistentOutputStorage = 52,
param.OptionalAxis = 53,
param.OptionalAxisV1 = 54,
param.ExecutionPolicy = 55,
param.AssertEqual = 56,
Reserved0 = 57,
param.CollectiveComm = 58,
param.CondExecPred = 59,
param.CondExecPredLogical = 60,
param.CondExecMark = 61,
param.CondExecMerge = 62,
param.Host2DeviceCopy = 63,
param.Dimshuffle = 64,
param.AxisAddRemove = 65,
param.IndexDescMaskDump = 66,
DType = 67,
param.Remap = 68,
param.NMSKeep = 69,
}

table Operator {


+ 2
- 2
src/serialization/impl/serializer_oss.cpp View File

@@ -846,7 +846,7 @@ GraphLoader::LoadResult GraphLoaderOSS::load(const LoadConfig& config,
OprLoadContextImpl ctx{this, m_graph->mgb_version()};
auto result = ctx.load_oprs();

auto fbs_end = tensor_begin + offset_to_fbs + size;
auto fbs_end = tensor_begin + offset_to_fbs + sizeof(size) + size;
auto cur = m_file->tell();
mgb_assert(fbs_end > cur);
// Skip to Graph end
@@ -872,4 +872,4 @@ bool is_fbs_file(InputFile& file) {
} // namespace serialization
} // namespace mgb

#endif
#endif

+ 29
- 1
src/serialization/test/serializer_oss.cpp View File

@@ -64,6 +64,34 @@ TEST(TestSerializer2, GraphDumpLoad) {
load();
}

TEST(TestSerializer2, MultiGraphDumpLoad) {
auto fname = GET_OUTPUT_FILE();

auto dump = [&]() {
auto cn = CompNode::load("cpu0");
auto graph = ComputingGraph::make();
auto x = opr::ImmutableTensor::make(*graph, 1926.0817f, {cn});
x.rename("varz");
auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()),
GraphDumpFormat::FLATBUFFERS);
// dump twice
dumper->dump({x});
dumper->dump({x});
};
auto load = [&]() {
GraphLoader::LoadConfig load_config = {};
auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()),
GraphDumpFormat::FLATBUFFERS);
// load twice
loader->load(load_config, false);
loader = GraphLoader::make(loader->reset_file(), loader->format());
loader->load(load_config, false);
};

dump();
load();
}

TEST(TestSerializer2, APlusB) {
auto fname = GET_OUTPUT_FILE();
TensorShape shape{2, 3};
@@ -733,4 +761,4 @@ TEST(TestSerializer2, HasOutputDtype) {
load();
}

#endif
#endif

+ 10
- 1
src/tensorrt/impl/opr_replace.cpp View File

@@ -1727,8 +1727,17 @@ void TensorRTReplacePass::Impl::TensorRTGraph::mark_varnode_format_nchw4() {
}
}

void mgb::tensorrt::transform_dest_vars_inplace(mgb::cg::VarNodeArray& dest_vars) {
void mgb::tensorrt::transform_dest_vars_inplace(
mgb::cg::VarNodeArray& dest_vars,
cg::GraphCommonOptimizeOptions& options) {
gopt::GraphOptimizer optimizer;
//! In megengine the layout is NCHW, while the tensorrt pass currently
//! only supports NCHW4(int8), so we transform the layout to nchw4 first.
if (options.has_set_nchw4()) {
options.disable_nchw4();
optimizer.add_pass<FuseConvBiasNonlinPass>();
optimizer.add_pass(EnableNCHW4Pass::make_nchw4_converter());
}
optimizer.add_pass<ExpandFusedArithPass>();
optimizer.add_pass<gopt::TensorRTReplacePass>();
optimizer.add_pass<ArithFusePass>();


+ 2
- 1
src/tensorrt/include/megbrain/tensorrt/opr_replace.h View File

@@ -32,7 +32,8 @@ public:

namespace tensorrt {

void transform_dest_vars_inplace(mgb::cg::VarNodeArray& dest_vars);
void transform_dest_vars_inplace(mgb::cg::VarNodeArray& dest_vars,
cg::GraphCommonOptimizeOptions& options);
}

} // namespace mgb


+ 63
- 1
src/tensorrt/test/opr_replace.cpp View File

@@ -1930,7 +1930,7 @@ TEST(TestTensorRTReplace, FuseConvAdd) {
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::Convolution::make(x, w, param);
auto nchw2nchw4 = [](SymbolVar x) {
auto xshp = opr::GetVarShape::make(x);

@@ -1978,6 +1978,68 @@ TEST(TestTensorRTReplace, FuseConvAdd) {
MGB_ASSERT_TENSOR_NEAR(outputs[1], outputs[3], 1e-3);
}

TEST(TestTensorRTReplace, FuseConvAddNchw2nchw4) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{
1.2f, 127 * 127};
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp))
.rename(name),
dtype);
};

auto x = mkvar("x", {32, 4, 28, 28}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
auto z = opr::TypeCvt::make(y, dtype::Float32());

SymbolVar trt_z;
SymbolVar mgb_z;

ComputingGraph::Options opt;
opt.graph_opt_level = 0;
unpack_vector(
gopt::GraphOptimizer{}
.add_pass<gopt::FuseConvBiasNonlinPass>()
.add_pass(gopt::EnableNCHW4Pass::make_nchw4_converter())
.add_pass<gopt::ExpandFusedArithPass>()
.add_pass<gopt::TensorRTReplacePass>()
.add_pass<gopt::ArithFusePass>()
.apply({{z}})
.endpoint_vars(),
trt_z);

opt.graph_opt_level = 0;
unpack_vector(gopt::GraphOptimizer{}.apply({{z}}).endpoint_vars(),
mgb_z);

ComputingGraph::OutputSpec outspec(2);
SmallVector<HostTensorND> outputs(2);
outspec[0] = make_callback_copy(trt_z, outputs[0], false);
outspec[1] = make_callback_copy(mgb_z, outputs[1], false);
graph->options().graph_opt.tensorrt = false;
auto func = graph->compile(outspec);
func->execute();

MGB_ASSERT_TENSOR_NEAR(outputs[0], outputs[1], 1e-3);
}

#endif // MGB_ENABLE_TENSOR_RT

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
