
feat(dnn): use ref ptr interface for all backends

GitOrigin-RevId: f65feae5cc
release-1.7
Megvii Engine Team 3 years ago
parent commit c85631aa77
100 changed files with 946 additions and 768 deletions
  1. +1 -1 CMakeLists.txt
  2. +9 -14 dnn/src/aarch64/relayout/opr_impl.cpp
  3. +3 -1 dnn/src/aarch64/rotate/opr_impl.cpp
  4. +14 -8 dnn/src/aarch64/warp_perspective/warp_perspective_cv.cpp
  5. +4 -4 dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp
  6. +2 -1 dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw_nchw44_algo.cpp
  7. +1 -1 dnn/src/arm_common/cvt_color/opr_impl.cpp
  8. +33 -33 dnn/src/arm_common/elemwise/binary/algo.cpp
  9. +40 -40 dnn/src/arm_common/elemwise/ternary/algo.cpp
  10. +2 -2 dnn/src/arm_common/elemwise/unary/algo.cpp
  11. +20 -18 dnn/src/arm_common/elemwise_multi_type/opr_impl.cpp
  12. +5 -5 dnn/src/arm_common/elemwise_multi_type/opr_impl.h
  13. +88 -80 dnn/src/arm_common/pooling/algo.cpp
  14. +6 -5 dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
  15. +2 -2 dnn/src/arm_common/pooling/opr_impl.cpp
  16. +4 -4 dnn/src/arm_common/pooling/opr_impl.h
  17. +4 -4 dnn/src/arm_common/reduce/opr_impl.cpp
  18. +6 -6 dnn/src/arm_common/resize/direct_nchwxx.cpp
  19. +6 -6 dnn/src/arm_common/resize/upsample2_nchw.cpp
  20. +6 -6 dnn/src/arm_common/resize/upsample2_nchwxx.cpp
  21. +4 -4 dnn/src/arm_common/separable_filter/opr_impl.cpp
  22. +12 -12 dnn/src/arm_common/type_cvt/opr_impl.cpp
  23. +10 -12 dnn/src/arm_common/warp_perspective/warp_perspective_cv.cpp
  24. +3 -3 dnn/src/armv7/relayout/opr_impl.cpp
  25. +3 -1 dnn/src/armv7/rotate/opr_impl.cpp
  26. +1 -1 dnn/src/atlas/checksum/opr_impl.cpp
  27. +2 -2 dnn/src/cambricon/checksum/opr_impl.cpp
  28. +3 -4 dnn/src/common/concat_split.cpp
  29. +6 -10 dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp
  30. +6 -6 dnn/src/common/elemwise_multi_type/opr_impl_helper.h
  31. +6 -6 dnn/src/common/local/local_def.inl
  32. +58 -96 dnn/src/common/reduce_helper.h
  33. +222 -0 dnn/src/common/reduce_helper_device.h
  34. +7 -2 dnn/src/common/utils.h
  35. +1 -1 dnn/src/cuda/argmxx/opr_impl.cpp
  36. +38 -36 dnn/src/cuda/batch_normalization/opr_impl.cpp
  37. +3 -3 dnn/src/cuda/batched_matrix_mul/brute_force.cpp
  38. +3 -3 dnn/src/cuda/batched_matrix_mul/cublas.cpp
  39. +8 -8 dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp
  40. +4 -2 dnn/src/cuda/check_non_finite/kern.cu
  41. +2 -2 dnn/src/cuda/check_non_finite/opr_impl.cpp
  42. +2 -2 dnn/src/cuda/checksum/opr_impl.cpp
  43. +5 -5 dnn/src/cuda/conv_bias/batched_matmul.cpp
  44. +5 -5 dnn/src/cuda/conv_bias/chanwise.cpp
  45. +2 -2 dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp
  46. +5 -5 dnn/src/cuda/conv_bias/chanwise_small.cpp
  47. +5 -5 dnn/src/cuda/conv_bias/cudnn_conv.cpp
  48. +10 -10 dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
  49. +11 -10 dnn/src/cuda/conv_bias/group_conv.cpp
  50. +3 -3 dnn/src/cuda/conv_bias/helper.cpp
  51. +1 -1 dnn/src/cuda/conv_bias/helper.h
  52. +3 -3 dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
  53. +3 -3 dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp
  54. +4 -4 dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
  55. +4 -4 dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp
  56. +3 -6 dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp
  57. +3 -6 dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp
  58. +9 -9 dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp
  59. +11 -17 dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
  60. +4 -20 dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp
  61. +7 -7 dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp
  62. +5 -5 dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
  63. +5 -5 dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp
  64. +2 -2 dnn/src/cuda/conv_bias/inplace_matmul.cpp
  65. +4 -3 dnn/src/cuda/conv_bias/matmul.cpp
  66. +13 -13 dnn/src/cuda/conv_bias/matmul_8x8x32.cpp
  67. +8 -8 dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp
  68. +3 -3 dnn/src/cuda/convolution/backward_data/chanwise.cpp
  69. +3 -3 dnn/src/cuda/convolution/backward_data/chanwise_small.cpp
  70. +4 -3 dnn/src/cuda/convolution/backward_data/cudnn.cpp
  71. +6 -6 dnn/src/cuda/convolution/backward_data/group_conv.cpp
  72. +4 -3 dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp
  73. +1 -1 dnn/src/cuda/convolution/backward_data/matmul.cpp
  74. +3 -3 dnn/src/cuda/convolution/backward_filter/chanwise.cpp
  75. +2 -2 dnn/src/cuda/convolution/backward_filter/cudnn.cpp
  76. +6 -6 dnn/src/cuda/convolution/backward_filter/group_conv.cpp
  77. +4 -4 dnn/src/cuda/convolution/backward_filter/matmul.cpp
  78. +3 -3 dnn/src/cuda/convolution/helper.cpp
  79. +1 -1 dnn/src/cuda/convolution/helper.h
  80. +4 -3 dnn/src/cuda/convolution3d/backward_data/cudnn.cpp
  81. +6 -6 dnn/src/cuda/convolution3d/backward_data/group_conv.cpp
  82. +2 -2 dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp
  83. +6 -6 dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp
  84. +5 -5 dnn/src/cuda/convolution3d/forward/1x1x1.cpp
  85. +4 -3 dnn/src/cuda/convolution3d/forward/cudnn.cpp
  86. +6 -6 dnn/src/cuda/convolution3d/forward/group_conv.cpp
  87. +3 -3 dnn/src/cuda/convolution3d/helper.cpp
  88. +1 -1 dnn/src/cuda/convolution3d/helper.h
  89. +4 -4 dnn/src/cuda/convpooling/opr_impl.cpp
  90. +1 -1 dnn/src/cuda/cumsum/opr_impl.cpp
  91. +1 -1 dnn/src/cuda/dct/opr_impl.cpp
  92. +1 -1 dnn/src/cuda/elemwise_helper.cpp
  93. +38 -38 dnn/src/cuda/elemwise_multi_type/opr_impl.cpp
  94. +6 -6 dnn/src/cuda/elemwise_multi_type/opr_impl.h
  95. +12 -15 dnn/src/cuda/group_local/forward/opr_impl.cpp
  96. +2 -2 dnn/src/cuda/local_share/backward_data/batched_matmul.cpp
  97. +2 -2 dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp
  98. +2 -2 dnn/src/cuda/local_share/forward/batched_matmul.cpp
  99. +3 -3 dnn/src/cuda/lrn/opr_impl.cpp
  100. +2 -2 dnn/src/cuda/matrix_inverse/opr_impl.cpp
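
The common thread through all of these files is that `TensorND::raw_ptr` changes from a plain `void*` data member to an accessor, `raw_ptr()`, and kernel parameters now carry a `RefPtr` handle (obtained via `get_ref_ptr()` and read via `get_ptr()`) instead of a cached raw address. The real `RefPtr` lives in MegDNN headers that are not part of this diff, so the sketch below is an assumption made only to illustrate the shape of the interface:

```cpp
#include <memory>

// Hypothetical sketch of a ref-ptr handle: a shared, rebindable slot for the
// real address, dereferenced only when a kernel actually needs the pointer.
class RefPtrSketch {
    std::shared_ptr<void*> m_ref;

public:
    RefPtrSketch() : m_ref(std::make_shared<void*>(nullptr)) {}
    explicit RefPtrSketch(void* ptr) : m_ref(std::make_shared<void*>(ptr)) {}

    void* get_ptr() const { return *m_ref; }  // resolved at call time
    void reset(void* ptr) { *m_ref = ptr; }   // later rebinds are visible to all copies
};
```

Under this model, copying the handle into an operator parameter (as `get_ref_ptr()` does in the hunks below) keeps the parameter valid even if the underlying storage is rebound before the kernel runs, whereas a cached `void*` would go stale.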

CMakeLists.txt  (+1 -1)

@@ -588,7 +588,7 @@ if(MGE_WITH_CUDA)
set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os")
if(MSVC OR WIN32)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all")
set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14")
set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14 /bigobj")
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd")
endif()


dnn/src/aarch64/relayout/opr_impl.cpp  (+9 -14)

@@ -365,27 +365,22 @@ void aarch64::RelayoutForwardImpl::exec(
relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param, true);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
auto sptr = static_cast<TransposeByte*>(src.raw_ptr),
dptr = static_cast<TransposeByte*>(dst.raw_ptr);
MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<TransposeByte>(
trans_param.batch, trans_param.m, trans_param.n, sptr, dptr,
trans_param.stride_m));
trans_param.batch, trans_param.m, trans_param.n,
static_cast<TransposeByte*>(src.raw_ptr()),
static_cast<TransposeByte*>(dst.raw_ptr()), trans_param.stride_m));
return;
} else if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 2) {
auto sptr = static_cast<Transpose2Byte*>(src.raw_ptr),
dptr = static_cast<Transpose2Byte*>(dst.raw_ptr);

MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<Transpose2Byte>(
trans_param.batch, trans_param.m, trans_param.n, sptr, dptr,
trans_param.stride_m));
trans_param.batch, trans_param.m, trans_param.n,
static_cast<Transpose2Byte*>(src.raw_ptr()),
static_cast<Transpose2Byte*>(dst.raw_ptr()), trans_param.stride_m));
return;
} else if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 4) {
auto sptr = static_cast<Transpose4Byte*>(src.raw_ptr),
dptr = static_cast<Transpose4Byte*>(dst.raw_ptr);

MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<Transpose4Byte>(
trans_param.batch, trans_param.m, trans_param.n, sptr, dptr,
trans_param.stride_m));
trans_param.batch, trans_param.m, trans_param.n,
static_cast<Transpose4Byte*>(src.raw_ptr()),
static_cast<Transpose4Byte*>(dst.raw_ptr()), trans_param.stride_m));
return;
}
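
In hunks like this one the casts move from locals computed before `MEGDNN_DISPATCH_CPU_KERN_OPR` into the dispatched expression itself. Assuming the macro wraps its argument in a task that may run later on the handle's dispatcher (the usual role of the CPU dispatch macros), the difference is when the address is read; a rough, simplified sketch:

```cpp
#include <functional>
#include <vector>

// Simplified stand-ins: Tensor::raw_ptr() mimics the new accessor and
// dispatch() mimics MEGDNN_DISPATCH_CPU_KERN_OPR queuing a deferred task.
struct Tensor {
    void* storage = nullptr;
    void* raw_ptr() const { return storage; }
};

std::vector<std::function<void()>> task_queue;
inline void dispatch(std::function<void()> kern) { task_queue.push_back(std::move(kern)); }

inline void old_style(const Tensor& src) {
    auto* sptr = static_cast<char*>(src.raw_ptr());  // address frozen here
    dispatch([sptr] { (void)sptr; /* kernel body */ });
}

inline void new_style(Tensor src) {
    dispatch([src] {                                     // tensor handle captured by value
        auto* sptr = static_cast<char*>(src.raw_ptr());  // address read when the task runs
        (void)sptr; /* kernel body */
    });
}
```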



dnn/src/aarch64/rotate/opr_impl.cpp  (+3 -1)

@@ -358,11 +358,13 @@ void RotateImpl::exec(
return fallback::RotateImpl::exec(src, dst, workspace);
}

auto clockwise = param().clockwise;

MEGDNN_DISPATCH_CPU_KERN_OPR({
for (size_t i = 0; i < src.layout.shape[0]; ++i) {
Mat<uchar> src_mat = TensorND2Mat<uchar>(src, i);
Mat<uchar> dst_mat = TensorND2Mat<uchar>(dst, i);
rotate(src_mat, dst_mat, param().clockwise);
rotate(src_mat, dst_mat, clockwise);
}
});
}
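
Here the only functional change is that `param().clockwise` is read into a local before the dispatch block and the local is used inside it. Assuming `MEGDNN_DISPATCH_CPU_KERN_OPR` turns the block into a deferred task, calling `param()` inside it would implicitly capture the operator (`this`) and read its state at kernel time; copying the scalar out keeps the task self-contained. A small illustration with placeholder names:

```cpp
// Placeholder types; dispatch() stands in for MEGDNN_DISPATCH_CPU_KERN_OPR.
struct Param {
    bool clockwise = true;
};

struct RotateOprSketch {
    Param m_param;
    const Param& param() const { return m_param; }

    template <typename Fn>
    void dispatch(Fn&& fn) { fn(); }  // real code defers fn to the dispatcher

    void exec_old() {
        dispatch([&] { bool cw = param().clockwise; (void)cw; });  // captures this
    }
    void exec_new() {
        auto clockwise = param().clockwise;          // read once, before dispatch
        dispatch([clockwise] { (void)clockwise; });  // task owns its inputs
    }
};
```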


dnn/src/aarch64/warp_perspective/warp_perspective_cv.cpp  (+14 -8)

@@ -205,16 +205,16 @@ void megdnn::aarch64::warp_perspective_cv_exec(
megdnn_assert(
ch == 1 || ch == 3 || ch == 2,
"unsupported src channel: %zu, avaiable channel size: 1/2/3", ch);
const float* trans_ptr = trans.ptr<dt_float32>();
const int* midx_ptr = nullptr;
if (mat_idx.raw_ptr) {
megdnn_assert(mat_idx.layout.ndim == 1);
midx_ptr = mat_idx.ptr<int>();
}
if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
#define cb(_imode, _bmode, _ch) \
auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \
auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \
size_t index, size_t) { \
const float* trans_ptr = trans.ptr<dt_float32>(); \
const int* midx_ptr = nullptr; \
if (mat_idx.raw_ptr()) { \
megdnn_assert(mat_idx.layout.ndim == 1); \
midx_ptr = mat_idx.ptr<int>(); \
} \
size_t batch_id = index / parallelism_batch; \
size_t task_id = index % parallelism_batch; \
size_t src_id = batch_id; \
@@ -240,8 +240,14 @@ void megdnn::aarch64::warp_perspective_cv_exec(
#undef cb
} else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) {
#define cb(_imode, _bmode, _ch) \
auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \
auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \
size_t index, size_t) { \
const float* trans_ptr = trans.ptr<dt_float32>(); \
const int* midx_ptr = nullptr; \
if (mat_idx.raw_ptr()) { \
megdnn_assert(mat_idx.layout.ndim == 1); \
midx_ptr = mat_idx.ptr<int>(); \
} \
size_t batch_id = index / parallelism_batch; \
size_t task_id = index % parallelism_batch; \
size_t src_id = batch_id; \


dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp  (+4 -4)

@@ -531,10 +531,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Stride2Filter2::
megdnn_arm_common_conv_bias_int8816_kimpl,
midout_iv("AlgoI8x8x16Stride2Filter2::dispatch_kerns"_hash)) {
auto ncb_param = param;
ncb_param.src_ptr = param.src<void>(0, ncb_index.ndrange_id[0]);
ncb_param.dst_ptr = param.dst<void>(0, ncb_index.ndrange_id[0]);
ncb_param.filter_ptr = param.filter<void>(ncb_index.ndrange_id[0]);
ncb_param.bias_ptr = param.bias<void>(0, ncb_index.ndrange_id[0]);
ncb_param.src_ptr += param.src_offset(0, ncb_index.ndrange_id[0]);
ncb_param.dst_ptr += param.dst_offset(0, ncb_index.ndrange_id[0]);
ncb_param.filter_ptr += param.filter_offset(ncb_index.ndrange_id[0]);
ncb_param.bias_ptr += param.bias_offset(0, ncb_index.ndrange_id[0]);
conv_bias::conv_int8x8x16_stride2_flt2(ncb_param);
}
MIDOUT_END();
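
Instead of overwriting the per-thread pointers with freshly computed raw addresses (`param.src<void>(...)` and friends), the copied `ncb_param` now keeps the shared ref-ptr handles and advances them by byte offsets (`param.src_offset(...)` etc.). A sketch of that pattern follows; `RefPtrLike` and the offset helpers are illustrative assumptions, not MegDNN declarations:

```cpp
#include <cstddef>

struct RefPtrLike {
    void* const* slot = nullptr;  // shared location of the base address
    size_t offset = 0;            // per-consumer byte offset

    RefPtrLike& operator+=(size_t extra) {
        offset += extra;
        return *this;
    }
    void* get_ptr() const { return static_cast<char*>(*slot) + offset; }
};

struct NCBParamSketch {
    RefPtrLike src_ptr, dst_ptr;
};

// One parameter copy per worker: each copy carries its own offset but still
// refers to the same rebindable base address.
inline NCBParamSketch make_thread_param(
        const NCBParamSketch& param, size_t src_off, size_t dst_off) {
    NCBParamSketch ncb_param = param;
    ncb_param.src_ptr += src_off;  // mirrors param.src_offset(...)
    ncb_param.dst_ptr += dst_off;  // mirrors param.dst_offset(...)
    return ncb_param;
}
```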


dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw_nchw44_algo.cpp  (+2 -1)

@@ -133,7 +133,8 @@ static void pack_weight(
constexpr int pack_oc = 8;
if (kern_param.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS && oc % pack_oc != 0) {
auto packed_bias = reinterpret_cast<int16_t*>(bundle.get(2));
memcpy(packed_bias, kern_param.bias_ptr, round_up(oc, 8) * sizeof(int16_t));
memcpy(packed_bias, kern_param.bias_ptr.get_ptr(),
round_up(oc, 8) * sizeof(int16_t));
}
}



dnn/src/arm_common/cvt_color/opr_impl.cpp  (+1 -1)

@@ -1657,4 +1657,4 @@ void CvtColorImpl::exec(
} // namespace arm_common
} // namespace megdnn

// vim: syntax=cpp.doxygen
// vim: syntax=cpp.doxygen

dnn/src/arm_common/elemwise/binary/algo.cpp  (+33 -33)

@@ -220,9 +220,9 @@ void ElemwiseImpl::AlgoBinaryVecVec::exec(const KernParam& kern_param) const {
run = OpCallerBinary<_op<_type, _type>, BcastType::VEC_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, \
src0.layout.total_nr_elems())); \
} \
@@ -254,9 +254,9 @@ void ElemwiseImpl::AlgoBinaryVecScalar::exec(const KernParam& kern_param) const
_op<_type, _type>, BcastType::VEC_SCALAR>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr)[0], \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr())[0], \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, \
src0.layout.total_nr_elems())); \
} \
@@ -280,9 +280,9 @@ void ElemwiseImpl::AlgoBinaryVecScalar::exec(const KernParam& kern_param) const
_op<_type, _type>, BcastType::SCALAR_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr)[0], \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr())[0], \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, \
src1.layout.total_nr_elems())); \
} \
@@ -318,9 +318,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101::exec(const KernParam& kern_param) cons
_op<_type, _type>, BcastType::VEC_BCAST101>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \
binfo.z)); \
} \
@@ -347,9 +347,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101::exec(const KernParam& kern_param) cons
_op<_type, _type>, BcastType::BCAST101_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \
binfo.z)); \
} \
@@ -384,9 +384,9 @@ void ElemwiseImpl::AlgoBinaryVecBcastX0X::exec(const KernParam& kern_param) cons
_op<_type, _type>, BcastType::VEC_BCASTX0X>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \
binfo.z)); \
} \
@@ -413,9 +413,9 @@ void ElemwiseImpl::AlgoBinaryVecBcastX0X::exec(const KernParam& kern_param) cons
_op<_type, _type>, BcastType::BCASTX0X_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \
binfo.z)); \
} \
@@ -450,9 +450,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast111C::exec(const KernParam& kern_param) con
_op<_type, _type>, BcastType::VEC_BCAST111C>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \
binfo.z)); \
} \
@@ -479,9 +479,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast111C::exec(const KernParam& kern_param) con
_op<_type, _type>, BcastType::BCAST111C_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \
binfo.z)); \
} \
@@ -519,9 +519,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101xX::exec(const KernParam& kern_param) co
_op<_type, _type>, BcastType::VEC_BCAST101xX>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, batch_size, binfo.x, \
binfo.y, binfo.z)); \
} \
@@ -551,9 +551,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101xX::exec(const KernParam& kern_param) co
_op<_type, _type>, BcastType::BCAST101xX_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, dst.layout.dtype, batch_size, binfo.x, \
binfo.y, binfo.z)); \
} \


dnn/src/arm_common/elemwise/ternary/algo.cpp  (+40 -40)

@@ -79,10 +79,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecVecVec::exec(const KernParam& kern_param) c
_op<_type, _type>, BcastType::VEC_VEC_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
src0.layout.total_nr_elems())); \
} \
@@ -113,10 +113,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecVecScalar::exec(
_op<_type, _type>, BcastType::VEC_VEC_SCALAR>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr)[0], \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr())[0], \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
src0.layout.total_nr_elems())); \
} \
@@ -149,10 +149,10 @@ void ElemwiseImpl::AlgoTernaryFma3Bcast101VecBcast101::exec(
_op<_type, _type>, BcastType::BCAST101_VEC_BCAST101>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
binfo.x, binfo.y, binfo.z)); \
} \
@@ -187,11 +187,11 @@ void ElemwiseImpl::AlgoTernaryFma3Bcast111CVecBcast111C::exec(
BcastType::BCAST111C_VEC_BCAST111C>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
is_vector(src1.layout) ? 0 : src1.layout.stride[0] - binfo.z, \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
binfo.x, binfo.y, binfo.z)); \
} \
@@ -228,10 +228,10 @@ void ElemwiseImpl::AlgoTernaryFma3Bcast101xXVecBcast101xX::exec(
BcastType::BCAST101xX_VEC_BCAST101xX>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
batch_size, binfo.x, binfo.y, binfo.z)); \
} \
@@ -268,10 +268,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecBcast101xXVec::exec(
_op<_type, _type>, BcastType::VEC_BCAST101xX_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
batch_size, binfo.x, binfo.y, binfo.z)); \
} \
@@ -306,10 +306,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecBcast101Vec::exec(
_op<_type, _type>, BcastType::VEC_BCAST101_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
binfo.x, binfo.y, binfo.z)); \
} \
@@ -343,12 +343,12 @@ void ElemwiseImpl::AlgoTernaryFma3VecBcast111CVec::exec(
_op<_type, _type>, BcastType::VEC_BCAST111C_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
run(static_cast<const _type*>(src0.raw_ptr()), \
is_vector(src0.layout) ? 0 : src0.layout.stride[0] - binfo.z, \
static_cast<const _type*>(src1.raw_ptr), \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr()), \
static_cast<const _type*>(src2.raw_ptr()), \
is_vector(src2.layout) ? 0 : src2.layout.stride[0] - binfo.z, \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
binfo.x, binfo.y, binfo.z)); \
} \
@@ -380,10 +380,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecScalarVec::exec(
_op<_type, _type>, BcastType::VEC_SCALAR_VEC>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr)[0], \
static_cast<const _type*>(src2.raw_ptr), \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr())[0], \
static_cast<const _type*>(src2.raw_ptr()), \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
src0.layout.total_nr_elems())); \
} \
@@ -414,10 +414,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecScalarScalar::exec(
_op<_type, _type>, BcastType::VEC_SCALAR_SCALAR>::run; \
MEGDNN_DISPATCH_CPU_KERN( \
static_cast<naive::HandleImpl*>(kern_param.handle), \
run(static_cast<const _type*>(src0.raw_ptr), \
static_cast<const _type*>(src1.raw_ptr)[0], \
static_cast<const _type*>(src2.raw_ptr)[0], \
static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \
run(static_cast<const _type*>(src0.raw_ptr()), \
static_cast<const _type*>(src1.raw_ptr())[0], \
static_cast<const _type*>(src2.raw_ptr())[0], \
static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \
src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \
src0.layout.total_nr_elems())); \
} \


dnn/src/arm_common/elemwise/unary/algo.cpp  (+2 -2)

@@ -76,8 +76,8 @@ void ElemwiseImpl::AlgoUnary::exec(const KernParam& kern_param) const {
size_t offset = task_id * nr_elems_per_thread; \
size_t nr_elems_thread = \
std::min(nr_elems - offset, nr_elems_per_thread); \
run(static_cast<const _type*>(src0.raw_ptr) + offset, \
static_cast<_type*>(dst_tensor.raw_ptr) + offset, \
run(static_cast<const _type*>(src0.raw_ptr()) + offset, \
static_cast<_type*>(dst_tensor.raw_ptr()) + offset, \
src0.layout.dtype, dst_tensor.layout.dtype, nr_elems_thread); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \


dnn/src/arm_common/elemwise_multi_type/opr_impl.cpp  (+20 -18)

@@ -148,17 +148,17 @@ void ElemwiseMultiTypeImpl::neon_round_shr_saturate_bcast_scalar<int32_t>(

template <typename ctype>
void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xi8_bcast_scalar(
const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst) {
auto a_ptr = param[0].ptr<ctype>();
const ElemwiseOpParamN<2>& param, const TensorND& dst) {
auto k = param[1].ptr<dt_int8>()[0];
size_t size = param.size;
auto src = param[0];

MEGDNN_DISPATCH_CPU_KERN_OPR(
neon_round_shr_saturate_bcast_scalar(a_ptr, k, size, dst));
MEGDNN_DISPATCH_CPU_KERN_OPR(neon_round_shr_saturate_bcast_scalar(
src.ptr<ctype>(), k, size, static_cast<dt_int8*>(dst.raw_ptr())));
}

void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8(
const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst) {
const ElemwiseOpParamN<2>& param, const TensorND& dst) {
if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) {
switch (param[0].layout.dtype.enumv()) {
#define cb(t) \
@@ -282,7 +282,7 @@ void neon_fuse_add_rmulh_round_shr_saturate_bcast_1c11_int32(
}

bool ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_rshr(
const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) {
const ElemwiseOpParamN<6>& param, const TensorND& dst) {
BroadcastChannelInfo binfo;
if (is_vector(param[0].layout) &&
is_broadcasted_channel_like(param[1].layout, binfo) &&
@@ -294,16 +294,18 @@ bool ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_rshr(
auto minv = param[4].ptr<dt_int8>()[0];
auto maxv = param[5].ptr<dt_int8>()[0];
switch (param[0].layout.dtype.enumv()) {
#define DISPATCH(stype, suffix) \
case DTypeTrait<stype>::enumv: { \
auto x_ptr = param[0].ptr<DTypeTrait<stype>::ctype>(); \
auto b_ptr = param[1].ptr<DTypeTrait<stype>::ctype>(); \
auto M = param[2].ptr<DTypeTrait<stype>::ctype>()[0]; \
MEGDNN_DISPATCH_CPU_KERN_OPR( \
neon_fuse_add_rmulh_round_shr_saturate_bcast_1c11_##suffix( \
binfo.x, binfo.y, binfo.z, x_ptr, b_ptr, M, offset, minv, \
maxv, param.size, dst)); \
break; \
#define DISPATCH(stype, suffix) \
case DTypeTrait<stype>::enumv: { \
auto M = param[2].ptr<DTypeTrait<stype>::ctype>()[0]; \
auto src0 = param[0]; \
auto src1 = param[1]; \
MEGDNN_DISPATCH_CPU_KERN_OPR( \
neon_fuse_add_rmulh_round_shr_saturate_bcast_1c11_##suffix( \
binfo.x, binfo.y, binfo.z, \
src0.ptr<DTypeTrait<stype>::ctype>(), \
src1.ptr<DTypeTrait<stype>::ctype>(), M, offset, minv, maxv, \
param.size, static_cast<dt_int8*>(dst.raw_ptr()))); \
break; \
}
DISPATCH(dtype::Int16, int16)
DISPATCH(dtype::Int32, int32)
@@ -317,7 +319,7 @@ bool ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_rshr(
}

void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) {
const ElemwiseOpParamN<6>& param, const TensorND& dst) {
if (dispatch_fuse_add_rmulh_rshr(param, dst))
return;
fallback::ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
@@ -325,7 +327,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
}

void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) {
const ElemwiseOpParamN<6>& param, const TensorND& dst) {
if (dispatch_fuse_add_rmulh_rshr(param, dst))
return;
fallback::ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(


dnn/src/arm_common/elemwise_multi_type/opr_impl.h  (+5 -5)

@@ -23,18 +23,18 @@ class ElemwiseMultiTypeImpl : public fallback::ElemwiseMultiTypeImpl {

template <typename ctype>
void dispatch_round_shr_saturate_iXxi8xi8_bcast_scalar(
const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst);
const ElemwiseOpParamN<2>& param, const TensorND& dst);

bool dispatch_fuse_add_rmulh_rshr(
const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst);
const ElemwiseOpParamN<6>& param, const TensorND& dst);

protected:
void on_round_shr_saturate_iXxi8xi8(
const ElemwiseOpParamN<2>& param, dt_int8* dst) override;
const ElemwiseOpParamN<2>& param, const TensorND& dst) override;
void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) override;
const ElemwiseOpParamN<6>& param, const TensorND& dst) override;
void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) override;
const ElemwiseOpParamN<6>& param, const TensorND& dst) override;

void on_quantized_mode(
const ElemwiseOpParamN<1>& param, const TensorND& dst,
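
The protected hooks switch from taking an already-resolved `dt_int8* dst` to taking `const TensorND& dst`, so each implementation decides when to call `dst.raw_ptr()` (typically inside whatever kernel it dispatches). A minimal before/after sketch with a stubbed tensor type:

```cpp
#include <cstdint>

// Stub standing in for TensorND; only the new-style accessor is shown.
struct TensorNDStub {
    void* storage = nullptr;
    void* raw_ptr() const { return storage; }
};

// Before: the caller resolves the output pointer up front.
inline void on_round_shr_saturate_old(std::int8_t* dst) {
    dst[0] = 0;
}

// After: the callee holds the tensor and resolves the address itself,
// so the resolution can happen as late as kernel execution.
inline void on_round_shr_saturate_new(const TensorNDStub& dst) {
    auto* out = static_cast<std::int8_t*>(dst.raw_ptr());
    out[0] = 0;
}
```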


dnn/src/arm_common/pooling/algo.cpp  (+88 -80)

@@ -117,27 +117,27 @@ void PoolingImpl::AlgoFilterxModexStride1::exec(const PoolingKernParam& param) c
auto PW = param.padding[1];
auto FH = param.filter[0];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
#define DISPATCH_FUNC(Pooler, NeonPooler, window, midout_type_id) \
MIDOUT_BEGIN( \
megdnn_arm_common_pooling, midout_iv(0), midout_iv(midout_type_id), \
Pooler::MIDOUT_CASE_NUM, NeonPooler::MIDOUT_CASE_NUM, window) { \
auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \
src_dtype = param.src_type](size_t index, size_t) { \
size_t n = index / C; \
size_t c = index % C; \
do_pooling_compact<Pooler MEGDNN_COMMA NeonPooler MEGDNN_COMMA window>( \
static_cast<const typename Pooler::ctype*>(src_ptr) + \
n * C * IH * IW + c * IH * IW, \
static_cast<typename Pooler::ctype*>(dst_ptr) + n * C * OH * OW + \
c * OH * OW, \
src_dtype, IH, IW, OH, OW, PH, PW); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
} \
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;
#define DISPATCH_FUNC(Pooler, NeonPooler, window, midout_type_id) \
MIDOUT_BEGIN( \
megdnn_arm_common_pooling, midout_iv(0), midout_iv(midout_type_id), \
Pooler::MIDOUT_CASE_NUM, NeonPooler::MIDOUT_CASE_NUM, window) { \
auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \
src_dtype = param.src_type](size_t index, size_t) { \
size_t n = index / C; \
size_t c = index % C; \
do_pooling_compact<Pooler MEGDNN_COMMA NeonPooler MEGDNN_COMMA window>( \
static_cast<const typename Pooler::ctype*>(src_ptr.get_ptr()) + \
n * C * IH * IW + c * IH * IW, \
static_cast<typename Pooler::ctype*>(dst_ptr.get_ptr()) + \
n * C * OH * OW + c * OH * OW, \
src_dtype, IH, IW, OH, OW, PH, PW); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
} \
MIDOUT_END()

#define DISPATCH_WINDOW(Pooler, NeonPooler, dtype, ctype, comp_type, midout_type_id) \
@@ -213,26 +213,26 @@ void PoolingImpl::AlgoFilter2ModexStride2::exec(const PoolingKernParam& param) c
auto PH = param.padding[0];
auto PW = param.padding[1];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
#define DISPATCH_FUNC(Pooler, mode, midout_type_id) \
MIDOUT_BEGIN( \
megdnn_arm_common_pooling, midout_iv(1), midout_iv(midout_type_id), \
Pooler::MIDOUT_CASE_NUM) { \
auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \
src_dtype = param.src_type](size_t index, size_t) { \
size_t n = index / C; \
size_t c = index % C; \
do_pooling_2x2<Pooler MEGDNN_COMMA mode>( \
static_cast<const typename Pooler::ctype*>(src_ptr) + \
n * C * IH * IW + c * IH * IW, \
static_cast<typename Pooler::ctype*>(dst_ptr) + n * C * OH * OW + \
c * OH * OW, \
src_dtype, IH, IW, OH, OW, PH, PW); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
} \
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;
#define DISPATCH_FUNC(Pooler, mode, midout_type_id) \
MIDOUT_BEGIN( \
megdnn_arm_common_pooling, midout_iv(1), midout_iv(midout_type_id), \
Pooler::MIDOUT_CASE_NUM) { \
auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \
src_dtype = param.src_type](size_t index, size_t) { \
size_t n = index / C; \
size_t c = index % C; \
do_pooling_2x2<Pooler MEGDNN_COMMA mode>( \
static_cast<const typename Pooler::ctype*>(src_ptr.get_ptr()) + \
n * C * IH * IW + c * IH * IW, \
static_cast<typename Pooler::ctype*>(dst_ptr.get_ptr()) + \
n * C * OH * OW + c * OH * OW, \
src_dtype, IH, IW, OH, OW, PH, PW); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
} \
MIDOUT_END()

#define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id) \
@@ -286,8 +286,8 @@ void PoolingImpl::AlgoFilter3MaxStride2::exec(const PoolingKernParam& param) con
auto PH = param.padding[0];
auto PW = param.padding[1];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, midout_type_id) \
MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(2), midout_iv(midout_type_id)) { \
@@ -300,9 +300,11 @@ void PoolingImpl::AlgoFilter3MaxStride2::exec(const PoolingKernParam& param) con
size_t n = index / C; \
size_t c = index % C; \
do_max_pooling_3x3_s2x2_##func##_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW + c * IH * IW, \
static_cast<type*>(dst_ptr) + n * C * OH * OW + c * OH * OW, IH, \
IW, OH, OW, PH, PW, ws); \
static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
c * IH * IW, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \
c * OH * OW, \
IH, IW, OH, OW, PH, PW, ws); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
@@ -339,8 +341,8 @@ void PoolingImpl::AlgoFilter3AverageStride2::exec(const PoolingKernParam& param)
auto PH = param.padding[0];
auto PW = param.padding[1];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, MEGDNN_SIMD_WIDTH, midout_type_id) \
MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(3), midout_iv(midout_type_id)) { \
@@ -353,9 +355,11 @@ void PoolingImpl::AlgoFilter3AverageStride2::exec(const PoolingKernParam& param)
size_t n = index / C; \
size_t c = index % C; \
do_average_pooling_3x3_s2x2_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW + c * IH * IW, \
static_cast<type*>(dst_ptr) + n * C * OH * OW + c * OH * OW, IH, \
IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \
static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
c * IH * IW, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \
c * OH * OW, \
IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
@@ -392,8 +396,8 @@ void PoolingImpl::AlgoFilter4MaxStride2::exec(const PoolingKernParam& param) con
auto PH = param.padding[0];
auto PW = param.padding[1];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, midout_type_id) \
MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(4), midout_iv(midout_type_id)) { \
@@ -402,8 +406,10 @@ void PoolingImpl::AlgoFilter4MaxStride2::exec(const PoolingKernParam& param) con
size_t n = index / C; \
size_t c = index % C; \
do_max_pooling_w4x4_s2x2_##func##_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW + c * IH * IW, \
static_cast<type*>(dst_ptr) + n * C * OH * OW + c * OH * OW, \
static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
c * IH * IW, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \
c * OH * OW, \
src_dtype, IH, IW, OH, OW, PH, PW); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
@@ -446,8 +452,8 @@ void PoolingImpl::AlgoFilter5MaxStride2::exec(const PoolingKernParam& param) con
auto PH = param.padding[0];
auto PW = param.padding[1];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(dtype, type, midout_type_id, MEGDNN_SIMD_WIDTH) \
MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(5), midout_iv(midout_type_id)) { \
@@ -460,9 +466,11 @@ void PoolingImpl::AlgoFilter5MaxStride2::exec(const PoolingKernParam& param) con
size_t n = index / C; \
size_t c = index % C; \
do_max_pooling_w5x5_s2x2_NEON<dtype>( \
static_cast<const type*>(src_ptr) + n * C * IH * IW + c * IH * IW, \
static_cast<type*>(dst_ptr) + n * C * OH * OW + c * OH * OW, IH, \
IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \
static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \
c * IH * IW, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \
c * OH * OW, \
IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \
@@ -593,8 +601,8 @@ void PoolingImpl::AlgoFilter3ModexStridexNCHW44::exec(
auto PW = param.padding[1];
auto SW = param.stride[0];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, i, mode) \
MIDOUT_BEGIN( \
@@ -608,9 +616,9 @@ void PoolingImpl::AlgoFilter3ModexStridexNCHW44::exec(
size_t n = index / C; \
size_t c = index % C; \
do_##mode##_pooling_3x3_stride##i##_##func##_nchw44_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW * 4 + \
c * IH * IW * 4, \
static_cast<type*>(dst_ptr) + n * C * OH * OW * 4 + \
static_cast<const type*>(src_ptr.get_ptr()) + \
n * C * IH * IW * 4 + c * IH * IW * 4, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \
c * OH * OW * 4, \
IH, IW, OH, OW, PH, PW, ws); \
}; \
@@ -685,8 +693,8 @@ void PoolingImpl::AlgoFilter2ModexStridexNCHW44::exec(
auto PW = param.padding[1];
auto SW = param.stride[0];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, i, mode) \
MIDOUT_BEGIN( \
@@ -700,9 +708,9 @@ void PoolingImpl::AlgoFilter2ModexStridexNCHW44::exec(
size_t n = index / C; \
size_t c = index % C; \
do_##mode##_pooling_2x2_stride##i##_##func##_nchw44_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW * 4 + \
c * IH * IW * 4, \
static_cast<type*>(dst_ptr) + n * C * OH * OW * 4 + \
static_cast<const type*>(src_ptr.get_ptr()) + \
n * C * IH * IW * 4 + c * IH * IW * 4, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \
c * OH * OW * 4, \
IH, IW, OH, OW, PH, PW, ws); \
}; \
@@ -778,8 +786,8 @@ void PoolingImpl::AlgoFilter4ModexStridexNCHW44::exec(
auto PW = param.padding[1];
auto SW = param.stride[0];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, i, mode) \
MIDOUT_BEGIN( \
@@ -793,9 +801,9 @@ void PoolingImpl::AlgoFilter4ModexStridexNCHW44::exec(
size_t n = index / C; \
size_t c = index % C; \
do_##mode##_pooling_4x4_stride##i##_##func##_nchw44_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW * 4 + \
c * IH * IW * 4, \
static_cast<type*>(dst_ptr) + n * C * OH * OW * 4 + \
static_cast<const type*>(src_ptr.get_ptr()) + \
n * C * IH * IW * 4 + c * IH * IW * 4, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \
c * OH * OW * 4, \
IH, IW, OH, OW, PH, PW, ws); \
}; \
@@ -870,8 +878,8 @@ void PoolingImpl::AlgoFilter5ModexStridexNCHW44::exec(
auto PW = param.padding[1];
auto SW = param.stride[0];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(type, func, i, mode) \
MIDOUT_BEGIN( \
@@ -885,9 +893,9 @@ void PoolingImpl::AlgoFilter5ModexStridexNCHW44::exec(
size_t n = index / C; \
size_t c = index % C; \
do_##mode##_pooling_5x5_stride##i##_##func##_nchw44_NEON( \
static_cast<const type*>(src_ptr) + n * C * IH * IW * 4 + \
c * IH * IW * 4, \
static_cast<type*>(dst_ptr) + n * C * OH * OW * 4 + \
static_cast<const type*>(src_ptr.get_ptr()) + \
n * C * IH * IW * 4 + c * IH * IW * 4, \
static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \
c * OH * OW * 4, \
IH, IW, OH, OW, PH, PW, ws); \
}; \


dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp  (+6 -5)

@@ -50,8 +50,8 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
int sh = param.stride[0];
int fh = param.filter[0];

void* src_ptr = param.src_ptr;
void* dst_ptr = param.dst_ptr;
auto src_ptr = param.src_ptr;
auto dst_ptr = param.dst_ptr;

#define DISPATCH_FUNC(filter, stride, mode) \
MIDOUT_BEGIN( \
@@ -60,9 +60,10 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, size_t) { \
const int c_idx = index; \
pooling_fp32_nchw44<filter, stride, mode>( \
static_cast<const float*>(src_ptr) + c_idx * ih * iw * 4, \
static_cast<float*>(dst_ptr) + c_idx * oh * ow * 4, ih, iw, oh, \
ow, ph, pw); \
static_cast<const float*>(src_ptr.get_ptr()) + \
c_idx * ih * iw * 4, \
static_cast<float*>(dst_ptr.get_ptr()) + c_idx * oh * ow * 4, ih, \
iw, oh, ow, ph, pw); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), n* ic, run); \


dnn/src/arm_common/pooling/opr_impl.cpp  (+2 -2)

@@ -89,8 +89,8 @@ PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param(
PoolingKernParam ret;
static_cast<PoolingKernSizeParam&>(ret) =
make_pooling_kern_szie_param(opr, src.layout, dst.layout);
ret.src_ptr = src.raw_ptr;
ret.dst_ptr = dst.raw_ptr;
ret.src_ptr = src.get_ref_ptr();
ret.dst_ptr = dst.get_ref_ptr();
ret.workspace_ptr = workspace.raw_ptr;
ret.workspace_size = workspace.size;
return ret;


dnn/src/arm_common/pooling/opr_impl.h  (+4 -4)

@@ -56,21 +56,21 @@ public:
};

struct PoolingKernParam : public PoolingKernSizeParam {
void* src_ptr;
void* dst_ptr;
RefPtr src_ptr;
RefPtr dst_ptr;
void* workspace_ptr;
size_t workspace_size;

template <typename T>
const T* src() const {
src_type.assert_is_compatible_ctype<T>();
return static_cast<const T*>(src_ptr);
return static_cast<const T*>(src_ptr.get_ptr());
}

template <typename T>
T* dst() const {
dst_type.assert_is_compatible_ctype<T>();
return static_cast<T*>(dst_ptr);
return static_cast<T*>(dst_ptr.get_ptr());
}

template <typename T>
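
With `src_ptr`/`dst_ptr` now `RefPtr` members, the typed accessors above become the single place where the raw address is materialized. A condensed, self-contained sketch of the same pattern (the `RefPtrLite` stand-in and the skipped dtype check are simplifications of what the real header does):

```cpp
// Simplified stand-ins; the real RefPtr and assert_is_compatible_ctype<T>()
// live in MegDNN headers not shown in this diff.
struct RefPtrLite {
    void* ptr = nullptr;
    void* get_ptr() const { return ptr; }
};

struct PoolingKernParamSketch {
    RefPtrLite src_ptr, dst_ptr;

    template <typename T>
    const T* src() const {
        // real code first runs src_type.assert_is_compatible_ctype<T>()
        return static_cast<const T*>(src_ptr.get_ptr());
    }
    template <typename T>
    T* dst() const {
        return static_cast<T*>(dst_ptr.get_ptr());
    }
};

// Kernels only ever see typed pointers at the point of use:
inline float first_src_value(const PoolingKernParamSketch& p) {
    return p.src<float>()[0];
}
```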


dnn/src/arm_common/reduce/opr_impl.cpp  (+4 -4)

@@ -816,8 +816,8 @@ void ReduceImpl::exec(
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr), \
reinterpret_cast<ctype*>(dst.raw_ptr), src_type, A, B, C)); \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
@@ -828,8 +828,8 @@ void ReduceImpl::exec(
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr), \
reinterpret_cast<ctype*>(dst.raw_ptr), src_type, A, B, C)); \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \


dnn/src/arm_common/resize/direct_nchwxx.cpp  (+6 -6)

@@ -72,14 +72,14 @@ void resize_direct_nchwxx(
void megdnn::arm_common::resize_direct_nearest_nchw44_fp32(
const ResizeImpl::KernParam<float>& kern_param) {
resize_direct_nchwxx<float, InterpolationMode::INTER_NEAREST>(
kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4,
kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4,
kern_param.ih, kern_param.iw, kern_param.oh, kern_param.ow);
}

void megdnn::arm_common::resize_direct_linear_nchw44_fp32(
const ResizeImpl::KernParam<float>& kern_param) {
resize_direct_nchwxx<float, InterpolationMode::INTER_LINEAR>(
kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4,
kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4,
kern_param.ih, kern_param.iw, kern_param.oh, kern_param.ow);
}

@@ -87,8 +87,8 @@ void megdnn::arm_common::resize_direct_linear_nchw44_fp32(

void megdnn::arm_common::resize_direct_nearest_nchw88_fp16(
const ResizeImpl::KernParam<dt_float16>& kern_param) {
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr);
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr);
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr.get_ptr());
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr());
resize_direct_nchwxx<__fp16, InterpolationMode::INTER_NEAREST>(
sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw,
kern_param.oh, kern_param.ow);
@@ -96,8 +96,8 @@ void megdnn::arm_common::resize_direct_nearest_nchw88_fp16(

void megdnn::arm_common::resize_direct_linear_nchw88_fp16(
const ResizeImpl::KernParam<dt_float16>& kern_param) {
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr);
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr);
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr.get_ptr());
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr());
resize_direct_nchwxx<__fp16, InterpolationMode::INTER_LINEAR>(
sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw,
kern_param.oh, kern_param.ow);


dnn/src/arm_common/resize/upsample2_nchw.cpp  (+6 -6)

@@ -191,14 +191,14 @@ void nearest_upsample2_nchw(
void megdnn::arm_common::resize_linear_upsample2_nchw_fp32(
const ResizeImpl::KernParam<float>& kern_param) {
linear_upsample2_nchw(
kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c,
kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c,
kern_param.ih, kern_param.iw);
}

void megdnn::arm_common::resize_nearest_upsample2_nchw_fp32(
const ResizeImpl::KernParam<float>& kern_param) {
nearest_upsample2_nchw(
kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c,
kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c,
kern_param.ih, kern_param.iw);
}

@@ -206,16 +206,16 @@ void megdnn::arm_common::resize_nearest_upsample2_nchw_fp32(

void megdnn::arm_common::resize_linear_upsample2_nchw_fp16(
const ResizeImpl::KernParam<dt_float16>& kern_param) {
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr);
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr);
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr.get_ptr());
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr());
linear_upsample2_nchw(
sptr, dptr, kern_param.n * kern_param.c, kern_param.ih, kern_param.iw);
}

void megdnn::arm_common::resize_nearest_upsample2_nchw_fp16(
const ResizeImpl::KernParam<dt_float16>& kern_param) {
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr);
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr);
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr.get_ptr());
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr());
nearest_upsample2_nchw(
sptr, dptr, kern_param.n * kern_param.c, kern_param.ih, kern_param.iw);
}


dnn/src/arm_common/resize/upsample2_nchwxx.cpp  (+6 -6)

@@ -158,14 +158,14 @@ void nearest_upsample2_nchwxx(
void megdnn::arm_common::resize_linear_upsample2_nchw44_fp32(
const ResizeImpl::KernParam<float>& kern_param) {
linear_upsample2_nchwxx(
kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4,
kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4,
kern_param.ih, kern_param.iw);
}

void megdnn::arm_common::resize_nearest_upsample2_nchw44_fp32(
const ResizeImpl::KernParam<float>& kern_param) {
nearest_upsample2_nchwxx(
kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4,
kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4,
kern_param.ih, kern_param.iw);
}

@@ -173,16 +173,16 @@ void megdnn::arm_common::resize_nearest_upsample2_nchw44_fp32(

void megdnn::arm_common::resize_linear_upsample2_nchw88_fp16(
const ResizeImpl::KernParam<dt_float16>& kern_param) {
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr);
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr);
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr.get_ptr());
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr());
linear_upsample2_nchwxx(
sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw);
}

void megdnn::arm_common::resize_nearest_upsample2_nchw88_fp16(
const ResizeImpl::KernParam<dt_float16>& kern_param) {
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr);
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr);
auto sptr = reinterpret_cast<const __fp16*>(kern_param.sptr.get_ptr());
auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr());
nearest_upsample2_nchwxx(
sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw);
}


dnn/src/arm_common/separable_filter/opr_impl.cpp  (+4 -4)

@@ -78,9 +78,9 @@ void SeparableFilterImpl::separable_filter_exec_8u(
megdnn_assert(src.layout.dtype == dtype::Uint8());

Mat<float> kernel_column(
1, filter_y.layout.shape[3], 1, static_cast<float*>(filter_y.raw_ptr));
1, filter_y.layout.shape[3], 1, static_cast<float*>(filter_y.raw_ptr()));
Mat<float> kernel_row(
1, filter_x.layout.shape[3], 1, static_cast<float*>(filter_x.raw_ptr));
1, filter_x.layout.shape[3], 1, static_cast<float*>(filter_x.raw_ptr()));

size_t src_channels = src.layout.shape[3];

@@ -128,9 +128,9 @@ void SeparableFilterImpl::separable_filter_exec(
_megdnn_tensor_in src, _megdnn_tensor_in filter_x, _megdnn_tensor_in filter_y,
_megdnn_tensor_out dst) {
Mat<T> kernel_column(
1, filter_y.layout.shape[3], 1, static_cast<T*>(filter_y.raw_ptr));
1, filter_y.layout.shape[3], 1, static_cast<T*>(filter_y.raw_ptr()));
Mat<T> kernel_row(
1, filter_x.layout.shape[3], 1, static_cast<T*>(filter_x.raw_ptr));
1, filter_x.layout.shape[3], 1, static_cast<T*>(filter_x.raw_ptr()));
size_t src_channels = src.layout.shape[3];

T border_value[4] = {0, 0, 0, 0};


dnn/src/arm_common/type_cvt/opr_impl.cpp  (+12 -12)

@@ -483,18 +483,18 @@ void TypeCvtImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
#undef DISPATCH_QUANTIZED

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define DISPATCH_FLOAT(_stype_enumv, _stype, _dtype_enumv, _dtype, _midout_iv) \
if (src_dtype.enumv() == DTypeTrait<_stype_enumv>::enumv && \
dst_dtype.enumv() == DTypeTrait<_dtype_enumv>::enumv) { \
MIDOUT_BEGIN(megdnn_arm_typecvt_float, midout_iv(_midout_iv)) { \
using _TypeCvter = FloatTypeCvter<_stype, _dtype>; \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_typecvt<_TypeCvter>( \
reinterpret_cast<_stype*>(src.raw_ptr), \
reinterpret_cast<_dtype*>(dst.raw_ptr), src_dtype, dst_dtype, \
nr_elems)); \
execed = true; \
} \
MIDOUT_END(); \
#define DISPATCH_FLOAT(_stype_enumv, _stype, _dtype_enumv, _dtype, _midout_iv) \
if (src_dtype.enumv() == DTypeTrait<_stype_enumv>::enumv && \
dst_dtype.enumv() == DTypeTrait<_dtype_enumv>::enumv) { \
MIDOUT_BEGIN(megdnn_arm_typecvt_float, midout_iv(_midout_iv)) { \
using _TypeCvter = FloatTypeCvter<_stype, _dtype>; \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_typecvt<_TypeCvter>( \
reinterpret_cast<_stype*>(src.raw_ptr()), \
reinterpret_cast<_dtype*>(dst.raw_ptr()), src_dtype, dst_dtype, \
nr_elems)); \
execed = true; \
} \
MIDOUT_END(); \
}
DISPATCH_FLOAT(dt_float16, __fp16, float, float, 0);
DISPATCH_FLOAT(float, float, dt_float16, __fp16, 1);


dnn/src/arm_common/warp_perspective/warp_perspective_cv.cpp  (+10 -12)

@@ -167,21 +167,17 @@ void megdnn::arm_common::warp_perspective_cv_exec(
megdnn_assert(
ch == 1 || ch == 3 || ch == 2,
"unsupported src channel: %zu, avaiable channel size: 1/2/3", ch);
const float* trans_ptr = trans.ptr<dt_float32>();
const int* midx_ptr = nullptr;
if (mat_idx.raw_ptr) {
megdnn_assert(mat_idx.layout.ndim == 1);
midx_ptr = mat_idx.ptr<int>();
}
if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
#define cb(_imode, _bmode, _ch) \
auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \
auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \
size_t index, size_t) { \
size_t batch_id = index / parallelism_batch; \
size_t task_id = index % parallelism_batch; \
size_t src_id = batch_id; \
if (midx_ptr) { \
src_id = midx_ptr[batch_id]; \
const float* trans_ptr = trans.ptr<dt_float32>(); \
if (mat_idx.raw_ptr()) { \
megdnn_assert(mat_idx.layout.ndim == 1); \
src_id = mat_idx.ptr<int>()[batch_id]; \
megdnn_assert( \
src_id < src.layout.shape[0], \
"mat_idx out of bound: mat_idx[%zu]=%zu src_batch=%zu", batch_id, \
@@ -202,13 +198,15 @@ void megdnn::arm_common::warp_perspective_cv_exec(
#undef cb
} else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) {
#define cb(_imode, _bmode, _ch) \
auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \
auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \
size_t index, size_t) { \
size_t batch_id = index / parallelism_batch; \
size_t task_id = index % parallelism_batch; \
size_t src_id = batch_id; \
if (midx_ptr) { \
src_id = midx_ptr[batch_id]; \
const float* trans_ptr = trans.ptr<dt_float32>(); \
if (mat_idx.raw_ptr()) { \
megdnn_assert(mat_idx.layout.ndim == 1); \
src_id = mat_idx.ptr<int>()[batch_id]; \
megdnn_assert( \
src_id < src.layout.shape[0], \
"mat_idx out of bound: mat_idx[%zu]=%zu src_batch=%zu", batch_id, \


+ 3
- 3
dnn/src/armv7/relayout/opr_impl.cpp View File

@@ -136,10 +136,10 @@ void armv7::RelayoutForwardImpl::exec(
relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
auto sptr = static_cast<TransposeByte*>(src.raw_ptr),
dptr = static_cast<TransposeByte*>(dst.raw_ptr);
MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<TransposeByte>(
trans_param.batch, trans_param.m, trans_param.n, sptr, dptr));
trans_param.batch, trans_param.m, trans_param.n,
static_cast<TransposeByte*>(src.raw_ptr()),
static_cast<TransposeByte*>(dst.raw_ptr())));
return;
}
exec_after_preprocess(src, dst, trans ? &trans_param : nullptr);


+ 3
- 1
dnn/src/armv7/rotate/opr_impl.cpp View File

@@ -288,11 +288,13 @@ void RotateImpl::exec(
return fallback::RotateImpl::exec(src, dst, workspace);
}

auto clockwise = param().clockwise;

MEGDNN_DISPATCH_CPU_KERN_OPR({
for (size_t i = 0; i < src.layout.shape[0]; ++i) {
Mat<uchar> src_mat = TensorND2Mat<uchar>(src, i);
Mat<uchar> dst_mat = TensorND2Mat<uchar>(dst, i);
rotate(src_mat, dst_mat, param().clockwise);
rotate(src_mat, dst_mat, clockwise);
}
});
}
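
The rotate change reads param().clockwise once into a local and lets the dispatched block capture that copy, so the queued kernel no longer reaches back into operator state. A small sketch of the capture-by-value pattern, with hypothetical Param/Opr names:

#include <cstdio>
#include <functional>
#include <vector>

struct Param {
    bool clockwise = true;
};

struct Opr {
    Param m_param;
    std::vector<std::function<void()>>* queue;
    void exec() {
        bool clockwise = m_param.clockwise;  // snapshot operator state now
        queue->push_back([clockwise] {       // the queued kernel only sees the copy
            std::printf("clockwise=%d\n", clockwise);
        });
    }
};

int main() {
    std::vector<std::function<void()>> q;
    Opr opr{{true}, &q};
    opr.exec();
    opr.m_param.clockwise = false;  // a later change does not affect the queued kernel
    for (auto& task : q) task();    // prints clockwise=1
    return 0;
}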


+ 1
- 1
dnn/src/atlas/checksum/opr_impl.cpp View File

@@ -36,7 +36,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec(
megcoreComputingHandle_t comp_handle = handle()->megcore_computing_handle();
megcoreGetDeviceHandle(comp_handle, &dev_handle);
megcoreMemcpy(
comp_handle, cpu_data.data(), data.raw_ptr, cpu_data.size(),
comp_handle, cpu_data.data(), data.raw_ptr(), cpu_data.size(),
megcoreMemcpyDeviceToHost);
megcoreSynchronize(comp_handle);



+ 2
- 2
dnn/src/cambricon/checksum/opr_impl.cpp View File

@@ -62,7 +62,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec(
check_exec(data.layout, workspace.size);
auto queue = cnrt_queue(handle());

auto ptr = static_cast<uint8_t*>(data.raw_ptr);
auto ptr = static_cast<uint8_t*>(data.raw_ptr());
size_t size_all = data.layout.shape[0], size_ints = size_all / sizeof(uint32_t);
auto last_val_size = std::min<size_t>(size_all, 4);
cnrt_check(cnrtMemcpyAsync(
@@ -72,7 +72,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec(
auto&& device_info = current_device_info();
bang_c_wrapper(
reinterpret_cast<uint32_t*>(workspace.raw_ptr),
static_cast<uint32_t*>(data.raw_ptr), size_ints, queue,
static_cast<uint32_t*>(data.raw_ptr()), size_ints, queue,
device_info.core_version);
cnrt_check(cnrtMemcpyAsync(
&result.checksum, workspace.raw_ptr, sizeof(result.checksum), queue,


+ 3
- 4
dnn/src/common/concat_split.cpp View File

@@ -38,10 +38,9 @@ void ConcatSplitBase::check_layout_common(
megdnn_assert_eq_size_t(src.ndim, ndim);
}
// ensure param().axis is correct
auto errmsg = "param().axis=" + std::to_string(param().axis) +
", ndim=" + std::to_string(ndim);
MEGDNN_MARK_USED_VAR(errmsg);
megdnn_assert(param().axis < static_cast<int32_t>(ndim), "%s", errmsg.c_str());
megdnn_assert(
param().axis < static_cast<int32_t>(ndim), "param().axis=%u, ndim=%zu",
param().axis, ndim);
// ensure shape size for each axis is correct
for (size_t i = 0; i < ndim; ++i) {
if (i == static_cast<size_t>(param().axis)) {


+ 6
- 10
dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp View File

@@ -24,28 +24,24 @@ void ElemwiseMultiTypeImplHelper::exec(
_megdnn_in const TensorNDArray& src, _megdnn_tensor_out dst) {
switch (m_param.mode) {
case Mode::FUSE_MUL_ADD3_INT16x32x32x32:
on_fuse_mul_add3_int16x32x32x32(
make_elemwise_op_param<3>(src, dst), dst.ptr<dt_int32>());
on_fuse_mul_add3_int16x32x32x32(make_elemwise_op_param<3>(src, dst), dst);
break;
case Mode::FUSE_MUL_ADD3_IXxF32xF32xI8:
on_fuse_mul_add3_iXxf32xf32xi8(
make_elemwise_op_param<3>(src, dst), dst.ptr<dt_int8>());
on_fuse_mul_add3_iXxf32xf32xi8(make_elemwise_op_param<3>(src, dst), dst);
break;
case Mode::ROUND_SHR_SATURATE_IXxI8xI8:
on_round_shr_saturate_iXxi8xi8(
make_elemwise_op_param<2>(src, dst), dst.ptr<dt_int8>());
on_round_shr_saturate_iXxi8xi8(make_elemwise_op_param<2>(src, dst), dst);
break;
case Mode::FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8:
on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
make_elemwise_op_param<6>(src, dst), dst.ptr<dt_int8>());
make_elemwise_op_param<6>(src, dst), dst);
break;
case Mode::FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8:
on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
make_elemwise_op_param<6>(src, dst), dst.ptr<dt_int8>());
make_elemwise_op_param<6>(src, dst), dst);
break;
case Mode::ROUND_SHR_SATURATE_IXxI8xI16:
on_round_shr_saturate_iXxi8xi16(
make_elemwise_op_param<2>(src, dst), dst.ptr<dt_int16>());
on_round_shr_saturate_iXxi8xi16(make_elemwise_op_param<2>(src, dst), dst);
break;
ON_QUANTIZED_MODE(RELU, 1);
ON_QUANTIZED_MODE(ABS, 1);


+ 6
- 6
dnn/src/common/elemwise_multi_type/opr_impl_helper.h View File

@@ -33,22 +33,22 @@ class ElemwiseMultiTypeImplHelper : public ElemwiseMultiType,

protected:
virtual void on_fuse_mul_add3_int16x32x32x32(
const ElemwiseOpParamN<3>& param, dt_int32* dst) = 0;
const ElemwiseOpParamN<3>& param, const TensorND& dst) = 0;

virtual void on_fuse_mul_add3_iXxf32xf32xi8(
const ElemwiseOpParamN<3>& param, dt_int8* dst) = 0;
const ElemwiseOpParamN<3>& param, const TensorND& dst) = 0;

virtual void on_round_shr_saturate_iXxi8xi8(
const ElemwiseOpParamN<2>& param, dt_int8* dst) = 0;
const ElemwiseOpParamN<2>& param, const TensorND& dst) = 0;

virtual void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) = 0;
const ElemwiseOpParamN<6>& param, const TensorND& dst) = 0;

virtual void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) = 0;
const ElemwiseOpParamN<6>& param, const TensorND& dst) = 0;

virtual void on_round_shr_saturate_iXxi8xi16(
const ElemwiseOpParamN<2>& param, dt_int16* dst) = 0;
const ElemwiseOpParamN<2>& param, const TensorND& dst) = 0;

virtual void on_quantized_mode(
const ElemwiseOpParamN<1>& param, const TensorND& dst,
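
Together, the two hunks above change the ElemwiseMultiType hooks to take the whole destination TensorND instead of a pre-extracted typed pointer, leaving it to each backend to call ptr<T>()/raw_ptr() at execution time. A schematic of the signature change with simplified stand-in types:

#include <cstdint>
#include <cstdio>

// Simplified stand-ins for illustration.
struct TensorND {
    void* storage = nullptr;
    template <typename T>
    T* ptr() const { return static_cast<T*>(storage); }
};

struct ElemwiseMultiTypeBase {
    // before: virtual void on_round_shr_saturate(std::int8_t* dst) = 0;
    virtual void on_round_shr_saturate(const TensorND& dst) = 0;  // after
    virtual ~ElemwiseMultiTypeBase() = default;
};

struct CpuImpl final : ElemwiseMultiTypeBase {
    void on_round_shr_saturate(const TensorND& dst) override {
        std::int8_t* out = dst.ptr<std::int8_t>();  // pointer taken inside the backend
        out[0] = 1;
    }
};

int main() {
    std::int8_t buf[1] = {0};
    TensorND dst{buf};
    CpuImpl impl;
    impl.on_round_shr_saturate(dst);
    std::printf("%d\n", buf[0]);  // 1
    return 0;
}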


+ 6
- 6
dnn/src/common/local/local_def.inl View File

@@ -29,9 +29,9 @@ template <int N, int OC>
void local_xcorr_tpl(const LocalKParam& kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET;
template <int N, int OC>
void local_xcorr_tpl(const LocalKParam& kparam) {
const float* src = static_cast<const float*>(kparam.src);
const float* filter = static_cast<const float*>(kparam.filter);
float* dst = static_cast<float*>(kparam.dst);
const float* src = static_cast<const float*>(kparam.src.get_ptr());
const float* filter = static_cast<const float*>(kparam.filter.get_ptr());
float* dst = static_cast<float*>(kparam.dst.get_ptr());
float* workspace = static_cast<float*>(kparam.workspace);
const int IC = kparam.ic, IH = kparam.ih, IW = kparam.iw, OH = kparam.oh,
OW = kparam.ow, FH = kparam.fh, FW = kparam.fw;
@@ -191,9 +191,9 @@ template <int N, int OC>
void local_conv_tpl(const LocalKParam& kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET;
template <int N, int OC>
void local_conv_tpl(const LocalKParam& kparam) {
const float* src = static_cast<const float*>(kparam.src);
const float* filter = static_cast<const float*>(kparam.filter);
float* dst = static_cast<float*>(kparam.dst);
const float* src = static_cast<const float*>(kparam.src.get_ptr());
const float* filter = static_cast<const float*>(kparam.filter.get_ptr());
float* dst = static_cast<float*>(kparam.dst.get_ptr());
float* workspace = static_cast<float*>(kparam.workspace);
const int IC = kparam.ic, IH = kparam.ih, IW = kparam.iw, OH = kparam.oh,
OW = kparam.ow, FH = kparam.fh, FW = kparam.fw;


+ 58
- 96
dnn/src/common/reduce_helper.h View File

@@ -11,9 +11,7 @@
#pragma once
#include "megdnn/dtype.h"

#if MEGDNN_CC_HOST
#include "megdnn/basic_types.h"
#endif

namespace megdnn {
namespace reduce {
@@ -24,16 +22,14 @@ struct SumOp {

const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE SumOp(src_ctype* src, dst_ctype* dst, size_t B)
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) { return lhs + rhs; }
SumOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

@@ -43,18 +39,16 @@ struct MeanOp {

const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
dst[idx] = val / static_cast<wtype>(B);
}
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) {
dst.ptr<dst_ctype>()[idx] = val / static_cast<wtype>(B);
}
MEGDNN_HOST MEGDNN_DEVICE MeanOp(src_ctype* src, dst_ctype* dst, size_t B)
static wtype apply(wtype lhs, wtype rhs) { return lhs + rhs; }
MeanOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

@@ -64,18 +58,17 @@ struct SumSqrOp {

const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return static_cast<wtype>(src[idx]) * static_cast<wtype>(src[idx]);
wtype read(uint32_t idx) {
return static_cast<wtype>(src.ptr<src_ctype>()[idx]) *
static_cast<wtype>(src.ptr<src_ctype>()[idx]);
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE SumSqrOp(src_ctype* src, dst_ctype* dst, size_t B)
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) { return lhs + rhs; }
SumSqrOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

@@ -84,16 +77,14 @@ struct ProdOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs * rhs;
}
MEGDNN_HOST MEGDNN_DEVICE ProdOp(src_ctype* src, dst_ctype* dst, size_t B)
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) { return lhs * rhs; }
ProdOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(1)), src(src), dst(dst), B(B) {}
};

@@ -102,20 +93,14 @@ struct MinOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return lhs < rhs ? lhs : rhs;
#else
return std::min(lhs, rhs);
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B)
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) { return std::min(lhs, rhs); }
MinOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::max())), src(src), dst(dst), B(B) {}
};

@@ -124,20 +109,16 @@ struct MinOp<src_ctype, dst_ctype, dt_float32> {
typedef dt_float32 wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return (isnan(lhs) || lhs < rhs) ? lhs : rhs;
#else
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) {
return (std::isnan(lhs) || lhs < rhs) ? lhs : rhs;
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B)
MinOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::max())), src(src), dst(dst), B(B) {}
};

@@ -146,20 +127,14 @@ struct MaxOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return lhs > rhs ? lhs : rhs;
#else
return std::max(lhs, rhs);
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B)
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) { return std::max(lhs, rhs); }
MaxOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::min())), src(src), dst(dst), B(B) {}
};

@@ -168,20 +143,16 @@ struct MaxOp<src_ctype, dst_ctype, dt_float32> {
typedef dt_float32 wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return (isnan(lhs) || lhs > rhs) ? lhs : rhs;
#else
wtype read(uint32_t idx) { return src.ptr<src_ctype>()[idx]; }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) {
return (std::isnan(lhs) || lhs > rhs) ? lhs : rhs;
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B)
MaxOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::min())), src(src), dst(dst), B(B) {}
};

@@ -190,28 +161,19 @@ struct CheckNonFiniteOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
RefPtr src;
RefPtr dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
#if defined(__CUDA_ARCH__)
return !isfinite(src[idx]);
#else
return !std::isfinite(src[idx]);
#endif
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs | rhs;
}
MEGDNN_HOST MEGDNN_DEVICE CheckNonFiniteOp(src_ctype* src, dst_ctype* dst, size_t B)
wtype read(uint32_t idx) { return !std::isfinite(src.ptr<src_ctype>()[idx]); }
void write(uint32_t idx, wtype val) { dst.ptr<dst_ctype>()[idx] = val; }
static wtype apply(wtype lhs, wtype rhs) { return lhs | rhs; }
MEGDNN_HOST MEGDNN_DEVICE
CheckNonFiniteOp(const RefPtr& src, const RefPtr& dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

#if MEGDNN_CC_HOST
void get_ABC(const TensorShape& shape, size_t& A, size_t& B, size_t& C, size_t axis);
#endif

} // namespace reduce
} // namespace megdnn
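
After this rewrite the ops in reduce_helper.h are host-only: they hold RefPtr handles and dereference them through ptr<T>() on every access, while the CUDA-capable raw-pointer variants move to the new reduce_helper_device.h shown next. A compilable sketch of the host-side shape of such an op, with RefPtr mocked up as in the earlier snippet:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>

// RefPtr mocked for illustration; the real one lives in the megdnn headers.
struct RefPtr {
    std::shared_ptr<void*> slot = std::make_shared<void*>(nullptr);
    void reset(void* p) { *slot = p; }
    template <typename T>
    T* ptr() const { return static_cast<T*>(*slot); }
};

template <typename src_t, typename dst_t, typename wtype>
struct SumOp {
    const wtype INIT = 0;
    RefPtr src, dst;
    const size_t B;
    wtype read(uint32_t i) { return src.ptr<src_t>()[i]; }
    void write(uint32_t i, wtype v) { dst.ptr<dst_t>()[i] = v; }
    static wtype apply(wtype a, wtype b) { return a + b; }
    SumOp(const RefPtr& s, const RefPtr& d, size_t b) : src(s), dst(d), B(b) {}
};

int main() {
    float in[3] = {1.f, 2.f, 3.f}, out[1] = {0.f};
    RefPtr s, d;
    s.reset(in);
    d.reset(out);
    SumOp<float, float, float> op(s, d, 3);
    float acc = op.INIT;
    for (uint32_t i = 0; i < op.B; ++i) acc = op.apply(acc, op.read(i));
    op.write(0, acc);
    std::printf("%g\n", out[0]);  // 6
    return 0;
}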


+ 222
- 0
dnn/src/common/reduce_helper_device.h View File

@@ -0,0 +1,222 @@
/**
* \file dnn/src/common/reduce_helper_device.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/dtype.h"

#if MEGDNN_CC_HOST
#include "megdnn/basic_types.h"
#endif

namespace megdnn {
namespace device_reduce {

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct SumOp {
typedef wtype_ wtype;

const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE SumOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct MeanOp {
typedef wtype_ wtype;

const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
dst[idx] = val / static_cast<wtype>(B);
}
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE MeanOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct SumSqrOp {
typedef wtype_ wtype;

const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return static_cast<wtype>(src[idx]) * static_cast<wtype>(src[idx]);
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE SumSqrOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct ProdOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs * rhs;
}
MEGDNN_HOST MEGDNN_DEVICE ProdOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(1)), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct MinOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return lhs < rhs ? lhs : rhs;
#else
return std::min(lhs, rhs);
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::max())), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype>
struct MinOp<src_ctype, dst_ctype, dt_float32> {
typedef dt_float32 wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return (isnan(lhs) || lhs < rhs) ? lhs : rhs;
#else
return (std::isnan(lhs) || lhs < rhs) ? lhs : rhs;
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::max())), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct MaxOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return lhs > rhs ? lhs : rhs;
#else
return std::max(lhs, rhs);
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::min())), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype>
struct MaxOp<src_ctype, dst_ctype, dt_float32> {
typedef dt_float32 wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
#if defined(__CUDA_ARCH__)
return (isnan(lhs) || lhs > rhs) ? lhs : rhs;
#else
return (std::isnan(lhs) || lhs > rhs) ? lhs : rhs;
#endif
}
MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(DTypeTrait<wtype>::min())), src(src), dst(dst), B(B) {}
};

template <typename src_ctype, typename dst_ctype, typename wtype_>
struct CheckNonFiniteOp {
typedef wtype_ wtype;
const wtype INIT;

src_ctype* src;
dst_ctype* dst;
const size_t B;

MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
#if defined(__CUDA_ARCH__)
return !isfinite(src[idx]);
#else
return !std::isfinite(src[idx]);
#endif
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs | rhs;
}
MEGDNN_HOST MEGDNN_DEVICE CheckNonFiniteOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};

} // namespace device_reduce

namespace reduce {
#if MEGDNN_CC_HOST
void get_ABC(const TensorShape& shape, size_t& A, size_t& B, size_t& C, size_t axis);
#endif
} // namespace reduce

} // namespace megdnn

// vim: syntax=cpp.doxygen
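
reduce_helper_device.h keeps the raw-pointer ops, presumably because the RefPtr indirection is not usable inside __device__ code; CUDA kernels keep plain src_ctype*/dst_ctype* under the device_reduce namespace, while host code uses the RefPtr flavor above. Both families expose the same read/write/apply/INIT interface, so a generic reduction driver can be written once; a host-compiled sketch (RawMaxOp below is illustrative, not part of megdnn):

#include <cstdint>
#include <cstdio>

// Both op families expose the same read/write/apply/INIT interface, so a
// generic reduction driver can be written once and instantiated with either.
template <typename Op>
typename Op::wtype run_reduce(Op op, uint32_t n) {
    typename Op::wtype acc = op.INIT;
    for (uint32_t i = 0; i < n; ++i)
        acc = Op::apply(acc, op.read(i));
    return acc;
}

// A raw-pointer op in the style of device_reduce (host-compiled here).
struct RawMaxOp {
    typedef float wtype;
    const wtype INIT;
    const float* src;
    wtype read(uint32_t i) const { return src[i]; }
    static wtype apply(wtype a, wtype b) { return a > b ? a : b; }
};

int main() {
    float v[4] = {3.f, 7.f, 1.f, 5.f};
    std::printf("%g\n", run_reduce(RawMaxOp{-1e30f, v}, 4));  // 7
    return 0;
}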

+ 7
- 2
dnn/src/common/utils.h View File

@@ -362,6 +362,10 @@ static inline void copy_plane_in_bytes(

megcoreDeviceHandle_t get_device_handle(Handle* handle);

static inline void incr_refp(RefPtr& ptr, ptrdiff_t delta) {
ptr += (size_t)delta;
}

static inline void incr_voidp(void*& ptr, ptrdiff_t delta) {
ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) + delta);
}
@@ -674,7 +678,8 @@ struct CompTypeCvter {
comp.layout.dtype.enumv() != DTypeTrait<CompType>::enumv) {
comp.layout.dtype = CompType();
comp.layout.init_contiguous_stride();
comp.raw_ptr = m_workspace_bundle->get(m_workspace_idx++);
comp = TensorND{
m_workspace_bundle->get(m_workspace_idx++), comp.layout};
if (src.layout.ndim) {
m_cvt_opr->exec(src, comp);
}
@@ -699,7 +704,7 @@ struct CompTypeCvter {
* \brief get TensorND raw_ptr+low_byte pointer.
*/
inline dt_byte* get_low_ptr(const TensorND* tensor) {
return static_cast<dt_byte*>(tensor->raw_ptr) + tensor->layout.span().low_byte;
return static_cast<dt_byte*>(tensor->raw_ptr()) + tensor->layout.span().low_byte;
}

/*!
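
incr_refp mirrors the old incr_voidp but advances a RefPtr in place; the per-group loop in group_conv.cpp further down relies on it. A toy model of the arithmetic, with RefPtr mocked up again (only the incr_refp body matches the hunk above):

#include <cstddef>
#include <cstdio>
#include <memory>

struct RefPtr {  // illustrative stand-in
    std::shared_ptr<void*> slot = std::make_shared<void*>(nullptr);
    void reset(void* p) { *slot = p; }
    void* get_ptr() const { return *slot; }
    RefPtr& operator+=(std::size_t bytes) {
        *slot = static_cast<char*>(*slot) + bytes;
        return *this;
    }
};

static inline void incr_refp(RefPtr& ptr, std::ptrdiff_t delta) {
    ptr += (std::size_t)delta;  // same arithmetic as the old incr_voidp, on the shared slot
}

int main() {
    float buf[8] = {};
    RefPtr p;
    p.reset(buf);
    incr_refp(p, 4 * sizeof(float));  // step to the next group/slice
    std::printf("%td\n", static_cast<float*>(p.get_ptr()) - buf);  // 4
    return 0;
}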


+ 1
- 1
dnn/src/cuda/argmxx/opr_impl.cpp View File

@@ -11,7 +11,7 @@
#include "src/cuda/argmxx/opr_impl.h"

#include "src/common/argmxx_helper.h"
#include "src/common/reduce_helper.h"
#include "src/common/reduce_helper_device.h"
#include "src/cuda/reduce_helper.cuh"
#include "src/cuda/utils.h"



+ 38
- 36
dnn/src/cuda/batch_normalization/opr_impl.cpp View File

@@ -117,32 +117,34 @@ void BNForwardImpl::exec(
#if CUDNN_VERSION >= 7410
cudnn_check(cudnnBatchNormalizationForwardTrainingEx(
handle, tensor_desc.bn_mode, CUDNN_BATCHNORM_OPS_BN, &alpha,
&beta, // one & zero
tensor_desc.xy_desc.desc, src.raw_ptr, // xDesc & x
nullptr, nullptr, // zDesc & z
tensor_desc.xy_desc.desc, dst.raw_ptr, // yDesc & y
tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc
bn_scale.raw_ptr, bn_bias.raw_ptr, m_param.avg_factor, mean.raw_ptr,
variance.raw_ptr, m_param.epsilon, batch_mean.raw_ptr,
batch_inv_variance.raw_ptr, nullptr, workspace.raw_ptr,
workspace.size, reserve.raw_ptr, reserve.layout.access_bytes()));
&beta, // one & zero
tensor_desc.xy_desc.desc, src.raw_ptr(), // xDesc & x
nullptr, nullptr, // zDesc & z
tensor_desc.xy_desc.desc, dst.raw_ptr(), // yDesc & y
tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc
bn_scale.raw_ptr(), bn_bias.raw_ptr(), m_param.avg_factor,
mean.raw_ptr(), variance.raw_ptr(), m_param.epsilon,
batch_mean.raw_ptr(), batch_inv_variance.raw_ptr(), nullptr,
workspace.raw_ptr, workspace.size, reserve.raw_ptr(),
reserve.layout.access_bytes()));
#else
cudnn_check(cudnnBatchNormalizationForwardTraining(
handle, tensor_desc.bn_mode, &alpha, &beta,
tensor_desc.xy_desc.desc, src.raw_ptr, // xDesc & x
tensor_desc.xy_desc.desc, dst.raw_ptr, // yDesc & y
tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc
bn_scale.raw_ptr, bn_bias.raw_ptr, m_param.avg_factor, mean.raw_ptr,
variance.raw_ptr, m_param.epsilon, batch_mean.raw_ptr,
batch_inv_variance.raw_ptr));
tensor_desc.xy_desc.desc, src.raw_ptr(), // xDesc & x
tensor_desc.xy_desc.desc, dst.raw_ptr(), // yDesc & y
tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc
bn_scale.raw_ptr(), bn_bias.raw_ptr(), m_param.avg_factor,
mean.raw_ptr(), variance.raw_ptr(), m_param.epsilon,
batch_mean.raw_ptr(), batch_inv_variance.raw_ptr()));
#endif // CUDNN_VERSION >= 7410
break;
case param::BN::FwdMode::INFERENCE:
cudnn_check(cudnnBatchNormalizationForwardInference(
handle, tensor_desc.bn_mode, &alpha, &beta,
tensor_desc.xy_desc.desc, src.raw_ptr, tensor_desc.xy_desc.desc,
dst.raw_ptr, tensor_desc.param_desc.desc, bn_scale.raw_ptr,
bn_bias.raw_ptr, mean.raw_ptr, variance.raw_ptr, m_param.epsilon));
tensor_desc.xy_desc.desc, src.raw_ptr(), tensor_desc.xy_desc.desc,
dst.raw_ptr(), tensor_desc.param_desc.desc, bn_scale.raw_ptr(),
bn_bias.raw_ptr(), mean.raw_ptr(), variance.raw_ptr(),
m_param.epsilon));
break;
default:
megdnn_throw("Unknown forward mode type of batch normalization.");
@@ -198,27 +200,27 @@ void BNBackwardImpl::exec(
cudnn_check(cudnnBatchNormalizationBackwardEx(
handle, tensor_desc.bn_mode, CUDNN_BATCHNORM_OPS_BN, &alpha, &beta, &alpha,
&beta, tensor_desc.xy_desc.desc,
x.raw_ptr, // xDesc & x
nullptr, nullptr, // yDesc & y
tensor_desc.xy_desc.desc, dy.raw_ptr, // dyDesc & dy
nullptr, nullptr, // dzDesc & dz
tensor_desc.xy_desc.desc, dx.raw_ptr, // dxDesc & dx
tensor_desc.param_desc.desc, bn_scale.raw_ptr, // bnScale
nullptr, // bnBias
d_bn_scale.raw_ptr, d_bn_bias.raw_ptr, // dScale, dBias
m_param.epsilon, saved_batch_mean.raw_ptr, saved_batch_inv_variance.raw_ptr,
nullptr, workspace.raw_ptr, workspace.size, reserve.raw_ptr,
reserve.layout.access_bytes()));
x.raw_ptr(), // xDesc & x
nullptr, nullptr, // yDesc & y
tensor_desc.xy_desc.desc, dy.raw_ptr(), // dyDesc & dy
nullptr, nullptr, // dzDesc & dz
tensor_desc.xy_desc.desc, dx.raw_ptr(), // dxDesc & dx
tensor_desc.param_desc.desc, bn_scale.raw_ptr(), // bnScale
nullptr, // bnBias
d_bn_scale.raw_ptr(), d_bn_bias.raw_ptr(), // dScale, dBias
m_param.epsilon, saved_batch_mean.raw_ptr(),
saved_batch_inv_variance.raw_ptr(), nullptr, workspace.raw_ptr,
workspace.size, reserve.raw_ptr(), reserve.layout.access_bytes()));
#else
cudnn_check(cudnnBatchNormalizationBackward(
handle, tensor_desc.bn_mode, &alpha, &beta, &alpha, &beta,
tensor_desc.xy_desc.desc, x.raw_ptr, // xDesc & x
tensor_desc.xy_desc.desc, dy.raw_ptr, // dyDesc & dy
tensor_desc.xy_desc.desc, dx.raw_ptr, // dxDesc & dx
tensor_desc.param_desc.desc, bn_scale.raw_ptr, // bnScale
d_bn_scale.raw_ptr, d_bn_bias.raw_ptr, // dScale, dBias
m_param.epsilon, saved_batch_mean.raw_ptr,
saved_batch_inv_variance.raw_ptr));
tensor_desc.xy_desc.desc, x.raw_ptr(), // xDesc & x
tensor_desc.xy_desc.desc, dy.raw_ptr(), // dyDesc & dy
tensor_desc.xy_desc.desc, dx.raw_ptr(), // dxDesc & dx
tensor_desc.param_desc.desc, bn_scale.raw_ptr(), // bnScale
d_bn_scale.raw_ptr(), d_bn_bias.raw_ptr(), // dScale, dBias
m_param.epsilon, saved_batch_mean.raw_ptr(),
saved_batch_inv_variance.raw_ptr()));
#endif
}



+ 3
- 3
dnn/src/cuda/batched_matrix_mul/brute_force.cpp View File

@@ -80,9 +80,9 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec(const ExecArgs& args) con
rep(n, N) {
TensorND A_, B_, C_;
auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) {
out.raw_ptr = static_cast<void*>(
static_cast<dt_byte*>(in.raw_ptr) +
n * in.layout.stride[0] * in.layout.dtype.size());
out.reset_ptr(static_cast<void*>(
static_cast<dt_byte*>(in.raw_ptr()) +
n * in.layout.stride[0] * in.layout.dtype.size()));
out.layout = in.layout.remove_axis(0);
};
tensor_n_from_batch(args.tensor_a, A_);


+ 3
- 3
dnn/src/cuda/batched_matrix_mul/cublas.cpp View File

@@ -76,13 +76,13 @@ void BatchedMatrixMulForwardImpl::AlgoCublas::exec(const ExecArgs& args) const {
static_cast<void*>(workspace.raw_ptr + 2 * batch * sizeof(uintptr_t)));

arange<uintptr_t>(
As, reinterpret_cast<uintptr_t>(args.tensor_a.raw_ptr),
As, reinterpret_cast<uintptr_t>(args.tensor_a.raw_ptr()),
args.layout_a.stride[0] * dtype.size(), batch, stream);
arange<uintptr_t>(
Bs, reinterpret_cast<uintptr_t>(args.tensor_b.raw_ptr),
Bs, reinterpret_cast<uintptr_t>(args.tensor_b.raw_ptr()),
args.layout_b.stride[0] * dtype.size(), batch, stream);
arange<uintptr_t>(
Cs, reinterpret_cast<uintptr_t>(args.tensor_c.raw_ptr),
Cs, reinterpret_cast<uintptr_t>(args.tensor_c.raw_ptr()),
args.layout_c.stride[0] * dtype.size(), batch, stream);

auto io32_c32 = [&]() {


+ 8
- 8
dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp View File

@@ -62,10 +62,10 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const
"workspace bundle size should be 1(ws_algo)");
cublas_check(cublasLtMatmul(
cublasLt_handle, desc.matmul_desc, one_half,
static_cast<const __half*>(args.tensor_b.raw_ptr), desc.layout_b,
static_cast<const __half*>(args.tensor_a.raw_ptr), desc.layout_a,
zero_half, static_cast<const __half*>(args.tensor_c.raw_ptr),
desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr),
static_cast<const __half*>(args.tensor_b.raw_ptr()), desc.layout_b,
static_cast<const __half*>(args.tensor_a.raw_ptr()), desc.layout_a,
zero_half, static_cast<const __half*>(args.tensor_c.raw_ptr()),
desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr()),
desc.layout_c, &algo, ws_bundle.get(0), ws_bundle.get_size(0), stream));
};
auto batched_sgemm = [&]() {
@@ -77,7 +77,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const
auto dev_a = (desc.dt_a == CUDA_R_16F)
? static_cast<void*>(args.tensor_a.ptr<dt_float16>())
: static_cast<void*>(args.tensor_a.ptr<dt_float32>());
auto dev_c = static_cast<void*>(args.tensor_c.raw_ptr);
auto dev_c = static_cast<void*>(args.tensor_c.raw_ptr());
megdnn_assert(
ws_bundle.nr_workspace() == 1,
"workspace bundle size should be 1(ws_algo)");
@@ -104,14 +104,14 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const
transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &pm,
sizeof(pm)));
cublas_check(cublasLtMatrixTransform(
cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr,
cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr(),
desc.layout_b, zero, nullptr, nullptr, ws_b, desc.layout_trans_b,
stream));
cublas_check(cublasLtMatrixTransformDescSetAttribute(
transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_a,
sizeof(trans_a)));
cublas_check(cublasLtMatrixTransform(
cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr,
cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr(),
desc.layout_a, zero, nullptr, nullptr, ws_a, desc.layout_trans_a,
stream));
cublas_check(cublasLtMatmul(
@@ -124,7 +124,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const
sizeof(trans_c)));
cublas_check(cublasLtMatrixTransform(
cublasLt_handle, transform_desc, one, ws_c, desc.layout_trans_c, zero,
nullptr, nullptr, args.tensor_c.raw_ptr, desc.layout_c, stream));
nullptr, nullptr, args.tensor_c.raw_ptr(), desc.layout_c, stream));
cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc));
};



+ 4
- 2
dnn/src/cuda/check_non_finite/kern.cu View File

@@ -8,7 +8,7 @@
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/common/reduce_helper.h"
#include "src/common/reduce_helper_device.h"

#include "megdnn/dtype.h"
#include "src/cuda/reduce_helper.cuh"
@@ -18,7 +18,9 @@ namespace cuda {

#define COMMA ,

INST_REDUCE(reduce::CheckNonFiniteOp<dt_float32 COMMA dt_int32 COMMA dt_int32>, false);
INST_REDUCE(
device_reduce::CheckNonFiniteOp<dt_float32 COMMA dt_int32 COMMA dt_int32>,
false);

#undef COMMA
} // namespace cuda


+ 2
- 2
dnn/src/cuda/check_non_finite/opr_impl.cpp View File

@@ -15,12 +15,12 @@
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"

#include "src/common/reduce_helper.h"
#include "src/common/reduce_helper_device.h"

namespace megdnn {
namespace cuda {

using reduce::CheckNonFiniteOp;
using device_reduce::CheckNonFiniteOp;

size_t CheckNonFiniteImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {


+ 2
- 2
dnn/src/cuda/checksum/opr_impl.cpp View File

@@ -45,7 +45,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec(
check_exec(data.layout, workspace.size);
auto stream = cuda_stream(handle());

auto ptr = static_cast<uint8_t*>(data.raw_ptr);
auto ptr = static_cast<uint8_t*>(data.raw_ptr());
size_t size_all = data.layout.shape[0], size_ints = size_all / sizeof(uint32_t);
auto last_val_size = std::min<size_t>(size_all, 4);
cuda_check(cudaMemcpyAsync(
@@ -54,7 +54,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec(
if (size_ints) {
checksum::calc(
static_cast<uint32_t*>(wbundle.get(1)),
static_cast<uint32_t*>(data.raw_ptr),
static_cast<uint32_t*>(data.raw_ptr()),
static_cast<uint32_t*>(wbundle.get(0)), size_ints, stream);
cuda_check(cudaMemcpyAsync(
&result.checksum, wbundle.get(1), sizeof(result.checksum),


+ 5
- 5
dnn/src/cuda/conv_bias/batched_matmul.cpp View File

@@ -135,9 +135,9 @@ size_t ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const {
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(1);
conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -150,9 +150,9 @@ void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const {
{
auto config = prepare_sub_opr(args);

TensorND A{args.filter_tensor->raw_ptr, config.first[0]},
B{args.src_tensor->raw_ptr, config.first[1]},
C{args.dst_tensor->raw_ptr, config.first[2]};
TensorND A{args.filter_tensor->raw_ptr(), config.first[0]},
B{args.src_tensor->raw_ptr(), config.first[1]},
C{args.dst_tensor->raw_ptr(), config.first[2]};
config.second->exec(A, B, C, bundle.get_workspace(0));
}
handle_bias_and_nonlinear(


+ 5
- 5
dnn/src/cuda/conv_bias/chanwise.cpp View File

@@ -52,9 +52,9 @@ size_t ConvBiasForwardImpl::AlgoChanwise::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const {
WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}};
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(0);
conv_dst_tensor = TensorND{bundle.get(0), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -74,9 +74,9 @@ void ConvBiasForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const {
#if CUDA_VERSION >= 9000
if (is_compute_capability_required(5, 3)) {
chanwise::run_fwd(
static_cast<half*>(conv_dst_tensor.raw_ptr),
static_cast<half*>(args.src_tensor->raw_ptr),
static_cast<half*>(args.filter_tensor->raw_ptr), kparam,
static_cast<half*>(conv_dst_tensor.raw_ptr()),
static_cast<half*>(args.src_tensor->raw_ptr()),
static_cast<half*>(args.filter_tensor->raw_ptr()), kparam,
stream);
} else {
chanwise::run_fwd(


+ 2
- 2
dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp View File

@@ -50,9 +50,9 @@ size_t ConvBiasForwardImpl::AlgoChanwise8x8x32::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoChanwise8x8x32::exec(const ExecArgs& args) const {
WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}};
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(0);
conv_dst_tensor = TensorND{bundle.get(0), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,


+ 5
- 5
dnn/src/cuda/conv_bias/chanwise_small.cpp View File

@@ -65,9 +65,9 @@ size_t ConvBiasForwardImpl::AlgoChanwiseSmall::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoChanwiseSmall::exec(const ExecArgs& args) const {
WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}};
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(0);
conv_dst_tensor = TensorND{bundle.get(0), conv_dst_tensor.layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -85,9 +85,9 @@ void ConvBiasForwardImpl::AlgoChanwiseSmall::exec(const ExecArgs& args) const {
#if CUDA_VERSION >= 9000
case DTypeEnum::Float16:
chanwise::run_fwd_small(
static_cast<half*>(conv_dst_tensor.raw_ptr),
static_cast<half*>(args.src_tensor->raw_ptr),
static_cast<half*>(args.filter_tensor->raw_ptr), kparam,
static_cast<half*>(conv_dst_tensor.raw_ptr()),
static_cast<half*>(args.src_tensor->raw_ptr()),
static_cast<half*>(args.filter_tensor->raw_ptr()), kparam,
stream);
break;
#endif


+ 5
- 5
dnn/src/cuda/conv_bias/cudnn_conv.cpp View File

@@ -100,9 +100,9 @@ size_t ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const {
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(1);
conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -120,10 +120,10 @@ void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const {
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionForward(
conv_args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
conv_args.src_tensor->raw_ptr, D.filter_desc.desc,
conv_args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, m_cudnn_enum,
conv_args.src_tensor->raw_ptr(), D.filter_desc.desc,
conv_args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
conv_workspace.raw_ptr, conv_workspace.size, &beta, D.dst_desc.desc,
conv_args.dst_tensor->raw_ptr);
conv_args.dst_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s",
cudnnGetErrorString(status), conv_args.to_string().c_str());


+ 10
- 10
dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp View File

@@ -231,7 +231,7 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(

auto workspace_ptr = args.workspace.raw_ptr;
auto workspace_size = args.workspace.size;
auto bias_ptr = args.bias_tensor->raw_ptr;
auto bias_ptr = args.bias_tensor->raw_ptr();
if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() &&
args.src_layout->dtype.category() != DTypeCategory::FLOAT) {
auto cvt = args.handle->create_operator<TypeCvt>();
@@ -242,7 +242,7 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(
auto bias_size_in_bytes = float_bias_layout.span().dist_byte();
megdnn_assert(args.workspace.size >= bias_size_in_bytes);
cvt->exec(
{args.bias_tensor->raw_ptr, converted_bias_layout},
{args.bias_tensor->raw_ptr(), converted_bias_layout},
TensorND{workspace_ptr, float_bias_layout});

bias_ptr = workspace_ptr;
@@ -254,19 +254,19 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(
if (args.z_layout->ndim == 0) {
status = cudnnConvolutionBiasActivationForward(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, m_cudnn_enum,
args.src_tensor->raw_ptr(), D.filter_desc.desc,
args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
workspace_ptr, workspace_size, &beta, D.dst_desc.desc,
args.dst_tensor->raw_ptr, D.bias_desc.desc, bias_ptr,
D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr);
args.dst_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr,
D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr());
} else {
status = cudnnConvolutionBiasActivationForward(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, m_cudnn_enum,
args.src_tensor->raw_ptr(), D.filter_desc.desc,
args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
workspace_ptr, workspace_size, &beta, D.z_desc.desc,
args.z_tensor->raw_ptr, D.bias_desc.desc, bias_ptr,
D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr);
args.z_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr,
D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr());
}

megdnn_assert(


+ 11
- 10
dnn/src/cuda/conv_bias/group_conv.cpp View File

@@ -142,9 +142,10 @@ size_t ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) const {
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1);
conv_dst_tensor = TensorND{
bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -156,11 +157,11 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) const
sub_args.dst_layout = &conv_dst_tensor.layout;

auto config = prepare_sub_opr(sub_args);
TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]};
TensorND tfilter{args.filter_tensor->raw_ptr, config.first[1]};
TensorND tbias{args.bias_tensor->raw_ptr, config.first[2]};
TensorND tz{args.z_tensor->raw_ptr, config.first[3]};
TensorND tdst{conv_dst_tensor.raw_ptr, config.first[4]};
TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]};
TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[1]};
TensorND tbias{args.bias_tensor->raw_ptr(), config.first[2]};
TensorND tz{args.z_tensor->raw_ptr(), config.first[3]};
TensorND tdst{conv_dst_tensor.raw_ptr(), config.first[4]};

size_t c_pos;
if (args.filter_meta.format == Param::Format::NCHW ||
@@ -187,9 +188,9 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) const
for (uint32_t g = 0; g < grp; ++g) {
config.second->exec(
tsrc, tfilter, tbias, tz, tdst, nullptr, bundle.get_workspace(0));
incr_voidp(tsrc.raw_ptr, strd_src);
incr_voidp(tdst.raw_ptr, strd_dst);
incr_voidp(tfilter.raw_ptr, strd_flt);
incr_refp(tsrc.get_ref_ptr(), strd_src);
incr_refp(tdst.get_ref_ptr(), strd_dst);
incr_refp(tfilter.get_ref_ptr(), strd_flt);
}
}
handle_bias_and_nonlinear(


+ 3
- 3
dnn/src/cuda/conv_bias/helper.cpp View File

@@ -189,19 +189,19 @@ SmallVector<size_t> matmul_get_workspace_bundle(const BiasForwardSizeArgs& args)
}

void flip_filter(
const BiasForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) {
const BiasForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) {
auto&& fm = args.filter_meta;
megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2);
auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1];
auto dtype = fm.dtype;
megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW);

TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}},
TensorND src{{{OC, IC, FH, FW}, dtype}, ref_ptr},
dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout};
dst.layout.stride[2] = -dst.layout.stride[2];
dst.layout.stride[3] = -dst.layout.stride[3];
args.handle->relayout_opr()->exec(src, dst);
raw_ptr = workspace.raw_ptr;
ref_ptr.reset(workspace.raw_ptr);
}

} // namespace conv_bias
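
flip_filter now receives the filter's RefPtr by reference and, after relayouting into the workspace, rebinds it with reset(); every TensorND sharing that handle then sees the workspace address. A minimal model of the rebinding (mock types, not the megdnn API; the relayout step is elided):

#include <cstdio>
#include <memory>

struct RefPtr {  // illustrative stand-in
    std::shared_ptr<void*> slot = std::make_shared<void*>(nullptr);
    void reset(void* p) { *slot = p; }
    void* get_ptr() const { return *slot; }
};

// After relayouting the filter into `workspace`, point the caller's handle at it.
void flip_filter_like(RefPtr& filter, void* workspace) {
    // ... the relayout from filter.get_ptr() into workspace is elided here ...
    filter.reset(workspace);
}

int main() {
    char original[16] = {}, workspace[16] = {};
    RefPtr filter;
    filter.reset(original);
    RefPtr alias = filter;               // e.g. the TensorND holding the filter
    flip_filter_like(filter, workspace);
    std::printf("%d\n", alias.get_ptr() == workspace);  // 1: the rebind is shared
    return 0;
}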


+ 1
- 1
dnn/src/cuda/conv_bias/helper.h View File

@@ -58,7 +58,7 @@ SmallVector<size_t> matmul_get_workspace_bundle(const BiasForwardSizeArgs& args)
* change \p raw_ptr to workspace.
*/
void flip_filter(
const BiasForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr);
const BiasForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr);

struct CUDNNForwardDescs {
TensorDesc src_desc, dst_desc, bias_desc, z_desc;


+ 3
- 3
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp View File

@@ -39,7 +39,7 @@ SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGem
void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
megdnn_assert(args.preprocessed_filter->tensors.size() == 1);
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
reorder_filter(args, filter_ptr);
}

@@ -48,12 +48,12 @@ std::tuple<void*, void*> ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm
void* filter_ptr = nullptr;
if (args.preprocessed_filter) {
megdnn_assert(args.preprocessed_filter->tensors.size() == 1);
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
reorder_filter(args, filter_ptr);
}
void* bias_ptr = args.bias_tensor->raw_ptr;
void* bias_ptr = args.bias_tensor->raw_ptr();
return {filter_ptr, bias_ptr};
}



+ 3
- 3
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp View File

@@ -39,7 +39,7 @@ SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm:
void ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
megdnn_assert(args.preprocessed_filter->tensors.size() == 1);
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}

@@ -48,12 +48,12 @@ std::tuple<void*, void*> ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::
void* filter_ptr = nullptr;
if (args.preprocessed_filter) {
megdnn_assert(args.preprocessed_filter->tensors.size() == 1);
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}
void* bias_ptr = args.bias_tensor->raw_ptr;
void* bias_ptr = args.bias_tensor->raw_ptr();
return {filter_ptr, bias_ptr};
}



+ 4
- 4
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp View File

@@ -103,7 +103,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(

std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args);
if (args.z_layout->ndim > 0)
z_ptr = args.z_tensor->raw_ptr;
z_ptr = args.z_tensor->raw_ptr();

// \note these constants of cutlass epilogue will be passed to method
// `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
@@ -131,8 +131,8 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr,
args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
op, args.src_tensor->raw_ptr(), filter_ptr, bias_ptr, z_ptr,
args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold,
&dst_scale, stream, &src_zero);

@@ -159,7 +159,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::reorder_filter(
// filter: KCRS64 => CRSK64 and reorder oc
cutlass_wrapper::reorder_ncxhwx_imma_filter<4, 64>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh, fw,
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr()), co, ci, fh, fw,
true, stream);
}
#endif


+ 4
- 4
dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp View File

@@ -115,7 +115,7 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec(

std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args);
if (args.z_layout->ndim > 0)
z_ptr = args.z_tensor->raw_ptr;
z_ptr = args.z_tensor->raw_ptr();

// \note these constants of cutlass epilogue will be passed to method
// `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
@@ -151,8 +151,8 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec(
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr,
args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
op, args.src_tensor->raw_ptr(), filter_ptr, bias_ptr, z_ptr,
args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold,
&dst_scale, stream, &src_zero);

@@ -188,7 +188,7 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter(

cutlass_wrapper::reorder_nhwc_imma_filter<4>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh, fw,
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr()), co, ci, fh, fw,
trans_oc, alignbits, oc_iterleaved, stream);
}
#endif


+ 3
- 6
dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp View File

@@ -158,18 +158,15 @@ void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::exec(
UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), param);
// reorder filter
{
TensorLayout in = *(args.filter_layout);
TensorLayout out = {{ci / 16, 4, fh, fw, co, 4}, in.dtype};
TensorLayout out = {
{ci / 16, 4, fh, fw, co, 4}, args.filter_tensor->layout.dtype};
out.stride[0] = 16 * co * fh * fw;
out.stride[1] = 4;
out.stride[2] = fw * co * 16;
out.stride[3] = co * 16;
out.stride[4] = 16;
out.stride[5] = 1;
TensorND ts_in, ts_out;
ts_in.layout = in, ts_out.layout = out;
ts_in.raw_ptr = args.filter_tensor->raw_ptr,
ts_out.raw_ptr = args.workspace.raw_ptr;
TensorND ts_in = *args.filter_tensor, ts_out{args.workspace.raw_ptr, out};
args.opr->handle()->create_operator<RelayoutForward>()->exec(ts_in, ts_out);
}



+ 3
- 6
dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp View File

@@ -160,18 +160,15 @@ void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::exec(
UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), param);
// reorder filter
{
TensorLayout in = *(args.filter_layout);
TensorLayout out = {{ci / 16, 4, fh, fw, co, 4}, in.dtype};
TensorLayout out = {
{ci / 16, 4, fh, fw, co, 4}, args.filter_tensor->layout.dtype};
out.stride[0] = 16 * co * fh * fw;
out.stride[1] = 4;
out.stride[2] = fw * co * 16;
out.stride[3] = co * 16;
out.stride[4] = 16;
out.stride[5] = 1;
TensorND ts_in, ts_out;
ts_in.layout = in, ts_out.layout = out;
ts_in.raw_ptr = args.filter_tensor->raw_ptr,
ts_out.raw_ptr = args.workspace.raw_ptr;
TensorND ts_in = *args.filter_tensor, ts_out{args.workspace.raw_ptr, out};
args.opr->handle()->create_operator<RelayoutForward>()->exec(ts_in, ts_out);
}



+ 9
- 9
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp View File

@@ -125,11 +125,11 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
// filter: KCRS32 => CRSK32 and reorder oc
cutlass_wrapper::reorder_ncxhwx_imma_filter<8, 32>(
filter_ptr, reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co,
ci, fh, fw, trans_oc, stream);
filter_ptr, reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr()),
co, ci, fh, fw, trans_oc, stream);
} else {
filter_ptr =
reinterpret_cast<int8_t*>(args.preprocessed_filter->tensors[0].raw_ptr);
filter_ptr = reinterpret_cast<int8_t*>(
args.preprocessed_filter->tensors[0].raw_ptr());
}

float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
@@ -157,9 +157,9 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,
z_dev_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
op, args.src_tensor->raw_ptr(), filter_ptr, args.bias_tensor->raw_ptr(),
z_dev_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw,
ho, wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, &dst_scale, stream);

after_kernel_launch();
@@ -204,8 +204,8 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess(
cudaStream_t stream = cuda_stream(args.opr->handle());
// filter: KCRS32 => CRSK32 and reorder oc
cutlass_wrapper::reorder_ncxhwx_imma_filter<8, 32>(
reinterpret_cast<int8_t*>(args.preprocessed_filter->tensors[0].raw_ptr),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh, fw,
reinterpret_cast<int8_t*>(args.preprocessed_filter->tensors[0].raw_ptr()),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr()), co, ci, fh, fw,
trans_oc, stream);
}
#endif


+ 11
- 17
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -155,16 +155,13 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = args.workspace.raw_ptr;
ts_dst.layout = dst;
TensorND ts_src{args.filter_tensor->raw_ptr(), src},
ts_dst{args.workspace.raw_ptr, dst};
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
} else {
filter_ptr =
reinterpret_cast<int8_t*>(args.preprocessed_filter->tensors[0].raw_ptr);
filter_ptr = reinterpret_cast<int8_t*>(
args.preprocessed_filter->tensors[0].raw_ptr());
}

float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
@@ -190,7 +187,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
float delta = 0.f;
void* z_ptr = nullptr;
if (args.z_layout->ndim > 0) {
z_ptr = args.z_tensor->raw_ptr;
z_ptr = args.z_tensor->raw_ptr();
gamma = 1.f;
if (args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert(
@@ -213,10 +210,10 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, z_ptr,
args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold,
&dst_scale, stream);
op, args.src_tensor->raw_ptr(), filter_ptr, args.bias_tensor->raw_ptr(),
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, &dst_scale, stream);

after_kernel_launch();
}
@@ -261,11 +258,8 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess(
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
ts_dst.layout = dst;
TensorND ts_src{args.filter_tensor->raw_ptr(), src},
ts_dst{args.preprocessed_filter->tensors[0].raw_ptr(), dst};
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
}


+ 4
- 20
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp

@@ -96,11 +96,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec(
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.src_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = ws_src;
ts_dst.layout = dst;
TensorND ts_src{args.src_tensor->raw_ptr(), src}, ts_dst{ws_src, dst};
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
}
@@ -111,11 +107,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec(
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = ws_filter;
ts_dst.layout = dst;
TensorND ts_src{args.filter_tensor->raw_ptr(), src}, ts_dst{ws_filter, dst};
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
}
@@ -142,11 +134,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec(
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.z_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = ws_z;
ts_dst.layout = dst;
TensorND ts_src{args.z_tensor->raw_ptr(), src}, ts_dst{ws_z, dst};
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
z_dev_ptr = reinterpret_cast<int8_t*>(ws_z);
@@ -168,11 +156,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec(
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = ws_dst;
ts_src.layout = src;
ts_dst.raw_ptr = args.dst_tensor->raw_ptr;
ts_dst.layout = dst;
TensorND ts_src{ws_dst, src}, ts_dst{args.dst_tensor->raw_ptr(), dst};
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
}


+ 7
- 7
dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp

@@ -114,7 +114,7 @@ SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}

@@ -189,15 +189,15 @@ void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec(
void* z_ptr = nullptr;

if (args.preprocessed_filter) {
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
reorder_filter(args, m_algo_param.access_size, filter_ptr);
}
bias_ptr = args.bias_tensor->raw_ptr;
bias_ptr = args.bias_tensor->raw_ptr();

if (args.z_layout->ndim > 0)
z_ptr = args.z_tensor->raw_ptr;
z_ptr = args.z_tensor->raw_ptr();

// \note these constants of cutlass epilogue will be passed to method
// `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
@@ -233,8 +233,8 @@ void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec(
use_conv_filter_unity_opt, without_shared_load);

execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr,
args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
op, args.src_tensor->raw_ptr(), filter_ptr, bias_ptr, z_ptr,
args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph,
pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold,
&dst_scale, stream);

@@ -272,7 +272,7 @@ void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter(

cutlass_wrapper::reorder_nhwc_imma_filter<8>(
reinterpret_cast<int8_t*>(reordered_filter),
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh, fw,
reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr()), co, ci, fh, fw,
trans_oc, alignbits, oc_iterleaved, stream);
}
#endif


+ 5
- 5
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp

@@ -52,8 +52,8 @@ SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGe
void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr();
void* reduce_filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
void* reduce_workspace = reinterpret_cast<void*>(
args.workspace.raw_ptr + args.bias_layout->span().dist_byte());
@@ -67,8 +67,8 @@ std::tuple<void*, void*> ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGem
void* bias_ptr = nullptr;
if (args.preprocessed_filter) {
megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr();
return {filter_ptr, bias_ptr};
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
@@ -130,7 +130,7 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::update_bias(
int src_zero_point =
args.src_tensor->layout.dtype.param<dtype::Quantized4Asymm>().zero_point;
do_dispatch_reduce_filter_and_update_bias_4bit<true>(
reinterpret_cast<uint8_t*>(args.filter_tensor->raw_ptr),
reinterpret_cast<uint8_t*>(args.filter_tensor->raw_ptr()),
args.bias_tensor->compatible_ptr<int32_t>(), co, ci * fh * fw / 8,
reinterpret_cast<int32_t*>(updated_bias),
reinterpret_cast<int32_t*>(reduce_workspace), src_zero_point, stream);


+ 5
- 5
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp

@@ -52,8 +52,8 @@ SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm
void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr();
void* reduce_filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
void* reduce_workspace = reinterpret_cast<void*>(
args.workspace.raw_ptr + args.bias_layout->span().dist_byte());
@@ -67,8 +67,8 @@ std::tuple<void*, void*> ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm:
void* bias_ptr = nullptr;
if (args.preprocessed_filter) {
megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr();
return {filter_ptr, bias_ptr};
} else {
filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
@@ -146,7 +146,7 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::update_bias(
int src_zero_point =
args.src_tensor->layout.dtype.param<dtype::Quantized4Asymm>().zero_point;
do_dispatch_reduce_filter_and_update_bias_4bit<true>(
reinterpret_cast<uint8_t*>(args.filter_tensor->raw_ptr),
reinterpret_cast<uint8_t*>(args.filter_tensor->raw_ptr()),
args.bias_tensor->compatible_ptr<int32_t>(), co, ci * fh * fw / 8,
reinterpret_cast<int32_t*>(updated_bias),
reinterpret_cast<int32_t*>(reduce_workspace), src_zero_point, stream);


+ 2
- 2
dnn/src/cuda/conv_bias/inplace_matmul.cpp

@@ -40,9 +40,9 @@ size_t ConvBiasForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoInplaceMatmul::exec(const ExecArgs& args) const {
WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}};
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(0);
conv_dst_tensor = TensorND{bundle.get(0), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
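
The `conv_dst_tensor` hunks in `inplace_matmul.cpp` and `matmul.cpp` show the one place where the rename forces a slightly different idiom: since `raw_ptr` can no longer be assigned, the temporary that used to be retargeted field-by-field is rebuilt from the workspace pointer and the destination layout. A condensed before/after, using the stand-in types sketched earlier (not the exact operator code):

    // Before: copy the dst tensor, then overwrite its pointer member.
    //     auto conv_dst_tensor = *args.dst_tensor;
    //     conv_dst_tensor.raw_ptr = bundle.get(0);     // member write, now gone
    //
    // After: construct a fresh TensorND over the workspace with the same layout.
    TensorND make_conv_dst(void* workspace_ptr, const TensorND& dst) {
        TensorND conv_dst{workspace_ptr, dst.layout};
        // (the real code then clears conv_dst.layout.dtype and re-runs
        //  check_or_deduce_dtype_fwd(); elided here)
        return conv_dst;
    }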


+ 4
- 3
dnn/src/cuda/conv_bias/matmul.cpp

@@ -115,9 +115,10 @@ size_t ConvBiasForwardImpl::AlgoMatmul::get_workspace_in_bytes(

void ConvBiasForwardImpl::AlgoMatmul::exec(const ExecArgs& args) const {
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1);
conv_dst_tensor = TensorND{
bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -168,7 +169,7 @@ void ConvBiasForwardImpl::AlgoMatmul::exec_internal(
C(dst_t, config.first[2]);
size_t matmul_ws_idx = 2;
if (fm.should_flip) {
conv_bias::flip_filter(args, bundle.get_workspace(2), A.raw_ptr);
conv_bias::flip_filter(args, bundle.get_workspace(2), A.get_ref_ptr());
matmul_ws_idx = 3;
}



+ 13
- 13
dnn/src/cuda/conv_bias/matmul_8x8x32.cpp

@@ -128,12 +128,10 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args)
auto bundle = get_bundle<format>(args);
bundle.set(args.workspace.raw_ptr);

TensorND src_tensor, dst_tensor, filter_tensor;
if (format == Param::Format::NHWC) {
src_tensor = *args.src_tensor;
dst_tensor = *args.dst_tensor;
filter_tensor = *args.filter_tensor;
} else {
TensorND src_tensor = *args.src_tensor;
TensorND dst_tensor = *args.dst_tensor;
TensorND filter_tensor = *args.filter_tensor;
if (format == Param::Format::NCHW4) {
// NCHW4
auto to_nhwc = [](const TensorLayout& layout, void* raw_ptr) -> TensorND {
return {raw_ptr,
@@ -147,7 +145,7 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args)
auto N = src.layout[0], C = src.layout[1] * 4, H = src.layout[2],
W = src.layout[3];
args.handle->relayout_opr()->exec(
{src.raw_ptr,
{src.raw_ptr(),
TensorLayout{
{N, H, W, C / 4, 4},
{src.layout.stride[0], src.layout.stride[2],
@@ -156,8 +154,8 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args)
src.layout.dtype}},
{dst_ptr, TensorLayout{{N, H, W, C / 4, 4}, src.layout.dtype}});
};
relayout(*args.src_tensor, src_tensor.raw_ptr);
relayout(*args.filter_tensor, filter_tensor.raw_ptr);
relayout(*args.src_tensor, src_tensor.raw_ptr());
relayout(*args.filter_tensor, filter_tensor.raw_ptr());
}

size_t N, IH, IW, IC;
@@ -193,7 +191,7 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args)
// copy (OC, FH*FW*IC) to (OC, FH*FW*IC) with stride=LD
inp1 = static_cast<int8_t*>(bundle.get(1));
cuda_check(cudaMemcpy2DAsync(
inp1, LD * sizeof(int8_t), filter_tensor.raw_ptr,
inp1, LD * sizeof(int8_t), filter_tensor.raw_ptr(),
FH * FW * IC * sizeof(int8_t), FH * FW * IC * sizeof(int8_t), OC,
cudaMemcpyDeviceToDevice, stream));
inp1_stride = LD;
@@ -222,12 +220,13 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args)

void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec(const ExecArgs& args) const {
ExecArgs conv_args = args;
auto conv_dst_tensor = *args.dst_tensor;
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.filter_meta.format == Param::Format::NHWC) {
auto bundle = get_bundle<Param::Format::NHWC>(args);
bundle.set(args.workspace.raw_ptr);
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1);
conv_dst_tensor = TensorND{
bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
@@ -239,7 +238,8 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec(const ExecArgs& args) const {
auto bundle = get_bundle<Param::Format::NCHW4>(args);
bundle.set(args.workspace.raw_ptr);
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1);
conv_dst_tensor = TensorND{
bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,


+ 8
- 8
dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp

@@ -131,26 +131,26 @@ void ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::exec(const ExecArgs& args) const
auto&& stream = cuda_stream(handle);
// zp filter
do_dispatch_reduce_with_scale_filter_4bit<false>(
static_cast<uint8_t*>(args.filter_tensor->raw_ptr), -zp_data, OC,
static_cast<uint8_t*>(args.filter_tensor->raw_ptr()), -zp_data, OC,
FH * FW * IC / 8, ws_zp_filter.ptr<int32_t>(), stream);
// zp data
do_dispatch_reduce_with_scale_data_u4(
ws_zp_data.ptr<int32_t>(), static_cast<uint8_t*>(args.src_tensor->raw_ptr),
N, IH, IW, OH, OW, PH, PW, FH, FW, SH, SW, IC, -zp_filter,
static_cast<uint8_t>(zp_data), stream);
ws_zp_data.ptr<int32_t>(),
static_cast<uint8_t*>(args.src_tensor->raw_ptr()), N, IH, IW, OH, OW, PH,
PW, FH, FW, SH, SW, IC, -zp_filter, static_cast<uint8_t>(zp_data), stream);

// do conv
if (use_kernel_fhxfw(args)) {
wmma_conv_integer_subbyte::_do_wmma_conv_integer_subbyte_fhxfw(
static_cast<uint8_t*>(args.src_tensor->raw_ptr),
static_cast<uint8_t*>(args.filter_tensor->raw_ptr),
static_cast<uint8_t*>(args.src_tensor->raw_ptr()),
static_cast<uint8_t*>(args.filter_tensor->raw_ptr()),
args.dst_tensor->compatible_ptr<int32_t>(), N, IH, IW, OH, OW, PH, PW,
IC, OC, FH, FW, SH, SW, static_cast<uint8_t>(zp_data), stream);
} else {
auto&& ws_relayout_filter = ws_bundle.get_workspace(2);
wmma_conv_integer_subbyte::_do_wmma_conv_integer_subbyte_1xfw(
static_cast<uint8_t*>(args.src_tensor->raw_ptr),
static_cast<uint8_t*>(args.filter_tensor->raw_ptr),
static_cast<uint8_t*>(args.src_tensor->raw_ptr()),
static_cast<uint8_t*>(args.filter_tensor->raw_ptr()),
args.dst_tensor->compatible_ptr<int32_t>(),
ws_relayout_filter.ptr<uint8_t>(), N, IH, IW, OH, OW, PH, PW, IC, OC,
FH, FW, SH, SW, static_cast<uint8_t>(zp_data), stream);


+ 3
- 3
dnn/src/cuda/convolution/backward_data/chanwise.cpp

@@ -60,9 +60,9 @@ void ConvolutionBackwardDataImpl::AlgoChanwise::exec(const ExecArgs& args) const
#if CUDA_VERSION >= 9000
if (is_compute_capability_required(5, 3)) {
return chanwise::run_bwd_data(
static_cast<__half*>(args.grad_tensor->raw_ptr),
static_cast<__half*>(args.diff_tensor->raw_ptr),
static_cast<__half*>(args.filter_tensor->raw_ptr), kparam,
static_cast<__half*>(args.grad_tensor->raw_ptr()),
static_cast<__half*>(args.diff_tensor->raw_ptr()),
static_cast<__half*>(args.filter_tensor->raw_ptr()), kparam,
stream);
} else {
return chanwise::run_bwd_data(


+ 3
- 3
dnn/src/cuda/convolution/backward_data/chanwise_small.cpp

@@ -68,9 +68,9 @@ void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec(const ExecArgs& args)
#if CUDA_VERSION >= 9000
case DTypeEnum::Float16:
return chanwise::run_bwd_data_small(
static_cast<half*>(args.grad_tensor->raw_ptr),
static_cast<half*>(args.diff_tensor->raw_ptr),
static_cast<half*>(args.filter_tensor->raw_ptr), kparam, stream);
static_cast<half*>(args.grad_tensor->raw_ptr()),
static_cast<half*>(args.diff_tensor->raw_ptr()),
static_cast<half*>(args.filter_tensor->raw_ptr()), kparam, stream);
#endif
default:
break;


+ 4
- 3
dnn/src/cuda/convolution/backward_data/cudnn.cpp

@@ -71,9 +71,10 @@ void ConvolutionBackwardDataImpl::AlgoCUDNN::exec(const ExecArgs& args) const {
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionBackwardData(
args.handle->cudnn_handle(), &alpha, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr,
D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size,
&beta, D.grad_desc.desc, args.grad_tensor->raw_ptr);
args.filter_tensor->raw_ptr(), D.diff_desc.desc,
args.diff_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum,
args.workspace.raw_ptr, args.workspace.size, &beta, D.grad_desc.desc,
args.grad_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s",
cudnnGetErrorString(status), args.to_string().c_str());


+ 6
- 6
dnn/src/cuda/convolution/backward_data/group_conv.cpp

@@ -103,9 +103,9 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec(
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
{
auto config = prepare_sub_opr(args);
TensorND tfilter{args.filter_tensor->raw_ptr, config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]};
TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]};

size_t c_pos = 1;

@@ -121,9 +121,9 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec(
auto grp = args.filter_meta.group;
for (uint32_t g = 0; g < grp; ++g) {
config.second->exec(tfilter, tdiff, tgrad, bundle.get_workspace(0));
incr_voidp(tfilter.raw_ptr, strd_flt);
incr_voidp(tdiff.raw_ptr, strd_diff);
incr_voidp(tgrad.raw_ptr, strd_grad);
incr_refp(tfilter.get_ref_ptr(), strd_flt);
incr_refp(tdiff.get_ref_ptr(), strd_diff);
incr_refp(tgrad.get_ref_ptr(), strd_grad);
}
}
}
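
All of the grouped-convolution loops in this commit swap `incr_voidp`, which bumped a raw `void*`, for `incr_refp`, which applies the per-group byte offset through the tensor's `RefPtr`. The helper itself lives in `dnn/src/common/utils.h` (see the file list at the top of this diff); as a sketch of what a compatible implementation could look like, consistent with these call sites but not the actual definition:

    #include <cstddef>
    #include <cstdint>

    // Sketch: advance the address held by a RefPtr by `delta` bytes, so every
    // TensorND sharing that RefPtr reads the next group's slice on the next exec().
    inline void incr_refp(RefPtr& ref, std::ptrdiff_t delta) {
        auto addr = reinterpret_cast<std::uintptr_t>(ref.get_ptr());
        ref.reset(reinterpret_cast<void*>(addr + delta));
    }

Usage is exactly the loop shape above: run the per-group sub-operator, then advance the filter, diff and grad tensors by their per-group strides via `tensor.get_ref_ptr()`.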


+ 4
- 3
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp

@@ -140,7 +140,8 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(

auto&& relayout = args.opr->handle()->create_operator<RelayoutForward>();
relayout->exec(
{args.filter_tensor->raw_ptr, exec_src}, {inner_filter_ptr, exec_dst});
{args.filter_tensor->raw_ptr(), exec_src},
{inner_filter_ptr, exec_dst});
}
{
inner_diff_ptr = reinterpret_cast<int8_t*>(bundle.get(1));
@@ -152,7 +153,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(

auto&& relayout = args.opr->handle()->create_operator<RelayoutForward>();
relayout->exec(
{args.diff_tensor->raw_ptr, exec_src}, {inner_diff_ptr, exec_dst});
{args.diff_tensor->raw_ptr(), exec_src}, {inner_diff_ptr, exec_dst});
}
int8_t* inner_grad_ptr = reinterpret_cast<int8_t*>(bundle.get(2));

@@ -196,7 +197,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(

auto&& relayout = args.opr->handle()->create_operator<RelayoutForward>();
relayout->exec(
{inner_grad_ptr, exec_src}, {args.grad_tensor->raw_ptr, exec_dst});
{inner_grad_ptr, exec_src}, {args.grad_tensor->raw_ptr(), exec_dst});
}
}
// vim: syntax=cpp.doxygen

+ 1
- 1
dnn/src/cuda/convolution/backward_data/matmul.cpp

@@ -143,7 +143,7 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(const ExecArgs& args
TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
if (fm.should_flip) {
convolution::flip_filter(
args.as_fwd_args(), wbundle.get_workspace(2), A.raw_ptr);
args.as_fwd_args(), wbundle.get_workspace(2), A.get_ref_ptr());
config.second->exec(A, C, B, wbundle.get_workspace(3));
} else {
config.second->exec(A, C, B, wbundle.get_workspace(2));


+ 3
- 3
dnn/src/cuda/convolution/backward_filter/chanwise.cpp

@@ -50,9 +50,9 @@ void ConvolutionBackwardFilterImpl::AlgoChanwise::exec(const ExecArgs& args) con
#if CUDA_VERSION >= 9000
if (is_compute_capability_required(5, 3)) {
return chanwise::run_bwd_filter(
static_cast<__half*>(args.grad_tensor->raw_ptr),
static_cast<__half*>(args.src_tensor->raw_ptr),
static_cast<__half*>(args.diff_tensor->raw_ptr), kparam,
static_cast<__half*>(args.grad_tensor->raw_ptr()),
static_cast<__half*>(args.src_tensor->raw_ptr()),
static_cast<__half*>(args.diff_tensor->raw_ptr()), kparam,
stream);
} else {
return chanwise::run_bwd_filter(


+ 2
- 2
dnn/src/cuda/convolution/backward_filter/cudnn.cpp

@@ -71,9 +71,9 @@ void ConvolutionBackwardFilterImpl::AlgoCUDNN::exec(const ExecArgs& args) const
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionBackwardFilter(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr,
args.src_tensor->raw_ptr(), D.diff_desc.desc, args.diff_tensor->raw_ptr(),
D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size,
&beta, D.grad_desc.desc, args.grad_tensor->raw_ptr);
&beta, D.grad_desc.desc, args.grad_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s",
cudnnGetErrorString(status), args.to_string().c_str());


+ 6
- 6
dnn/src/cuda/convolution/backward_filter/group_conv.cpp

@@ -101,9 +101,9 @@ void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec(

{
auto config = prepare_sub_opr(args);
TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]};
TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]};

size_t c_pos = 1;

@@ -118,9 +118,9 @@ void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec(
auto grp = fm.group;
for (uint32_t g = 0; g < grp; ++g) {
config.second->exec(tsrc, tdiff, tgrad, bundle.get_workspace(0));
incr_voidp(tsrc.raw_ptr, strd_src);
incr_voidp(tdiff.raw_ptr, strd_diff);
incr_voidp(tgrad.raw_ptr, strd_grad);
incr_refp(tsrc.get_ref_ptr(), strd_src);
incr_refp(tdiff.get_ref_ptr(), strd_diff);
incr_refp(tgrad.get_ref_ptr(), strd_grad);
}
}
}


+ 4
- 4
dnn/src/cuda/convolution/backward_filter/matmul.cpp

@@ -133,7 +133,7 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(const ExecArgs& ar
froml.stride[0] = args.diff_layout->stride[0];
tol.stride[0] = 1;
tol.stride[1] = N;
TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
TensorND from(args.diff_tensor->raw_ptr(), froml), to(diff_t, tol);
args.handle->relayout_opr()->exec(from, to);
}
{
@@ -149,13 +149,13 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(const ExecArgs& ar
Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
TensorND A(args.grad_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
if (fm.should_flip) {
A.raw_ptr = wbundle.get(2);
A.reset_ptr(wbundle.get(2));
config.second->exec(C, B, A, wbundle.get_workspace(3));
convolution::flip_filter(
args.as_fwd_args(),
{static_cast<dt_byte*>(args.grad_tensor->raw_ptr),
{static_cast<dt_byte*>(args.grad_tensor->raw_ptr()),
wbundle.get_size(2)},
A.raw_ptr);
A.get_ref_ptr());
} else {
config.second->exec(C, B, A, wbundle.get_workspace(2));
}


+ 3
- 3
dnn/src/cuda/convolution/helper.cpp

@@ -68,19 +68,19 @@ SmallVector<size_t> convolution::matmul_get_workspace_bundle(
}

void convolution::flip_filter(
const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) {
const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) {
auto&& fm = args.filter_meta;
megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2);
auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1];
auto dtype = fm.dtype;
megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW);

TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}},
TensorND src{{{OC, IC, FH, FW}, dtype}, ref_ptr},
dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout};
dst.layout.stride[2] = -dst.layout.stride[2];
dst.layout.stride[3] = -dst.layout.stride[3];
args.handle->relayout_opr()->exec(src, dst);
raw_ptr = workspace.raw_ptr;
ref_ptr.reset(workspace.raw_ptr);
}

// vim: syntax=cpp.doxygen
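
`flip_filter` is the one helper where the ref-ptr interface is load-bearing rather than cosmetic: it relayouts the filter into the workspace with reversed spatial strides and then retargets the caller's pointer, and with `RefPtr::reset` that retargeting is visible to every `TensorND` sharing the pointer (callers such as `AlgoMatmul::exec_internal` now pass `A.get_ref_ptr()`). A stripped-down sketch of the control flow, using the stand-in types from earlier (the relayout itself is elided):

    // Sketch only: the real helper builds a flipped-stride dst TensorND over the
    // workspace and runs the relayout operator; both steps are elided here.
    void flip_filter_sketch(RefPtr& filter_ref, void* workspace_ptr) {
        TensorND src{TensorLayout{}, filter_ref};  // read the filter through its RefPtr
        // ... relayout `src` into `workspace_ptr` with negated FH/FW strides ...
        (void)src;
        filter_ref.reset(workspace_ptr);  // caller's tensor now reads the flipped copy
    }

The old `void*& raw_ptr` version could only update one local pointer; the `RefPtr&` version updates the shared slot, which is why the group-conv and matmul algorithms above can keep using their `TensorND` objects unchanged after the flip.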

+ 1
- 1
dnn/src/cuda/convolution/helper.h

@@ -85,7 +85,7 @@ struct CUDNNBwdFilterDescs {
* change \p raw_ptr to workspace.
*/
void flip_filter(
const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr);
const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& raw_ptr);

} // namespace convolution
} // namespace cuda


+ 4
- 3
dnn/src/cuda/convolution3d/backward_data/cudnn.cpp

@@ -55,9 +55,10 @@ void Convolution3DBackwardDataImpl::AlgoCUDNN::exec(const ExecArgs& args) const
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionBackwardData(
args.handle->cudnn_handle(), &alpha, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr,
D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size,
&beta, D.grad_desc.desc, args.grad_tensor->raw_ptr);
args.filter_tensor->raw_ptr(), D.diff_desc.desc,
args.diff_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum,
args.workspace.raw_ptr, args.workspace.size, &beta, D.grad_desc.desc,
args.grad_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s",
cudnnGetErrorString(status), args.to_string().c_str());


+ 6
- 6
dnn/src/cuda/convolution3d/backward_data/group_conv.cpp

@@ -96,9 +96,9 @@ void Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::exec(
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
{
auto config = prepare_sub_opr(args);
TensorND tfilter{args.filter_tensor->raw_ptr, config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]};
TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]};

size_t c_pos = 1;
auto grp = args.filter_meta.group;
@@ -114,9 +114,9 @@ void Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::exec(

for (uint32_t g = 0; g < grp; ++g) {
config.second->exec(tfilter, tdiff, tgrad, bundle.get_workspace(0));
incr_voidp(tfilter.raw_ptr, strd_flt);
incr_voidp(tdiff.raw_ptr, strd_diff);
incr_voidp(tgrad.raw_ptr, strd_grad);
incr_refp(tfilter.get_ref_ptr(), strd_flt);
incr_refp(tdiff.get_ref_ptr(), strd_diff);
incr_refp(tgrad.get_ref_ptr(), strd_grad);
}
}
}


+ 2
- 2
dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp

@@ -56,9 +56,9 @@ void Convolution3DBackwardFilterImpl::AlgoCUDNN::exec(const ExecArgs& args) cons
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionBackwardFilter(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr,
args.src_tensor->raw_ptr(), D.diff_desc.desc, args.diff_tensor->raw_ptr(),
D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size,
&beta, D.grad_desc.desc, args.grad_tensor->raw_ptr);
&beta, D.grad_desc.desc, args.grad_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s",
cudnnGetErrorString(status), args.to_string().c_str());


+ 6
- 6
dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp

@@ -98,9 +98,9 @@ void Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::exec(
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
{
auto config = prepare_sub_opr(args);
TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]};
TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]};
TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]};
TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]};

size_t c_pos = 1;
auto grp = args.grad_filter_meta.group;
@@ -116,9 +116,9 @@ void Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::exec(

for (uint32_t g = 0; g < grp; ++g) {
config.second->exec(tsrc, tdiff, tgrad, bundle.get_workspace(0));
incr_voidp(tsrc.raw_ptr, strd_src);
incr_voidp(tdiff.raw_ptr, strd_diff);
incr_voidp(tgrad.raw_ptr, strd_grad);
incr_refp(tsrc.get_ref_ptr(), strd_src);
incr_refp(tdiff.get_ref_ptr(), strd_diff);
incr_refp(tgrad.get_ref_ptr(), strd_grad);
}
}
}


+ 5
- 5
dnn/src/cuda/convolution3d/forward/1x1x1.cpp

@@ -54,17 +54,17 @@ size_t Convolution3DForwardImpl::Algo1x1x1::get_workspace_in_bytes(
void Convolution3DForwardImpl::Algo1x1x1::exec(const ExecArgs& args) const {
TensorND A, B, C;
extract_matmul_layouts(args, A.layout, B.layout, C.layout);
A.raw_ptr = args.filter_tensor->raw_ptr;
B.raw_ptr = args.src_tensor->raw_ptr;
C.raw_ptr = args.dst_tensor->raw_ptr;
A.reset_ptr(args.filter_tensor->raw_ptr());
B.reset_ptr(args.src_tensor->raw_ptr());
C.reset_ptr(args.dst_tensor->raw_ptr());
size_t batch = args.src_layout->shape[0];
auto mm = args.handle->matmul_opr();
auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(),
strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size();
for (size_t i = 0; i < batch; ++i) {
mm->exec(A, B, C, args.workspace);
incr_voidp(B.raw_ptr, strd_B);
incr_voidp(C.raw_ptr, strd_C);
incr_refp(B.get_ref_ptr(), strd_B);
incr_refp(C.get_ref_ptr(), strd_C);
}
}
// vim: syntax=cpp.doxygen

+ 4
- 3
dnn/src/cuda/convolution3d/forward/cudnn.cpp

@@ -53,9 +53,10 @@ void Convolution3DForwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const {
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionForward(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr, D.filter_desc.desc, args.filter_tensor->raw_ptr,
D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size,
&beta, D.dst_desc.desc, args.dst_tensor->raw_ptr);
args.src_tensor->raw_ptr(), D.filter_desc.desc,
args.filter_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum,
args.workspace.raw_ptr, args.workspace.size, &beta, D.dst_desc.desc,
args.dst_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s",
cudnnGetErrorString(status), args.to_string().c_str());


+ 6
- 6
dnn/src/cuda/convolution3d/forward/group_conv.cpp

@@ -103,9 +103,9 @@ void Convolution3DForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args)
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
{
auto config = prepare_sub_opr(args);
TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]};
TensorND tfilter{args.filter_tensor->raw_ptr, config.first[1]};
TensorND tdst{args.dst_tensor->raw_ptr, config.first[2]};
TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]};
TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[1]};
TensorND tdst{args.dst_tensor->raw_ptr(), config.first[2]};

size_t c_pos;
if (args.filter_meta.format == Param::Format::NCDHW) {
@@ -127,9 +127,9 @@ void Convolution3DForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args)

for (uint32_t g = 0; g < grp; ++g) {
config.second->exec(tsrc, tfilter, tdst, bundle.get_workspace(0));
incr_voidp(tsrc.raw_ptr, strd_src);
incr_voidp(tdst.raw_ptr, strd_dst);
incr_voidp(tfilter.raw_ptr, strd_flt);
incr_refp(tsrc.get_ref_ptr(), strd_src);
incr_refp(tdst.get_ref_ptr(), strd_dst);
incr_refp(tfilter.get_ref_ptr(), strd_flt);
}
}
}


+ 3
- 3
dnn/src/cuda/convolution3d/helper.cpp

@@ -35,20 +35,20 @@ bool convolution3d::is_cudnn_supported(const ForwardSizeArgs& args) {
}

void convolution3d::flip_filter(
const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) {
const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) {
auto&& fm = args.filter_meta;
megdnn_assert(fm.group == 1 && fm.spatial_ndim == 3);
auto OC = fm.ocpg, IC = fm.icpg, FD = fm.spatial[0], FH = fm.spatial[1],
FW = fm.spatial[2];
auto dtype = DType::from_enum(fm.dtype_enum);
megdnn_assert(workspace.size >= dtype.size() * OC * IC * FD * FH * FW);
TensorND src{raw_ptr, {{OC, IC, FD, FH, FW}, dtype}},
TensorND src{{{OC, IC, FD, FH, FW}, dtype}, ref_ptr},
dst{workspace.raw_ptr + (FD * FH * FW - 1) * dtype.size(), src.layout};
dst.layout.stride[2] = -dst.layout.stride[2];
dst.layout.stride[3] = -dst.layout.stride[3];
dst.layout.stride[4] = -dst.layout.stride[4];
args.handle->relayout_opr()->exec(src, dst);
raw_ptr = workspace.raw_ptr;
ref_ptr.reset(workspace.raw_ptr);
}

// vim: syntax=cpp.doxygen

+ 1
- 1
dnn/src/cuda/convolution3d/helper.h

@@ -84,7 +84,7 @@ struct CUDNNBwdFilterDescs {
* change \p raw_ptr to workspace.
*/
void flip_filter(
const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr);
const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& raw_ptr);

inline bool cudnn_get_convolution_fwd_algo_helper(
cudnnHandle_t cudnn_handle, const cudnnTensorDescriptor_t x_desc,


+ 4
- 4
dnn/src/cuda/convpooling/opr_impl.cpp

@@ -169,10 +169,10 @@ void ConvPoolingForwardImpl::exec(
nonlineMode = IDENTITY;
}

float *src_ptr = static_cast<float*>(src.raw_ptr),
*filter_ptr = static_cast<float*>(filter.raw_ptr),
*bias_ptr = static_cast<float*>(bias.raw_ptr),
*dst_ptr = static_cast<float*>(dst.raw_ptr);
float *src_ptr = static_cast<float*>(src.raw_ptr()),
*filter_ptr = static_cast<float*>(filter.raw_ptr()),
*bias_ptr = static_cast<float*>(bias.raw_ptr()),
*dst_ptr = static_cast<float*>(dst.raw_ptr());

switch (this->param().method) {
case Param::Method::WITH_SHARED_MEM:


+ 1
- 1
dnn/src/cuda/cumsum/opr_impl.cpp

@@ -12,7 +12,7 @@
#include "./opr_impl.h"
#include "./kern.cuh"

#include "src/common/reduce_helper.h"
#include "src/common/reduce_helper_device.h"
#include "src/cuda/utils.h"

using namespace megdnn;


+ 1
- 1
dnn/src/cuda/dct/opr_impl.cpp

@@ -58,7 +58,7 @@ void DctChannelSelectForwardImpl::exec(
megdnn_assert(
param().format == Param::Format::NCHW4, "qint8 only support nchw4");
dct::call_kern_dct<dct_block, dct::DctLayoutFormat::NCHW4>(
src.ptr<uint8_t>(), (int8_t*)dst.raw_ptr, in, ic, ih, iw, oc,
src.ptr<uint8_t>(), (int8_t*)dst.raw_ptr(), in, ic, ih, iw, oc,
with_fix_32_mask, mask_offset_ptr, mask_val_ptr, stream, error_info,
m_error_tracker,
dst.layout.dtype.param<::megdnn::dtype::QuantizedS8>().scale);


+ 1
- 1
dnn/src/cuda/elemwise_helper.cpp

@@ -227,7 +227,7 @@ INST(dt_quint8);
template <int ndim>
void ParamElemVisitor4bitBase<ndim, BCAST_OTHER>::host_init(
const TensorND& rv, int /*grid_size*/, int /*block_size*/) {
m_ptr = reinterpret_cast<Storage*>(rv.raw_ptr);
m_ptr = reinterpret_cast<Storage*>(rv.raw_ptr());
ptrdiff_t min_stride = std::numeric_limits<ptrdiff_t>::max();
for (size_t i = 0; i < rv.layout.ndim; ++i) {
m_stride[i] = rv.layout.stride[i];


+ 38
- 38
dnn/src/cuda/elemwise_multi_type/opr_impl.cpp

@@ -21,31 +21,31 @@ using namespace megdnn;
using namespace cuda;

void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32(
const ElemwiseOpParamN<3>& param, dt_int32* dst) {
const ElemwiseOpParamN<3>& param, const TensorND& dst) {
BroadcastChannelInfo binfo0, binfo1;
if (is_vector(param[0].layout) &&
is_broadcasted_channel_like(param[1].layout, binfo0) &&
is_broadcasted_channel_like(param[2].layout, binfo1) && binfo0 == binfo1) {
elemwise_multi_type::fma3_int16x32x32x32_1c1(
param, dst, cuda_stream(this->handle()));
param, dst.ptr<dt_int32>(), cuda_stream(this->handle()));
return;
}
megdnn_throw("unsupported fma3 int16x32x32x32 layout");
}

void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8(
const ElemwiseOpParamN<3>& param, dt_int8* dst) {
const ElemwiseOpParamN<3>& param, const TensorND& dst) {
Broadcast1xInfo binfo0, binfo1;
auto p1 = param[1].ptr<float>(), p2 = param[2].ptr<float>();
auto stream = cuda_stream(this->handle());
if (is_vector(param[0].layout) && is_broadcasted_1x(param[1].layout, binfo0) &&
is_broadcasted_1x(param[2].layout, binfo1) && binfo0 == binfo1) {
switch (param[0].layout.dtype.enumv()) {
#define cb(t) \
case DTypeTrait<t>::enumv: \
elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( \
param[0].ptr<DTypeTrait<t>::ctype>(), p1, p2, dst, binfo0.x, binfo0.y, \
stream); \
#define cb(t) \
case DTypeTrait<t>::enumv: \
elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( \
param[0].ptr<DTypeTrait<t>::ctype>(), p1, p2, dst.ptr<dt_int8>(), \
binfo0.x, binfo0.y, stream); \
return;
MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb)
#undef cb
@@ -58,14 +58,14 @@ void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8(
}

void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8(
const ElemwiseOpParamN<2>& param, dt_int8* dst) {
const ElemwiseOpParamN<2>& param, const TensorND& dst) {
auto stream = cuda_stream(this->handle());
if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) {
switch (param[0].layout.dtype.enumv()) {
#define DISPATCH(t) \
case DTypeTrait<t>::enumv: \
elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \
DTypeTrait<t>::ctype, dt_int8>(param, dst, stream); \
#define DISPATCH(t) \
case DTypeTrait<t>::enumv: \
elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \
DTypeTrait<t>::ctype, dt_int8>(param, dst.ptr<dt_int8>(), stream); \
return;
DISPATCH(::megdnn::dtype::Int32)
DISPATCH(::megdnn::dtype::Int16)
@@ -85,7 +85,7 @@ void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8(
}

void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) {
const ElemwiseOpParamN<6>& param, const TensorND& dst) {
auto stream = cuda_stream(this->handle());
BroadcastChannelInfo info;
if (is_vector(param[0].layout) &&
@@ -95,7 +95,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
is_broadcasted_scalar(param[4].layout) &&
is_broadcasted_scalar(param[5].layout)) {
elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11<dt_int16>(
param, dst, stream);
param, dst.ptr<dt_int8>(), stream);
return;
}
megdnn_throw(
@@ -106,7 +106,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
}

void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) {
const ElemwiseOpParamN<6>& param, const TensorND& dst) {
auto stream = cuda_stream(this->handle());
BroadcastChannelInfo info;
if (is_vector(param[0].layout) &&
@@ -116,7 +116,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
is_broadcasted_scalar(param[4].layout) &&
is_broadcasted_scalar(param[5].layout)) {
elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11<dt_int32>(
param, dst, stream);
param, dst.ptr<dt_int8>(), stream);
return;
}
megdnn_throw(
@@ -127,14 +127,14 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
}

void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16(
const ElemwiseOpParamN<2>& param, dt_int16* dst) {
const ElemwiseOpParamN<2>& param, const TensorND& dst) {
auto stream = cuda_stream(this->handle());
if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) {
switch (param[0].layout.dtype.enumv()) {
#define DISPATCH(t) \
case DTypeTrait<t>::enumv: \
elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \
DTypeTrait<t>::ctype, dt_int16>(param, dst, stream); \
#define DISPATCH(t) \
case DTypeTrait<t>::enumv: \
elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \
DTypeTrait<t>::ctype, dt_int16>(param, dst.ptr<dt_int16>(), stream); \
return;
DISPATCH(::megdnn::dtype::Int32)
DISPATCH(::megdnn::dtype::Int16)
@@ -227,22 +227,22 @@ IMPL_MODE_DISPATCHER(2, dt_quint4, dt_qint32);

#undef _cb_dispatch_mode

#define _cb_dispatch_mode(_m) \
case param::Elemwise::Mode::_m: \
do { \
using KernImpl = ElemwiseKern< \
megcorePlatformCUDA, param_enumv::Elemwise::Mode::_m, float>; \
using Op = kern_ops_quantized::QuantizedMultiTypeOp< \
arity, src_ctype, dst_ctype, KernImpl>; \
using dst_storage = typename VectTypeTrait<dst_ctype>::Storage; \
dst_storage* dst = reinterpret_cast<dst_storage*>(dst_tensor.raw_ptr); \
Op op(src_params, dst, dst_param); \
ElemwiseOpParamN<1> param_dst; \
param_dst[0] = dst_tensor; \
param_dst.init_from_given_tensor(); \
run_elemwise<Op, src_ctype, dst_ctype, arity>( \
param, param_dst, stream, op); \
return; \
#define _cb_dispatch_mode(_m) \
case param::Elemwise::Mode::_m: \
do { \
using KernImpl = ElemwiseKern< \
megcorePlatformCUDA, param_enumv::Elemwise::Mode::_m, float>; \
using Op = kern_ops_quantized::QuantizedMultiTypeOp< \
arity, src_ctype, dst_ctype, KernImpl>; \
using dst_storage = typename VectTypeTrait<dst_ctype>::Storage; \
dst_storage* dst = reinterpret_cast<dst_storage*>(dst_tensor.raw_ptr()); \
Op op(src_params, dst, dst_param); \
ElemwiseOpParamN<1> param_dst; \
param_dst[0] = dst_tensor; \
param_dst.init_from_given_tensor(); \
run_elemwise<Op, src_ctype, dst_ctype, arity>( \
param, param_dst, stream, op); \
return; \
} while (0);

#define FOREACH(cb) \


+ 6
- 6
dnn/src/cuda/elemwise_multi_type/opr_impl.h

@@ -18,22 +18,22 @@ namespace cuda {

class ElemwiseMultiTypeImpl final : public ElemwiseMultiTypeImplHelper {
void on_fuse_mul_add3_int16x32x32x32(
const ElemwiseOpParamN<3>& param, dt_int32* dst) override;
const ElemwiseOpParamN<3>& param, const TensorND& dst) override;

void on_fuse_mul_add3_iXxf32xf32xi8(
const ElemwiseOpParamN<3>& param, dt_int8* dst) override;
const ElemwiseOpParamN<3>& param, const TensorND& dst) override;

void on_round_shr_saturate_iXxi8xi8(
const ElemwiseOpParamN<2>& param, dt_int8* dst) override;
const ElemwiseOpParamN<2>& param, const TensorND& dst) override;

void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) override;
const ElemwiseOpParamN<6>& param, const TensorND& dst) override;

void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8(
const ElemwiseOpParamN<6>& param, dt_int8* dst) override;
const ElemwiseOpParamN<6>& param, const TensorND& dst) override;

void on_round_shr_saturate_iXxi8xi16(
const ElemwiseOpParamN<2>& param, dt_int16* dst) override;
const ElemwiseOpParamN<2>& param, const TensorND& dst) override;

void on_quantized_mode(
const ElemwiseOpParamN<1>& param, const TensorND& dst,
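
On the CUDA backend (and, per the file list, in the shared helper under `dnn/src/common/elemwise_multi_type/`), the multi-type hooks now receive the destination as a full `const TensorND&` instead of a pre-extracted `dt_int8*`/`dt_int16*`/`dt_int32*`, and each implementation pulls the typed pointer out at the point of use. A hypothetical override, only to show the shape of the migration (the class and kernel names here are invented; the real code uses `dst.ptr<dt_int8>()` rather than a cast of `raw_ptr()`):

    #include <cstdint>

    struct ExampleMultiTypeImpl {
        // before: void on_round_shr_saturate_iXxi8xi8(const Param&, int8_t* dst);
        // after:  the whole dst tensor comes in, the typed pointer is taken here
        void on_round_shr_saturate_iXxi8xi8(const TensorND& dst) {
            auto* out = static_cast<std::int8_t*>(dst.raw_ptr());
            (void)out;  // launch the saturating right-shift kernel with `out` (elided)
        }
    };

Passing the tensor through keeps the layout and the ref-counted storage together until the last moment, which is what lets the dispatcher in the common helper stay backend-agnostic.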


+ 12
- 15
dnn/src/cuda/group_local/forward/opr_impl.cpp

@@ -32,11 +32,6 @@ std::unique_ptr<LocalForward> get_opr(Handle* handle, param::Convolution param)
return std::move(opr);
}

template <typename T>
void incr_ptr(T*& dst, ptrdiff_t delta) {
dst = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(dst) + delta);
}

TensorLayout prepare_src_dst(const TensorLayout& input, size_t g) {
TensorLayout ret = input;
megdnn_assert(ret[1] % g == 0);
@@ -84,18 +79,20 @@ void GroupLocalForwardImpl::exec(
SH, SW, stream);
} else {
auto&& opr = get_opr(handle, param());
TensorND src_g = {src.raw_ptr, prepare_src_dst(src.layout, G)};
TensorND dst_g = {dst.raw_ptr, prepare_src_dst(dst.layout, G)};
TensorND filter_g = {filter.raw_ptr, prepare_filter(filter.layout)};
TensorND src_g = {src.raw_ptr(), prepare_src_dst(src.layout, G)};
TensorND dst_g = {dst.raw_ptr(), prepare_src_dst(dst.layout, G)};
TensorND filter_g = {filter.raw_ptr(), prepare_filter(filter.layout)};
for (size_t g = 0; g < G; ++g) {
opr->exec(src_g, filter_g, dst_g, workspace);
incr_ptr(
src_g.raw_ptr, src_g.layout.stride[1] * src_g.layout.shape[1] *
src_g.layout.dtype.size());
incr_ptr(
dst_g.raw_ptr, dst_g.layout.stride[1] * dst_g.layout.shape[1] *
dst_g.layout.dtype.size());
incr_ptr(filter_g.raw_ptr, filter_g.layout.span().dist_byte());
incr_refp(
src_g.get_ref_ptr(), src_g.layout.stride[1] *
src_g.layout.shape[1] *
src_g.layout.dtype.size());
incr_refp(
dst_g.get_ref_ptr(), dst_g.layout.stride[1] *
dst_g.layout.shape[1] *
dst_g.layout.dtype.size());
incr_refp(filter_g.get_ref_ptr(), filter_g.layout.span().dist_byte());
}
}
}


+ 2
- 2
dnn/src/cuda/local_share/backward_data/batched_matmul.cpp

@@ -106,7 +106,7 @@ void LocalShareBackwardDataImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) c
B1.stride[4] = wo;
B1.stride[5] = 1;
B1.stride[6] = co * ho * wo;
TensorND ts_B1{args.diff_tensor->raw_ptr, B1};
TensorND ts_B1{args.diff_tensor->raw_ptr(), B1};
TensorLayout B2{
{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, dtype::Float32()};
B2.init_contiguous_stride();
@@ -122,7 +122,7 @@ void LocalShareBackwardDataImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) c
TensorLayout C{
{groups * sgh * sgw, icpg * fh * fw, ho / sgh * wo / sgw * n},
dtype::Float32()};
TensorND ts_A{args.filter_tensor->raw_ptr, A};
TensorND ts_A{args.filter_tensor->raw_ptr(), A};
TensorND ts_B{ws_pretranspose, B};
TensorND ts_C{ws_col2im, C};
Workspace ws_wrapper;


+ 2
- 2
dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp

@@ -113,7 +113,7 @@ void LocalShareBackwardFilterImpl::AlgoBatchedMatMul::exec(const ExecArgs& args)
B1.stride[4] = co * ho * wo;
B1.stride[5] = wo;
B1.stride[6] = 1;
TensorND ts_B1{args.diff_tensor->raw_ptr, B1};
TensorND ts_B1{args.diff_tensor->raw_ptr(), B1};
TensorLayout B2{
{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, dtype::Float32()};
B2.init_contiguous_stride();
@@ -133,7 +133,7 @@ void LocalShareBackwardFilterImpl::AlgoBatchedMatMul::exec(const ExecArgs& args)
TensorLayout C{{groups * sgh * sgw, icpg * fh * fw, ocpg}, dtype::Float32()};
TensorND ts_A{ws_im2col, A};
TensorND ts_B{ws_pretranspose, B};
TensorND ts_C{args.grad_tensor->raw_ptr, C};
TensorND ts_C{args.grad_tensor->raw_ptr(), C};
Workspace ws_wrapper;
ws_wrapper.raw_ptr = reinterpret_cast<dt_byte*>(ws_matmul);
ws_wrapper.size = ws.get_size(2);


+ 2
- 2
dnn/src/cuda/local_share/forward/batched_matmul.cpp

@@ -100,7 +100,7 @@ void LocalShareForwardImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) const
TensorLayout C{
{groups * sgh * sgw, ho / sgh * wo / sgw * n, ocpg}, dtype::Float32()};
TensorND ts_A{ws_im2col, A};
TensorND ts_B{args.filter_tensor->raw_ptr, B};
TensorND ts_B{args.filter_tensor->raw_ptr(), B};
TensorND ts_C{ws_posttranspose, C};
Workspace ws_wrapper;
ws_wrapper.raw_ptr = reinterpret_cast<dt_byte*>(ws_matmul);
@@ -119,7 +119,7 @@ void LocalShareForwardImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) const
C1.stride[6] = ocpg;
TensorLayout C2 = args.dst_layout;
TensorND ts_C1{ws_posttranspose, C1};
TensorND ts_C2{args.dst_tensor->raw_ptr, C2};
TensorND ts_C2{args.dst_tensor->raw_ptr(), C2};
auto&& relayout_opr = args.opr->handle()->create_operator<Relayout>();
relayout_opr->exec(ts_C1, ts_C2);
}


+ 3
- 3
dnn/src/cuda/lrn/opr_impl.cpp

@@ -29,7 +29,7 @@ void LRNForwardImpl::exec(
float alpha = 1.0f, beta = 0.0f;
cudnn_check(cudnnLRNCrossChannelForward(
handle, lrn_desc.desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, src_desc.desc,
src.raw_ptr, &beta, dst_desc.desc, dst.raw_ptr));
src.raw_ptr(), &beta, dst_desc.desc, dst.raw_ptr()));
}

void LRNBackwardImpl::setup_descs(
@@ -51,8 +51,8 @@ void LRNBackwardImpl::exec(
float alpha = 1.0f, beta = 0.0f;
cudnn_check(cudnnLRNCrossChannelBackward(
handle, lrn_desc.desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dst_desc.desc,
dst.raw_ptr, diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr,
&beta, grad_desc.desc, grad.raw_ptr));
dst.raw_ptr(), diff_desc.desc, diff.raw_ptr(), src_desc.desc, src.raw_ptr(),
&beta, grad_desc.desc, grad.raw_ptr()));
}

} // namespace cuda


+ 2
- 2
dnn/src/cuda/matrix_inverse/opr_impl.cpp

@@ -37,11 +37,11 @@ void MatrixInverseImpl::exec(
auto stream = handle->stream();
batched_matrix_mul::arange<uintptr_t>(
reinterpret_cast<uintptr_t*>(psrc_batch),
reinterpret_cast<uintptr_t>(src.raw_ptr), n * n * sizeof(float), batch,
reinterpret_cast<uintptr_t>(src.raw_ptr()), n * n * sizeof(float), batch,
stream);
batched_matrix_mul::arange<uintptr_t>(
reinterpret_cast<uintptr_t*>(pdst_batch),
reinterpret_cast<uintptr_t>(dst.raw_ptr), n * n * sizeof(float), batch,
reinterpret_cast<uintptr_t>(dst.raw_ptr()), n * n * sizeof(float), batch,
stream);
cublas_check(cublasSmatinvBatched(
handle->cublas_handle(), n, psrc_batch, n, pdst_batch, n, info, batch));


Some files were not shown because too many files changed in this diff
